Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-14 17:02:56 +00:00)

Merge remote-tracking branch 'origin/main' into vlad/hadron-jwt
@@ -146,7 +146,9 @@ jobs:
       with:
         file: build-tools/Dockerfile
         context: .
-        provenance: false
+        attests: |
+          type=provenance,mode=max
+          type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
         push: true
         pull: true
         build-args: |
.github/workflows/build_and_test.yml (vendored) — 12 changes
@@ -634,7 +634,9 @@ jobs:
         DEBIAN_VERSION=bookworm
       secrets: |
         SUBZERO_ACCESS_TOKEN=${{ secrets.CI_ACCESS_TOKEN }}
-      provenance: false
+      attests: |
+        type=provenance,mode=max
+        type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
       push: true
       pull: true
       file: Dockerfile
@@ -747,7 +749,9 @@ jobs:
        PG_VERSION=${{ matrix.version.pg }}
        BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }}
        DEBIAN_VERSION=${{ matrix.version.debian }}
-      provenance: false
+      attests: |
+        type=provenance,mode=max
+        type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
       push: true
       pull: true
       file: compute/compute-node.Dockerfile
@@ -766,7 +770,9 @@ jobs:
        PG_VERSION=${{ matrix.version.pg }}
        BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }}
        DEBIAN_VERSION=${{ matrix.version.debian }}
-      provenance: false
+      attests: |
+        type=provenance,mode=max
+        type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
       push: true
       pull: true
       file: compute/compute-node.Dockerfile
.github/workflows/pg-clients.yml (vendored) — 3 changes
@@ -72,9 +72,10 @@ jobs:
     options: --init --user root
     services:
       clickhouse:
-        image: clickhouse/clickhouse-server:24.8
+        image: clickhouse/clickhouse-server:25.6
         env:
           CLICKHOUSE_PASSWORD: ${{ needs.generate-ch-tmppw.outputs.tmp_val }}
+          PGSSLCERT: /tmp/postgresql.crt
         ports:
           - 9000:9000
           - 8123:8123
Cargo.lock (generated) — 1 change
@@ -5225,7 +5225,6 @@ dependencies = [
 "crc32c",
 "criterion",
 "env_logger",
 "log",
 "once_cell",
 "postgres",
 "postgres_ffi_types",
@@ -103,7 +103,7 @@ RUN --mount=type=secret,uid=1000,id=SUBZERO_ACCESS_TOKEN \
     && if [ -s /run/secrets/SUBZERO_ACCESS_TOKEN ]; then \
         export CARGO_FEATURES="rest_broker"; \
     fi \
-    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
+    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo auditable build \
     --features $CARGO_FEATURES \
     --bin pg_sni_router \
     --bin pageserver \
@@ -299,6 +299,7 @@ WORKDIR /home/nonroot
 ENV RUSTC_VERSION=1.88.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
+ARG CARGO_AUDITABLE_VERSION=0.7.0
 ARG RUSTFILT_VERSION=0.2.1
 ARG CARGO_HAKARI_VERSION=0.9.36
 ARG CARGO_DENY_VERSION=0.18.2
@@ -314,14 +315,16 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
     . "$HOME/.cargo/env" && \
     cargo --version && rustup --version && \
     rustup component add llvm-tools rustfmt clippy && \
-    cargo install rustfilt --locked --version "${RUSTFILT_VERSION}" && \
-    cargo install cargo-hakari --locked --version "${CARGO_HAKARI_VERSION}" && \
-    cargo install cargo-deny --locked --version "${CARGO_DENY_VERSION}" && \
-    cargo install cargo-hack --locked --version "${CARGO_HACK_VERSION}" && \
-    cargo install cargo-nextest --locked --version "${CARGO_NEXTEST_VERSION}" && \
-    cargo install cargo-chef --locked --version "${CARGO_CHEF_VERSION}" && \
-    cargo install diesel_cli --locked --version "${CARGO_DIESEL_CLI_VERSION}" \
-    --features postgres-bundled --no-default-features && \
+    cargo install cargo-auditable --locked --version "${CARGO_AUDITABLE_VERSION}" && \
+    cargo auditable install cargo-auditable --locked --version "${CARGO_AUDITABLE_VERSION}" --force && \
+    cargo auditable install rustfilt --version "${RUSTFILT_VERSION}" && \
+    cargo auditable install cargo-hakari --locked --version "${CARGO_HAKARI_VERSION}" && \
+    cargo auditable install cargo-deny --locked --version "${CARGO_DENY_VERSION}" && \
+    cargo auditable install cargo-hack --locked --version "${CARGO_HACK_VERSION}" && \
+    cargo auditable install cargo-nextest --locked --version "${CARGO_NEXTEST_VERSION}" && \
+    cargo auditable install cargo-chef --locked --version "${CARGO_CHEF_VERSION}" && \
+    cargo auditable install diesel_cli --locked --version "${CARGO_DIESEL_CLI_VERSION}" \
+    --features postgres-bundled --no-default-features && \
     rm -rf /home/nonroot/.cargo/registry && \
     rm -rf /home/nonroot/.cargo/git
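The Dockerfile changes above switch the builds to `cargo auditable`, which embeds a compressed JSON listing of the exact dependency graph into a dedicated section of each produced binary, so a deployed binary can be scanned for vulnerable dependencies without access to its Cargo.lock. A sketch of reading that data back out, assuming the `auditable-extract` and `miniz_oxide` crates and their documented entry points (the crate APIs are not taken from this diff and should be verified against their docs):

use std::fs;

// Assumed API: auditable_extract::raw_auditable_data locates the embedded
// (zlib-compressed) JSON payload in a compiled binary; miniz_oxide inflates it.
fn dump_dependency_list(path: &str) -> Result<String, Box<dyn std::error::Error>> {
    let binary = fs::read(path)?;
    let compressed = auditable_extract::raw_auditable_data(&binary)?;
    let json = miniz_oxide::inflate::decompress_to_vec_zlib(compressed)
        .map_err(|e| format!("inflate failed: {e:?}"))?;
    Ok(String::from_utf8(json)?)
}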
@@ -1 +1 @@
-SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer;
+SELECT num_requested AS checkpoints_req FROM pg_catalog.pg_stat_checkpointer;

@@ -1 +1 @@
-SELECT checkpoints_req FROM pg_stat_bgwriter;
+SELECT checkpoints_req FROM pg_catalog.pg_stat_bgwriter;

@@ -1 +1 @@
-SELECT checkpoints_timed FROM pg_stat_bgwriter;
+SELECT checkpoints_timed FROM pg_catalog.pg_stat_bgwriter;

@@ -1 +1 @@
-SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled;
+SELECT (neon.backpressure_throttling_time()::pg_catalog.float8 / 1000000) AS throttled;

@@ -1,4 +1,4 @@
 SELECT CASE
-  WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
-  ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8
+  WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_replay_lsn() - '0/0')::pg_catalog.FLOAT8
+  ELSE (pg_catalog.pg_current_wal_lsn() - '0/0')::pg_catalog.FLOAT8
 END AS lsn;

@@ -1,7 +1,7 @@
 SELECT
-  (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
+  (SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
   -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
   -- These temporary snapshot files are renamed to the actual snapshot files
   -- after they are completely built. We only WAL-log the completely built
   -- snapshot files
-  (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
+  (SELECT COUNT(*) FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;

@@ -1,7 +1,7 @@
 SELECT
-  (SELECT current_setting('neon.timeline_id')) AS timeline_id,
+  (SELECT pg_catalog.current_setting('neon.timeline_id')) AS timeline_id,
   -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
   -- These temporary snapshot files are renamed to the actual snapshot files
   -- after they are completely built. We only WAL-log the completely built
   -- snapshot files
-  (SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;
+  (SELECT COALESCE(pg_catalog.sum(size), 0) FROM pg_catalog.pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;

@@ -1,9 +1,9 @@
 SELECT
-  (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
+  (SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
   -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
   -- These temporary snapshot files are renamed to the actual snapshot files
   -- after they are completely built. We only WAL-log the completely built
   -- snapshot files
-  (SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
-   FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
+  (SELECT COALESCE(pg_catalog.sum((pg_catalog.pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
+   FROM (SELECT * FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
  ) AS logical_snapshots_bytes;
@@ -1 +1 @@
-SELECT current_setting('max_connections') as max_connections;
+SELECT pg_catalog.current_setting('max_connections') AS max_connections;

@@ -1,4 +1,4 @@
 SELECT datname database_name,
-  age(datfrozenxid) frozen_xid_age
-FROM pg_database
+  pg_catalog.age(datfrozenxid) frozen_xid_age
+FROM pg_catalog.pg_database
 ORDER BY frozen_xid_age DESC LIMIT 10;

@@ -1,4 +1,4 @@
 SELECT datname database_name,
-  mxid_age(datminmxid) min_mxid_age
-FROM pg_database
+  pg_catalog.mxid_age(datminmxid) min_mxid_age
+FROM pg_catalog.pg_database
 ORDER BY min_mxid_age DESC LIMIT 10;

@@ -1,4 +1,4 @@
 SELECT CASE
-  WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
+  WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_receive_lsn() - '0/0')::pg_catalog.FLOAT8
   ELSE 0
 END AS lsn;

@@ -1 +1 @@
-SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled;
+SELECT subenabled::pg_catalog.text AS enabled, pg_catalog.count(*) AS subscriptions_count FROM pg_catalog.pg_subscription GROUP BY subenabled;

@@ -1 +1 @@
-SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state;
+SELECT datname, state, pg_catalog.count(*) AS count FROM pg_catalog.pg_stat_activity WHERE state <> '' GROUP BY datname, state;

@@ -1,5 +1,5 @@
-SELECT sum(pg_database_size(datname)) AS total
-FROM pg_database
+SELECT pg_catalog.sum(pg_catalog.pg_database_size(datname)) AS total
+FROM pg_catalog.pg_database
 -- Ignore invalid databases, as we will likely have problems with
 -- getting their size from the Pageserver.
 WHERE datconnlimit != -2;

@@ -3,6 +3,6 @@
 -- minutes.

 SELECT
-  x::text as duration_seconds,
+  x::pg_catalog.text AS duration_seconds,
   neon.approximate_working_set_size_seconds(x) AS size
 FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x);

@@ -3,6 +3,6 @@

 SELECT
   x AS duration,
-  neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM (
+  neon.approximate_working_set_size_seconds(extract('epoch' FROM x::pg_catalog.interval)::pg_catalog.int4) AS size FROM (
   VALUES ('5m'), ('15m'), ('1h')
 ) AS t (x);

@@ -1 +1 @@
-SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;
+SELECT pg_catalog.pg_size_bytes(pg_catalog.current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;

@@ -1,3 +1,3 @@
-SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
-FROM pg_replication_slots
+SELECT slot_name, (restart_lsn - '0/0')::pg_catalog.FLOAT8 AS restart_lsn
+FROM pg_catalog.pg_replication_slots
 WHERE slot_type = 'logical';
@@ -1 +1 @@
-SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size';
+SELECT setting::pg_catalog.int4 AS max_cluster_size FROM pg_catalog.pg_settings WHERE name = 'neon.max_cluster_size';

@@ -1,13 +1,13 @@
 -- We export stats for 10 non-system databases. Without this limit it is too
 -- easy to abuse the system by creating lots of databases.

-SELECT pg_database_size(datname) AS db_size,
+SELECT pg_catalog.pg_database_size(datname) AS db_size,
        deadlocks,
        tup_inserted AS inserted,
        tup_updated AS updated,
        tup_deleted AS deleted,
        datname
-FROM pg_stat_database
+FROM pg_catalog.pg_stat_database
 WHERE datname IN (
     SELECT datname FROM pg_database
     -- Ignore invalid databases, as we will likely have problems with

@@ -3,4 +3,4 @@
 -- replay LSN may have advanced past the receive LSN we are using for the
 -- calculation.

-SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
+SELECT GREATEST(0, pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_last_wal_receive_lsn(), pg_catalog.pg_last_wal_replay_lsn())) AS replication_delay_bytes;

@@ -1,5 +1,5 @@
 SELECT
   CASE
-    WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
-    ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
+    WHEN pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn() THEN 0
+    ELSE GREATEST(0, EXTRACT (EPOCH FROM pg_catalog.now() - pg_catalog.pg_last_xact_replay_timestamp()))
   END AS replication_delay_seconds;

@@ -1,10 +1,10 @@
 SELECT
   slot_name,
-  pg_wal_lsn_diff(
+  pg_catalog.pg_wal_lsn_diff(
     CASE
-      WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn()
-      ELSE pg_current_wal_lsn()
+      WHEN pg_catalog.pg_is_in_recovery() THEN pg_catalog.pg_last_wal_replay_lsn()
+      ELSE pg_catalog.pg_current_wal_lsn()
     END,
-    restart_lsn)::FLOAT8 AS retained_wal
-FROM pg_replication_slots
+    restart_lsn)::pg_catalog.FLOAT8 AS retained_wal
+FROM pg_catalog.pg_replication_slots
 WHERE active = false;

@@ -4,4 +4,4 @@ SELECT
   WHEN wal_status = 'lost' THEN 1
   ELSE 0
 END AS wal_is_lost
-FROM pg_replication_slots;
+FROM pg_catalog.pg_replication_slots;
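The common thread in the monitoring queries above is that every system view, function, cast target, and (further down in this commit) even the `=` operator gets schema-qualified with `pg_catalog`. These queries run as a privileged role over user-controlled databases, and qualification plus the `search_path=''` connection options changed later in this diff keep user-created objects from capturing name resolution. A minimal sketch of the connection-side half, assuming `tokio-postgres` and placeholder host/user/dbname values (this mirrors, but is not copied from, the `EXTRA_OPTIONS` constant below):

use tokio_postgres::{Config, NoTls};

// Sketch: connect with an empty search_path, the way compute_ctl's
// maintenance client now does, so unqualified user-schema names never
// resolve in these sessions.
async fn maintenance_client() -> Result<tokio_postgres::Client, tokio_postgres::Error> {
    let mut cfg = Config::new();
    cfg.host("localhost")
        .user("cloud_admin")
        .dbname("postgres")
        // Mirrors the "-c search_path=''" option introduced in this diff.
        .options("-c search_path=''");
    let (client, connection) = cfg.connect(NoTls).await?;
    // The connection object drives the socket; poll it on its own task.
    tokio::spawn(connection);
    Ok(client)
}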
@@ -1,5 +1,11 @@
+commit 5eb393810cf7c7bafa4e394dad2e349e2a8cb2cb
+Author: Alexey Masterov <alexey.masterov@databricks.com>
+Date:   Mon Jul 28 18:11:02 2025 +0200
+
+    Patch for pg_repack
+
 diff --git a/regress/Makefile b/regress/Makefile
-index bf6edcb..89b4c7f 100644
+index bf6edcb..110e734 100644
 --- a/regress/Makefile
 +++ b/regress/Makefile
 @@ -17,7 +17,7 @@ INTVERSION := $(shell echo $$(($$(echo $(VERSION).0 | sed 's/\([[:digit:]]\{1,\}
@@ -7,18 +13,36 @@ index bf6edcb..89b4c7f 100644
 #

 -REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper tablespace get_order_by trigger
-+REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger
++REGRESS := init-extension noautovacuum repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger autovacuum

 USE_PGXS = 1 # use pgxs if not in contrib directory
 PGXS := $(shell $(PG_CONFIG) --pgxs)
 diff --git a/regress/expected/init-extension.out b/regress/expected/init-extension.out
 index 9f2e171..f6e4f8d 100644
 --- a/regress/expected/init-extension.out
 +++ b/regress/expected/init-extension.out
 @@ -1,3 +1,2 @@
  SET client_min_messages = warning;
  CREATE EXTENSION pg_repack;
 -RESET client_min_messages;
 diff --git a/regress/expected/autovacuum.out b/regress/expected/autovacuum.out
 new file mode 100644
 index 0000000..e7f2363
 --- /dev/null
 +++ b/regress/expected/autovacuum.out
 @@ -0,0 +1,7 @@
 +ALTER SYSTEM SET autovacuum='on';
 +SELECT pg_reload_conf();
 + pg_reload_conf
 +----------------
 + t
 +(1 row)
 +
 diff --git a/regress/expected/noautovacuum.out b/regress/expected/noautovacuum.out
 new file mode 100644
 index 0000000..fc7978e
 --- /dev/null
 +++ b/regress/expected/noautovacuum.out
 @@ -0,0 +1,7 @@
 +ALTER SYSTEM SET autovacuum='off';
 +SELECT pg_reload_conf();
 + pg_reload_conf
 +----------------
 + t
 +(1 row)
 +
 diff --git a/regress/expected/nosuper.out b/regress/expected/nosuper.out
 index 8d0a94e..63b68bf 100644
 --- a/regress/expected/nosuper.out
@@ -50,14 +74,22 @@ index 8d0a94e..63b68bf 100644
 INFO: repacking table "public.tbl_cluster"
 ERROR: query failed: ERROR: current transaction is aborted, commands ignored until end of transaction block
 DETAIL: query was: RESET lock_timeout
 diff --git a/regress/sql/init-extension.sql b/regress/sql/init-extension.sql
 index 9f2e171..f6e4f8d 100644
 --- a/regress/sql/init-extension.sql
 +++ b/regress/sql/init-extension.sql
 @@ -1,3 +1,2 @@
  SET client_min_messages = warning;
  CREATE EXTENSION pg_repack;
 -RESET client_min_messages;
 diff --git a/regress/sql/autovacuum.sql b/regress/sql/autovacuum.sql
 new file mode 100644
 index 0000000..a8eda63
 --- /dev/null
 +++ b/regress/sql/autovacuum.sql
 @@ -0,0 +1,2 @@
 +ALTER SYSTEM SET autovacuum='on';
 +SELECT pg_reload_conf();
 diff --git a/regress/sql/noautovacuum.sql b/regress/sql/noautovacuum.sql
 new file mode 100644
 index 0000000..13d4836
 --- /dev/null
 +++ b/regress/sql/noautovacuum.sql
 @@ -0,0 +1,2 @@
 +ALTER SYSTEM SET autovacuum='off';
 +SELECT pg_reload_conf();
 diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql
 index 072f0fa..dbe60f8 100644
 --- a/regress/sql/nosuper.sql
@@ -82,6 +82,15 @@ struct Cli {
     #[arg(long, default_value_t = 3081)]
     pub internal_http_port: u16,

+    /// Backwards-compatible --http-port for Hadron deployments. Functionally the
+    /// same as --external-http-port.
+    #[arg(
+        long,
+        conflicts_with = "external_http_port",
+        conflicts_with = "internal_http_port"
+    )]
+    pub http_port: Option<u16>,
+
     #[arg(short = 'D', long, value_name = "DATADIR")]
     pub pgdata: String,
@@ -181,6 +190,26 @@ impl Cli {
     }
 }

+// Hadron helpers to get compatible compute_ctl http ports from Cli. The old `--http-port`
+// arg is used and acts the same as `--external-http-port`. The internal http port is defined
+// to be http_port + 1. Hadron runs in the dblet environment which uses the host network, so
+// we need to be careful with the ports to choose.
+fn get_external_http_port(cli: &Cli) -> u16 {
+    if cli.lakebase_mode {
+        return cli.http_port.unwrap_or(cli.external_http_port);
+    }
+    cli.external_http_port
+}
+
+fn get_internal_http_port(cli: &Cli) -> u16 {
+    if cli.lakebase_mode {
+        return cli
+            .http_port
+            .map(|p| p + 1)
+            .unwrap_or(cli.internal_http_port);
+    }
+    cli.internal_http_port
+}
+
 fn main() -> Result<()> {
     let cli = Cli::parse();
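For clarity, a small illustration of what the two helpers above yield under each mode. This is a sketch with made-up port numbers, not a test from the repository:

// Hypothetical values: in lakebase_mode with --http-port 3080, the external
// port is 3080 and the internal port is derived as 3080 + 1 = 3081, keeping
// the pair adjacent on the dblet's host network. Without --http-port, the
// dedicated --external-http-port/--internal-http-port flags apply unchanged.
fn port_derivation_example() {
    let http_port: Option<u16> = Some(3080);
    let (external, internal) = match http_port {
        Some(p) => (p, p + 1),
        None => (3080, 3081), // stand-ins for the dedicated CLI flag values
    };
    assert_eq!((external, internal), (3080, 3081));
}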
@@ -205,13 +234,18 @@ fn main() -> Result<()> {
     // enable core dumping for all child processes
     setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;

-    installed_extensions::initialize_metrics();
-    hadron_metrics::initialize_metrics();
+    if cli.lakebase_mode {
+        installed_extensions::initialize_metrics();
+        hadron_metrics::initialize_metrics();
+    }

     let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;

     let config = get_config(&cli)?;

+    let external_http_port = get_external_http_port(&cli);
+    let internal_http_port = get_internal_http_port(&cli);
+
     let compute_node = ComputeNode::new(
         ComputeNodeParams {
             compute_id: cli.compute_id,
@@ -220,8 +254,8 @@ fn main() -> Result<()> {
             pgdata: cli.pgdata.clone(),
             pgbin: cli.pgbin.clone(),
             pgversion: get_pg_version_string(&cli.pgbin),
-            external_http_port: cli.external_http_port,
-            internal_http_port: cli.internal_http_port,
+            external_http_port,
+            internal_http_port,
             remote_ext_base_url: cli.remote_ext_base_url.clone(),
             resize_swap_on_bind: cli.resize_swap_on_bind,
             set_disk_quota_for_fs: cli.set_disk_quota_for_fs,
@@ -245,7 +279,7 @@ fn main() -> Result<()> {
         config,
     )?;

-    let exit_code = compute_node.run()?;
+    let exit_code = compute_node.run().context("running compute node")?;

     scenario.teardown();
@@ -24,9 +24,9 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
     });

     let query = "
-        INSERT INTO health_check VALUES (1, now())
+        INSERT INTO public.health_check VALUES (1, pg_catalog.now())
         ON CONFLICT (id) DO UPDATE
-         SET updated_at = now();";
+         SET updated_at = pg_catalog.now();";

     match client.simple_query(query).await {
         Result::Ok(result) => {
@@ -6,7 +6,8 @@ use compute_api::responses::{
     LfcPrewarmState, PromoteState, TlsConfig,
 };
 use compute_api::spec::{
-    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
+    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, GenericOption,
+    PageserverConnectionInfo, PageserverProtocol, PgIdent, Role,
 };
 use futures::StreamExt;
 use futures::future::join_all;
@@ -31,13 +32,17 @@ use std::sync::{Arc, Condvar, Mutex, RwLock};
 use std::time::{Duration, Instant};
 use std::{env, fs};
 use tokio::{spawn, sync::watch, task::JoinHandle, time};
+use tokio_util::sync::CancellationToken;
 use tracing::{Instrument, debug, error, info, instrument, warn};
 use url::Url;
+use utils::backoff::{
+    DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff_duration,
+};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 use utils::measured_stream::MeasuredReader;
 use utils::pid_file;
-use utils::shard::{ShardCount, ShardIndex, ShardNumber};
+use utils::shard::{ShardIndex, ShardNumber, ShardStripeSize};

 use crate::configurator::launch_configurator;
 use crate::disk_quota::set_disk_quota;
@@ -191,6 +196,7 @@ pub struct ComputeState {
     pub startup_span: Option<tracing::span::Span>,

     pub lfc_prewarm_state: LfcPrewarmState,
+    pub lfc_prewarm_token: CancellationToken,
     pub lfc_offload_state: LfcOffloadState,
@@ -216,6 +222,7 @@ impl ComputeState {
             lfc_offload_state: LfcOffloadState::default(),
             terminate_flush_lsn: None,
             promote_state: None,
+            lfc_prewarm_token: CancellationToken::new(),
         }
     }
@@ -248,7 +255,7 @@ pub struct ParsedSpec {
     pub spec: ComputeSpec,
     pub tenant_id: TenantId,
     pub timeline_id: TimelineId,
-    pub pageserver_connstr: String,
+    pub pageserver_conninfo: PageserverConnectionInfo,
     pub safekeeper_connstrings: Vec<String>,
     pub storage_auth_token: Option<String>,
     /// k8s dns name and port
@@ -296,25 +303,47 @@ impl ParsedSpec {
 }

 impl TryFrom<ComputeSpec> for ParsedSpec {
-    type Error = String;
-    fn try_from(spec: ComputeSpec) -> Result<Self, String> {
+    type Error = anyhow::Error;
+    fn try_from(spec: ComputeSpec) -> Result<Self, anyhow::Error> {
         // Extract the options from the spec file that are needed to connect to
         // the storage system.
         //
         // For backwards-compatibility, the top-level fields in the spec file
         // may be empty. In that case, we need to dig them from the GUCs in the
         // cluster.settings field.
-        let pageserver_connstr = spec
-            .pageserver_connstring
-            .clone()
-            .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
-            .ok_or("pageserver connstr should be provided")?;
+        // In compute specs generated by old control plane versions, the spec file might
+        // be missing the `pageserver_connection_info` field. In that case, we need to dig
+        // the pageserver connection info from the `pageserver_connstr` field instead, or
+        // if that's missing too, from the GUC in the cluster.settings field.
+        let mut pageserver_conninfo = spec.pageserver_connection_info.clone();
+        if pageserver_conninfo.is_none() {
+            if let Some(pageserver_connstr_field) = &spec.pageserver_connstring {
+                pageserver_conninfo = Some(PageserverConnectionInfo::from_connstr(
+                    pageserver_connstr_field,
+                    spec.shard_stripe_size,
+                )?);
+            }
+        }
+        if pageserver_conninfo.is_none() {
+            if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") {
+                let stripe_size = if let Some(guc) = spec.cluster.settings.find("neon.stripe_size")
+                {
+                    Some(ShardStripeSize(u32::from_str(&guc)?))
+                } else {
+                    None
+                };
+                pageserver_conninfo =
+                    Some(PageserverConnectionInfo::from_connstr(&guc, stripe_size)?);
+            }
+        }
+        let pageserver_conninfo = pageserver_conninfo.ok_or(anyhow::anyhow!(
+            "pageserver connection information should be provided"
+        ))?;

         // Similarly for safekeeper connection strings
         let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
             if matches!(spec.mode, ComputeMode::Primary) {
                 spec.cluster
                     .settings
                     .find("neon.safekeepers")
-                    .ok_or("safekeeper connstrings should be provided")?
+                    .ok_or(anyhow::anyhow!("safekeeper connstrings should be provided"))?
                     .split(',')
                     .map(|str| str.to_string())
                     .collect()
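The lookup order implemented above is worth spelling out, since three generations of control-plane specs remain supported. A condensed restatement as a sketch (field and helper names are taken from the diff, but the signature is simplified and not the repository's own):

fn resolve_pageserver_conninfo(spec: &ComputeSpec) -> anyhow::Result<PageserverConnectionInfo> {
    if let Some(info) = spec.pageserver_connection_info.clone() {
        return Ok(info); // 1. new structured field, used as-is
    }
    if let Some(s) = &spec.pageserver_connstring {
        // 2. legacy top-level connstring, upgraded to the structured form
        return PageserverConnectionInfo::from_connstr(s, spec.shard_stripe_size);
    }
    if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") {
        // 3. oldest control planes: dig the GUC out of cluster.settings,
        //    recovering the stripe size from its own GUC when present
        let stripe = spec
            .cluster
            .settings
            .find("neon.stripe_size")
            .map(|v| anyhow::Ok(ShardStripeSize(v.parse::<u32>()?)))
            .transpose()?;
        return PageserverConnectionInfo::from_connstr(&guc, stripe);
    }
    anyhow::bail!("pageserver connection information should be provided")
}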
@@ -329,22 +358,22 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
         let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
             tenant_id
         } else {
-            spec.cluster
+            let guc = spec
+                .cluster
                 .settings
                 .find("neon.tenant_id")
-                .ok_or("tenant id should be provided")
-                .map(|s| TenantId::from_str(&s))?
-                .or(Err("invalid tenant id"))?
+                .ok_or(anyhow::anyhow!("tenant id should be provided"))?;
+            TenantId::from_str(&guc).context("invalid tenant id")?
         };
         let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id {
             timeline_id
         } else {
-            spec.cluster
+            let guc = spec
+                .cluster
                 .settings
                 .find("neon.timeline_id")
-                .ok_or("timeline id should be provided")
-                .map(|s| TimelineId::from_str(&s))?
-                .or(Err("invalid timeline id"))?
+                .ok_or(anyhow::anyhow!("timeline id should be provided"))?;
+            TimelineId::from_str(&guc).context(anyhow::anyhow!("invalid timeline id"))?
         };

         let endpoint_storage_addr: Option<String> = spec
@@ -358,7 +387,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {

         let res = ParsedSpec {
             spec,
-            pageserver_connstr,
+            pageserver_conninfo,
             safekeeper_connstrings,
             storage_auth_token,
             tenant_id,
@@ -368,7 +397,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
         };

         // Now check validity of the parsed specification
-        res.validate()?;
+        res.validate().map_err(anyhow::Error::msg)?;
         Ok(res)
     }
 }
@@ -413,6 +442,66 @@ struct StartVmMonitorResult {
     vm_monitor: Option<JoinHandle<Result<()>>>,
 }

+// BEGIN_HADRON
+/// This function creates roles that are used by Databricks.
+/// These roles do not need to be bootstrapped at PG Compute provisioning time.
+/// The auth method for these roles is configured in databricks_pg_hba.conf in the universe repository.
+pub(crate) fn create_databricks_roles() -> Vec<String> {
+    let roles = vec![
+        // Role for prometheus_stats_exporter
+        Role {
+            name: "databricks_monitor".to_string(),
+            // This uses a "local" connection, whose auth method is "trust", so no password is needed.
+            encrypted_password: None,
+            options: Some(vec![GenericOption {
+                name: "IN ROLE pg_monitor".to_string(),
+                value: None,
+                vartype: "string".to_string(),
+            }]),
+        },
+        // Role for the brickstore control plane
+        Role {
+            name: "databricks_control_plane".to_string(),
+            // Certificate user does not need a password.
+            encrypted_password: None,
+            options: Some(vec![GenericOption {
+                name: "SUPERUSER".to_string(),
+                value: None,
+                vartype: "string".to_string(),
+            }]),
+        },
+        // Role for the brickstore HTTP gateway.
+        Role {
+            name: "databricks_gateway".to_string(),
+            // Certificate user does not need a password.
+            encrypted_password: None,
+            options: None,
+        },
+    ];
+
+    roles
+        .into_iter()
+        .map(|role| {
+            let query = format!(
+                r#"
+                DO $$
+                BEGIN
+                    IF NOT EXISTS (
+                        SELECT FROM pg_catalog.pg_roles WHERE rolname = '{}')
+                    THEN
+                        CREATE ROLE {} {};
+                    END IF;
+                END
+                $$;"#,
+                role.name,
+                role.name.pg_quote(),
+                role.to_pg_options(),
+            );
+            query
+        })
+        .collect()
+}
+
 /// Databricks-specific environment variables to be passed to the `postgres` sub-process.
 pub struct DatabricksEnvVars {
     /// The Databricks "endpoint ID" of the compute instance. Used by `postgres` to check
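For illustration, the first role above would render to roughly the following statement (an assumed rendering: the quoting comes from pg_quote() and the IN ROLE clause from to_pg_options(), whose exact output is not shown in this diff):

// Assumed rendered SQL for the "databricks_monitor" role:
let rendered = r#"
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT FROM pg_catalog.pg_roles WHERE rolname = 'databricks_monitor')
    THEN
        CREATE ROLE "databricks_monitor" IN ROLE pg_monitor;
    END IF;
END
$$;"#;

The DO block makes the statement idempotent, so it can run on every compute start even though CREATE ROLE itself has no IF NOT EXISTS form.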
@@ -421,14 +510,27 @@ pub struct DatabricksEnvVars {
     /// Hostname of the Databricks workspace URL this compute instance belongs to.
     /// Used by postgres to verify Databricks PAT tokens.
     pub workspace_host: String,
+
+    pub lakebase_mode: bool,
 }

 impl DatabricksEnvVars {
-    pub fn new(compute_spec: &ComputeSpec, compute_id: Option<&String>) -> Self {
-        // compute_id is a string format of "{endpoint_id}/{compute_idx}"
-        // endpoint_id is a uuid. We only need to pass down endpoint_id to postgres.
-        // Panics if compute_id is not set or not in the expected format.
-        let endpoint_id = compute_id.unwrap().split('/').next().unwrap().to_string();
+    pub fn new(
+        compute_spec: &ComputeSpec,
+        compute_id: Option<&String>,
+        instance_id: Option<String>,
+        lakebase_mode: bool,
+    ) -> Self {
+        let endpoint_id = if let Some(instance_id) = instance_id {
+            // Use instance_id as endpoint_id if it is set. This code path is for the PuPr model.
+            instance_id
+        } else {
+            // Use compute_id as endpoint_id if instance_id is not set. This code path is for the PrPr model.
+            // compute_id is a string of the format "{endpoint_id}/{compute_idx}";
+            // endpoint_id is a uuid. We only need to pass down endpoint_id to postgres.
+            // Panics if compute_id is not set or not in the expected format.
+            compute_id.unwrap().split('/').next().unwrap().to_string()
+        };
         let workspace_host = compute_spec
             .databricks_settings
             .as_ref()
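A quick illustration of the PrPr derivation above, with an invented ID:

// compute_id has the shape "{endpoint_id}/{compute_idx}"; the prefix wins.
let compute_id = "4f5a1c2e-9d3b-4a67-8c21-0e7f5b9d1a33/0";
let endpoint_id = compute_id.split('/').next().unwrap();
assert_eq!(endpoint_id, "4f5a1c2e-9d3b-4a67-8c21-0e7f5b9d1a33");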
@@ -437,6 +539,7 @@ impl DatabricksEnvVars {
         Self {
             endpoint_id,
             workspace_host,
+            lakebase_mode,
         }
     }
@@ -446,6 +549,10 @@ impl DatabricksEnvVars {

     /// Convert DatabricksEnvVars to a list of string pairs that can be passed as env vars. Consumes `self`.
     pub fn to_env_var_list(self) -> Vec<(String, String)> {
+        if !self.lakebase_mode {
+            // In the neon env, we don't need to pass down the env vars to postgres.
+            return vec![];
+        }
         vec![
             (
                 Self::DATABRICKS_ENDPOINT_ID_ENVVAR.to_string(),
@@ -482,7 +589,7 @@ impl ComputeNode {
         // that can affect `compute_ctl` and prevent it from properly configuring the database schema.
         // Unset them via connection string options before connecting to the database.
         // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`.
-        const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0 -c pgaudit.log=none";
+        const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0 -c pgaudit.log=none";
         let options = match conn_conf.get_options() {
             // Allow the control plane to override any options set by the
             // compute
@@ -495,7 +602,11 @@ impl ComputeNode {
         let mut new_state = ComputeState::new();
         if let Some(spec) = config.spec {
             let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
-            new_state.pspec = Some(pspec);
+            if params.lakebase_mode {
+                ComputeNode::set_spec(&params, &mut new_state, pspec);
+            } else {
+                new_state.pspec = Some(pspec);
+            }
         }

         Ok(ComputeNode {
@@ -1093,7 +1204,14 @@ impl ComputeNode {
         // If it is something different then create_dir() will error out anyway.
         let pgdata = &self.params.pgdata;
         let _ok = fs::remove_dir_all(pgdata);
-        fs::create_dir(pgdata)?;
+        if self.params.lakebase_mode {
+            // Ignore creation errors if the directory already exists (e.g. mounting it ahead of time).
+            // If it is something different then PG startup will error out anyway.
+            let _ok = fs::create_dir(pgdata);
+        } else {
+            fs::create_dir(pgdata)?;
+        }

         fs::set_permissions(pgdata, fs::Permissions::from_mode(0o700))?;

         Ok(())
@@ -1105,12 +1223,10 @@ impl ComputeNode {
     fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
         let spec = compute_state.pspec.as_ref().expect("spec must be set");

-        let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
         let started = Instant::now();

-        let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? {
-            PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
+        let (connected, size) = match spec.pageserver_conninfo.prefer_protocol {
             PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
+            PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
         };

         self.fix_zenith_signal_neon_signal()?;
@@ -1148,23 +1264,20 @@ impl ComputeNode {
     /// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
     /// the connection was established, and the (compressed) size of the basebackup.
     fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
-        let shard0_connstr = spec
-            .pageserver_connstr
-            .split(',')
-            .next()
-            .unwrap()
-            .to_string();
-        let shard_index = match spec.pageserver_connstr.split(',').count() as u8 {
-            0 | 1 => ShardIndex::unsharded(),
-            count => ShardIndex::new(ShardNumber(0), ShardCount(count)),
+        let shard0_index = ShardIndex {
+            shard_number: ShardNumber(0),
+            shard_count: spec.pageserver_conninfo.shard_count,
         };

+        let shard0_url = spec
+            .pageserver_conninfo
+            .shard_url(ShardNumber(0), PageserverProtocol::Grpc)?
+            .to_owned();
         let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
             let mut client = page_api::Client::connect(
-                shard0_connstr,
+                shard0_url,
                 spec.tenant_id,
                 spec.timeline_id,
-                shard_index,
+                shard0_index,
                 spec.storage_auth_token.clone(),
                 None, // NB: base backups use payload compression
             )
@@ -1196,7 +1309,9 @@ impl ComputeNode {
     /// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp
     /// when the connection was established, and the (compressed) size of the basebackup.
     fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
-        let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
+        let shard0_connstr = spec
+            .pageserver_conninfo
+            .shard_url(ShardNumber(0), PageserverProtocol::Libpq)?;
         let mut config = postgres::Config::from_str(shard0_connstr)?;

         // Use the storage auth token from the config file, if given.

@@ -1283,10 +1398,7 @@ impl ComputeNode {
                     return result;
                 }
                 Err(ref e) if attempts < max_attempts => {
-                    warn!(
-                        "Failed to get basebackup: {} (attempt {}/{})",
-                        e, attempts, max_attempts
-                    );
+                    warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
                     std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
                     retry_period_ms *= 1.5;
                 }
@@ -1448,6 +1560,41 @@ impl ComputeNode {
         Ok(lsn)
     }

+    fn sync_safekeepers_with_retries(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
+        let max_retries = 5;
+        let mut attempts = 0;
+        loop {
+            let result = self.sync_safekeepers(storage_auth_token.clone());
+            match &result {
+                Ok(_) => {
+                    if attempts > 0 {
+                        tracing::info!("sync_safekeepers succeeded after {attempts} retries");
+                    }
+                    return result;
+                }
+                Err(e) if attempts < max_retries => {
+                    tracing::info!(
+                        "sync_safekeepers failed, will retry (attempt {attempts}): {e:#}"
+                    );
+                }
+                Err(err) => {
+                    tracing::warn!(
+                        "sync_safekeepers still failed after {attempts} retries, giving up: {err:?}"
+                    );
+                    return result;
+                }
+            }
+            // sleep and retry
+            let backoff = exponential_backoff_duration(
+                attempts,
+                DEFAULT_BASE_BACKOFF_SECONDS,
+                DEFAULT_MAX_BACKOFF_SECONDS,
+            );
+            std::thread::sleep(backoff);
+            attempts += 1;
+        }
+    }
+
     /// Do all the preparations like PGDATA directory creation, configuration,
     /// safekeepers sync, basebackup, etc.
     #[instrument(skip_all)]
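The retry loop leans on `utils::backoff::exponential_backoff_duration` with the crate's default base/max constants. As a rough sketch of the shape of that schedule — the formula below is the conventional doubling-with-cap form; the exact constants and curve live in neon's `utils` crate and may differ:

use std::time::Duration;

// Conventional exponential backoff with a cap; illustrative only.
fn backoff_duration(attempt: u32, base_secs: f64, max_secs: f64) -> Duration {
    let secs = (base_secs * 2f64.powi(attempt as i32)).min(max_secs);
    Duration::from_secs_f64(secs)
}

// e.g. base 0.1s, cap 3s gives 0.1, 0.2, 0.4, 0.8, 1.6 for attempts 0..5,
// so the five sync_safekeepers retries above spread out over a few seconds
// instead of hammering the safekeepers in a tight loop.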
@@ -1483,7 +1630,7 @@ impl ComputeNode {
             lsn
         } else {
             info!("starting safekeepers syncing");
-            self.sync_safekeepers(pspec.storage_auth_token.clone())
+            self.sync_safekeepers_with_retries(pspec.storage_auth_token.clone())
                 .with_context(|| "failed to sync safekeepers")?
         };
         info!("safekeepers synced at LSN {}", lsn);
@@ -1499,16 +1646,8 @@ impl ComputeNode {
             }
         };

-        info!(
-            "getting basebackup@{} from pageserver {}",
-            lsn, &pspec.pageserver_connstr
-        );
-        self.get_basebackup(compute_state, lsn).with_context(|| {
-            format!(
-                "failed to get basebackup@{} from pageserver {}",
-                lsn, &pspec.pageserver_connstr
-            )
-        })?;
+        self.get_basebackup(compute_state, lsn)
+            .with_context(|| format!("failed to get basebackup@{lsn}"))?;

         if let Some(settings) = databricks_settings {
             copy_tls_certificates(
@@ -1572,7 +1711,7 @@ impl ComputeNode {
         // symlink doesn't affect anything.
         //
         // See https://github.com/neondatabase/autoscaling/issues/800
-        std::fs::remove_dir(pgdata_path.join("pg_dynshmem"))?;
+        std::fs::remove_dir_all(pgdata_path.join("pg_dynshmem"))?;
         symlink("/dev/shm/", pgdata_path.join("pg_dynshmem"))?;

         match spec.mode {
@@ -1587,6 +1726,12 @@ impl ComputeNode {

     /// Start and stop a postgres process to warm up the VM for startup.
     pub fn prewarm_postgres_vm_memory(&self) -> Result<()> {
+        if self.params.lakebase_mode {
+            // We are running in Hadron mode. Disabling this prewarming step for now as it could run
+            // into dblet port conflicts and also doesn't add much value with our current infra.
+            info!("Skipping postgres prewarming in Hadron mode");
+            return Ok(());
+        }
         info!("prewarming VM memory");

         // Create pgdata
@@ -1648,7 +1793,12 @@ impl ComputeNode {
         let databricks_env_vars = {
             let state = self.state.lock().unwrap();
             let spec = &state.pspec.as_ref().unwrap().spec;
-            DatabricksEnvVars::new(spec, Some(&self.params.compute_id))
+            DatabricksEnvVars::new(
+                spec,
+                Some(&self.params.compute_id),
+                self.params.instance_id.clone(),
+                self.params.lakebase_mode,
+            )
         };

         info!(
@@ -1775,7 +1925,7 @@ impl ComputeNode {

         // It doesn't matter what were the options before, here we just want
         // to connect and create a new superuser role.
-        const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0";
+        const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0";
         zenith_admin_conf.options(ZENITH_OPTIONS);

         let mut client =
@@ -1820,7 +1970,15 @@ impl ComputeNode {
     /// Do initial configuration of the already started Postgres.
     #[instrument(skip_all)]
     pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
-        let conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config"));
+        let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config"));
+
+        if self.params.lakebase_mode {
+            // Set a 2-minute statement_timeout for the session applying config. The individual SQL statements
+            // used in apply_spec_sql() should not take long (they are just creating users and installing
+            // extensions). If any of them are stuck for an extended period of time it usually indicates a
+            // pageserver connectivity problem and we should bail out.
+            conf.options("-c statement_timeout=2min");
+        }

         let conf = Arc::new(conf);
         let spec = Arc::new(
@@ -2138,7 +2296,17 @@ impl ComputeNode {
     pub fn check_for_core_dumps(&self) -> Result<()> {
         let core_dump_dir = match std::env::consts::OS {
             "macos" => Path::new("/cores/"),
-            _ => Path::new(&self.params.pgdata),
+            // BEGIN HADRON
+            // NB: Read core dump files from a fixed location outside of
+            // the data directory since `compute_ctl` wipes the data directory
+            // across container restarts.
+            _ => {
+                if self.params.lakebase_mode {
+                    Path::new("/databricks/logs/brickstore")
+                } else {
+                    Path::new(&self.params.pgdata)
+                }
+            } // END HADRON
         };

         // Collect core dump paths if any
@@ -2212,13 +2380,13 @@ impl ComputeNode {
         let result = client
             .simple_query(
                 "SELECT
-    row_to_json(pg_stat_statements)
+    pg_catalog.row_to_json(pss)
 FROM
-    pg_stat_statements
+    public.pg_stat_statements pss
 WHERE
-    userid != 'cloud_admin'::regrole::oid
+    pss.userid != 'cloud_admin'::pg_catalog.regrole::pg_catalog.oid
 ORDER BY
-    (mean_exec_time + mean_plan_time) DESC
+    (pss.mean_exec_time + pss.mean_plan_time) DESC
 LIMIT 100",
             )
             .await;
@@ -2346,11 +2514,11 @@ LIMIT 100",

         // check the role grants first - to gracefully handle read-replicas.
         let select = "SELECT privilege_type
-                      FROM pg_namespace
-                          JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true
-                          JOIN pg_user users ON acl.grantee = users.usesysid
-                      WHERE users.usename = $1
-                          AND nspname = $2";
+                      FROM pg_catalog.pg_namespace
+                          JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) AS acl ON true
+                          JOIN pg_catalog.pg_user users ON acl.grantee = users.usesysid
+                      WHERE users.usename OPERATOR(pg_catalog.=) $1::pg_catalog.name
+                          AND nspname OPERATOR(pg_catalog.=) $2::pg_catalog.name";
         let rows = db_client
             .query(select, &[role_name, schema_name])
             .await
@@ -2419,8 +2587,9 @@ LIMIT 100",
                 .await
                 .with_context(|| format!("Failed to execute query: {query}"))?;
         } else {
-            let query =
-                format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}");
+            let query = format!(
+                "CREATE EXTENSION IF NOT EXISTS {ext_name} WITH SCHEMA public VERSION {quoted_version}"
+            );
             db_client
                 .simple_query(&query)
                 .await
@@ -2451,7 +2620,7 @@ LIMIT 100",
         if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
             libs_vec = libs
                 .split(&[',', '\'', ' '])
-                .filter(|s| *s != "neon" && !s.is_empty())
+                .filter(|s| *s != "neon" && *s != "databricks_auth" && !s.is_empty())
                 .map(str::to_string)
                 .collect();
         }
@@ -2470,7 +2639,7 @@ LIMIT 100",
             if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
                 preload_libs_vec = libs
                     .split(&[',', '\'', ' '])
-                    .filter(|s| *s != "neon" && !s.is_empty())
+                    .filter(|s| *s != "neon" && *s != "databricks_auth" && !s.is_empty())
                     .map(str::to_string)
                     .collect();
             }
@@ -2523,22 +2692,22 @@ LIMIT 100",
     /// The operation will time out after a specified duration.
     pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
         let state = self.state.lock().unwrap();
-        let old_pageserver_connstr = state
+        let old_pageserver_conninfo = state
             .pspec
             .as_ref()
             .expect("spec must be set")
-            .pageserver_connstr
+            .pageserver_conninfo
             .clone();
         let mut unchanged = true;
         let _ = self
             .state_changed
             .wait_timeout_while(state, duration, |s| {
-                let pageserver_connstr = &s
+                let pageserver_conninfo = &s
                     .pspec
                     .as_ref()
                     .expect("spec must be set")
-                    .pageserver_connstr;
-                unchanged = pageserver_connstr == &old_pageserver_connstr;
+                    .pageserver_conninfo;
+                unchanged = pageserver_conninfo == &old_pageserver_conninfo;
                 unchanged
             })
             .unwrap();
@@ -2796,7 +2965,10 @@ mod tests {

         match ParsedSpec::try_from(spec.clone()) {
             Ok(_p) => panic!("Failed to detect duplicate entry"),
-            Err(e) => assert!(e.starts_with("duplicate entry in safekeeper_connstrings:")),
+            Err(e) => assert!(
+                e.to_string()
+                    .starts_with("duplicate entry in safekeeper_connstrings:")
+            ),
         };
     }
 }
@@ -7,7 +7,8 @@ use http::StatusCode;
 use reqwest::Client;
 use std::mem::replace;
 use std::sync::Arc;
-use tokio::{io::AsyncReadExt, spawn};
+use tokio::{io::AsyncReadExt, select, spawn};
+use tokio_util::sync::CancellationToken;
 use tracing::{error, info};

 #[derive(serde::Serialize, Default)]
@@ -92,34 +93,35 @@ impl ComputeNode {
     /// If there is a prewarm request ongoing, return `false`, `true` otherwise.
     /// Has a failpoint "compute-prewarm"
     pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
+        let token: CancellationToken;
         {
-            let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
-            if let LfcPrewarmState::Prewarming = replace(state, LfcPrewarmState::Prewarming) {
+            let state = &mut self.state.lock().unwrap();
+            token = state.lfc_prewarm_token.clone();
+            if let LfcPrewarmState::Prewarming =
+                replace(&mut state.lfc_prewarm_state, LfcPrewarmState::Prewarming)
+            {
                 return false;
             }
         }
         crate::metrics::LFC_PREWARMS.inc();

-        let cloned = self.clone();
+        let this = self.clone();
         spawn(async move {
-            let state = match cloned.prewarm_impl(from_endpoint).await {
-                Ok(true) => LfcPrewarmState::Completed,
-                Ok(false) => {
-                    info!(
-                        "skipping LFC prewarm because LFC state is not found in endpoint storage"
-                    );
-                    LfcPrewarmState::Skipped
-                }
+            let prewarm_state = match this.prewarm_impl(from_endpoint, token).await {
+                Ok(state) => state,
                 Err(err) => {
                     crate::metrics::LFC_PREWARM_ERRORS.inc();
                     error!(%err, "could not prewarm LFC");
-                    LfcPrewarmState::Failed {
-                        error: format!("{err:#}"),
-                    }
+                    let error = format!("{err:#}");
+                    LfcPrewarmState::Failed { error }
                 }
             };

-            cloned.state.lock().unwrap().lfc_prewarm_state = state;
+            let state = &mut this.state.lock().unwrap();
+            if let LfcPrewarmState::Cancelled = prewarm_state {
+                state.lfc_prewarm_token = CancellationToken::new();
+            }
+            state.lfc_prewarm_state = prewarm_state;
         });
         true
     }
@@ -132,47 +134,70 @@ impl ComputeNode {

     /// Request LFC state from endpoint storage and load corresponding pages into Postgres.
     /// Returns a result with `false` if the LFC state is not found in endpoint storage.
-    async fn prewarm_impl(&self, from_endpoint: Option<String>) -> Result<bool> {
-        let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?;
+    async fn prewarm_impl(
+        &self,
+        from_endpoint: Option<String>,
+        token: CancellationToken,
+    ) -> Result<LfcPrewarmState> {
+        let EndpointStoragePair {
+            url,
+            token: storage_token,
+        } = self.endpoint_storage_pair(from_endpoint)?;

         #[cfg(feature = "testing")]
-        fail::fail_point!("compute-prewarm", |_| {
-            bail!("prewarm configured to fail because of a failpoint")
-        });
+        fail::fail_point!("compute-prewarm", |_| bail!("compute-prewarm failpoint"));

         info!(%url, "requesting LFC state from endpoint storage");
-        let request = Client::new().get(&url).bearer_auth(token);
-        let res = request.send().await.context("querying endpoint storage")?;
-        match res.status() {
+        let request = Client::new().get(&url).bearer_auth(storage_token);
+        let response = select! {
+            _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
+            response = request.send() => response
+        }
+        .context("querying endpoint storage")?;
+
+        match response.status() {
             StatusCode::OK => (),
-            StatusCode::NOT_FOUND => {
-                return Ok(false);
-            }
+            StatusCode::NOT_FOUND => return Ok(LfcPrewarmState::Skipped),
             status => bail!("{status} querying endpoint storage"),
         }

         let mut uncompressed = Vec::new();
-        let lfc_state = res
-            .bytes()
-            .await
-            .context("getting request body from endpoint storage")?;
-        ZstdDecoder::new(lfc_state.iter().as_slice())
-            .read_to_end(&mut uncompressed)
-            .await
-            .context("decoding LFC state")?;
+        let lfc_state = select! {
+            _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
+            lfc_state = response.bytes() => lfc_state
+        }
+        .context("getting request body from endpoint storage")?;
+
+        let mut decoder = ZstdDecoder::new(lfc_state.iter().as_slice());
+        select! {
+            _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
+            read = decoder.read_to_end(&mut uncompressed) => read
+        }
+        .context("decoding LFC state")?;

         let uncompressed_len = uncompressed.len();
-        info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}");
+        info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into Postgres");

-        ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
+        // Client connection and prewarm info querying are fast and therefore don't need
+        // cancellation
+        let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
             .await
-            .context("connecting to postgres")?
-            .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed])
-            .await
-            .context("loading LFC state into postgres")
-            .map(|_| ())?;
+            .context("connecting to postgres")?;
+        let pg_token = client.cancel_token();

-        Ok(true)
+        let params: Vec<&(dyn postgres_types::ToSql + Sync)> = vec![&uncompressed];
+        select! {
+            res = client.query_one("select neon.prewarm_local_cache($1)", &params) => res,
+            _ = token.cancelled() => {
+                pg_token.cancel_query(postgres::NoTls).await
+                    .context("cancelling neon.prewarm_local_cache()")?;
+                return Ok(LfcPrewarmState::Cancelled)
+            }
+        }
+        .context("loading LFC state into postgres")
+        .map(|_| ())?;
+
+        Ok(LfcPrewarmState::Completed)
     }
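The rewritten prewarm path threads one `CancellationToken` through every await point: the HTTP request, the body download, the zstd decode, and the final `neon.prewarm_local_cache()` call each race against `token.cancelled()` in a `select!`. A stripped-down sketch of the same pattern, with a placeholder future standing in for the real work:

use tokio::select;
use tokio_util::sync::CancellationToken;

// Sketch: race a long-running step against cancellation. Returning None here
// plays the role of LfcPrewarmState::Cancelled above; the caller then installs
// a fresh token, since a CancellationToken stays cancelled once triggered.
async fn step_or_cancel(token: &CancellationToken) -> Option<Vec<u8>> {
    let work = async {
        // stand-in for request.send() / response.bytes() / decoder.read_to_end()
        vec![0u8; 16]
    };
    select! {
        _ = token.cancelled() => None,
        out = work => Some(out),
    }
}

The one step that cannot simply be dropped is the Postgres call, hence the extra `cancel_token()`/`cancel_query()` dance in the diff: dropping a `query_one` future abandons the client-side wait but would leave the statement running server-side.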
@@ -200,20 +225,20 @@ impl ComputeNode {

     async fn offload_lfc_with_state_update(&self) {
         crate::metrics::LFC_OFFLOADS.inc();

-        let Err(err) = self.offload_lfc_impl().await else {
-            self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
-            return;
+        let state = match self.offload_lfc_impl().await {
+            Ok(state) => state,
+            Err(err) => {
+                crate::metrics::LFC_OFFLOAD_ERRORS.inc();
+                error!(%err, "could not offload LFC");
+                let error = format!("{err:#}");
+                LfcOffloadState::Failed { error }
+            }
         };

-        crate::metrics::LFC_OFFLOAD_ERRORS.inc();
-        error!(%err, "could not offload LFC state to endpoint storage");
-        self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
-            error: format!("{err:#}"),
-        };
+        self.state.lock().unwrap().lfc_offload_state = state;
     }

-    async fn offload_lfc_impl(&self) -> Result<()> {
+    async fn offload_lfc_impl(&self) -> Result<LfcOffloadState> {
         let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
         info!(%url, "requesting LFC state from Postgres");
@@ -228,7 +253,7 @@ impl ComputeNode {
             .context("deserializing LFC state")?;
         let Some(state) = state else {
             info!(%url, "empty LFC state, not exporting");
-            return Ok(());
+            return Ok(LfcOffloadState::Skipped);
         };

         let mut compressed = Vec::new();
@@ -242,7 +267,7 @@ impl ComputeNode {

         let request = Client::new().put(url).bearer_auth(token).body(compressed);
         match request.send().await {
-            Ok(res) if res.status() == StatusCode::OK => Ok(()),
+            Ok(res) if res.status() == StatusCode::OK => Ok(LfcOffloadState::Completed),
             Ok(res) => bail!(
                 "Request to endpoint storage failed with status: {}",
                 res.status()
@@ -250,4 +275,8 @@ impl ComputeNode {
             Err(err) => Err(err).context("writing to endpoint storage"),
         }
     }
+
+    pub fn cancel_prewarm(self: &Arc<Self>) {
+        self.state.lock().unwrap().lfc_prewarm_token.cancel();
+    }
 }
@@ -78,7 +78,7 @@ impl ComputeNode {
         const RETRIES: i32 = 20;
         for i in 0..=RETRIES {
             let row = client
-                .query_one("SELECT pg_last_wal_replay_lsn()", &[])
+                .query_one("SELECT pg_catalog.pg_last_wal_replay_lsn()", &[])
                 .await
                 .context("getting last replay lsn")?;
             let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
@@ -103,7 +103,7 @@ impl ComputeNode {
             .await
             .context("setting safekeepers")?;
         client
-            .query("SELECT pg_reload_conf()", &[])
+            .query("SELECT pg_catalog.pg_reload_conf()", &[])
             .await
             .context("reloading postgres config")?;
@@ -113,7 +113,7 @@ impl ComputeNode {
|
||||
});
|
||||
|
||||
let row = client
|
||||
.query_one("SELECT * FROM pg_promote()", &[])
|
||||
.query_one("SELECT * FROM pg_catalog.pg_promote()", &[])
|
||||
.await
|
||||
.context("pg_promote")?;
|
||||
if !row.get::<usize, bool>(0) {
|
||||
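
Why these hunks pin every built-in to pg_catalog: an unqualified name like
pg_reload_conf() is resolved through search_path, so a lookalike function
created earlier on the path could shadow the built-in for a privileged caller.
Qualifying the name makes resolution independent of search_path. A minimal
hedged sketch of the safe form (assuming a tokio_postgres client as elsewhere
in this file):

// Resolves to the system function regardless of search_path:
let _ = client
    .query("SELECT pg_catalog.pg_reload_conf()", &[])
    .await
    .context("reloading postgres config")?;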

@@ -18,6 +18,8 @@ use crate::pg_helpers::{
};
use crate::tls::{self, SERVER_CRT, SERVER_KEY};

use utils::shard::{ShardIndex, ShardNumber};

/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
@@ -69,9 +71,75 @@ pub fn write_postgres_conf(
}
// Add options for connecting to storage
writeln!(file, "# Neon storage settings")?;
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
writeln!(file)?;
if let Some(conninfo) = &spec.pageserver_connection_info {
let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
let num_shards = if conninfo.shard_count.0 == 0 {
1 // unsharded, treat it as a single shard
} else {
conninfo.shard_count.0
};

for shard_number in 0..num_shards {
let shard_index = ShardIndex {
shard_number: ShardNumber(shard_number),
shard_count: conninfo.shard_count,
};
let info = conninfo.shards.get(&shard_index).ok_or_else(|| {
anyhow::anyhow!(
"shard {shard_index} missing from pageserver_connection_info shard map"
)
})?;

let first_pageserver = info
.pageservers
.first()
.expect("must have at least one pageserver");

// Add the libpq URL to the array, or if the URL is missing, reset the array
// forgetting any previous entries. All servers must have a libpq URL, or none
// at all.
if let Some(url) = &first_pageserver.libpq_url {
if let Some(ref mut urls) = libpq_urls {
urls.push(url.clone());
}
} else {
libpq_urls = None
}
}
if let Some(libpq_urls) = libpq_urls {
writeln!(
file,
"# derived from compute spec's pageserver_conninfo field"
)?;
writeln!(
file,
"neon.pageserver_connstring={}",
escape_conf_value(&libpq_urls.join(","))
)?;
} else {
writeln!(file, "# no neon.pageserver_connstring")?;
}

if let Some(stripe_size) = conninfo.stripe_size {
writeln!(
file,
"# from compute spec's pageserver_conninfo.stripe_size field"
)?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
} else {
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "# from compute spec's pageserver_connstring field")?;
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
}

if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "# from compute spec's shard_stripe_size field")?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
}
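
To make the derivation concrete, a small hedged illustration (hypothetical
hosts and ports) of what the loop above writes: the first pageserver of each
shard contributes its libpq URL, joined in shard order into one GUC.

let urls = [
    "postgresql://no_user@ps-shard0:6400", // assumed shard 0 pageserver
    "postgresql://no_user@ps-shard1:6400", // assumed shard 1 pageserver
];
assert_eq!(
    urls.join(","),
    "postgresql://no_user@ps-shard0:6400,postgresql://no_user@ps-shard1:6400"
);
// postgresql.conf then carries, e.g.:
// # derived from compute spec's pageserver_conninfo field
// neon.pageserver_connstring='postgresql://no_user@ps-shard0:6400,postgresql://no_user@ps-shard1:6400'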

if !spec.safekeeper_connstrings.is_empty() {
let mut neon_safekeepers_value = String::new();
tracing::info!(

@@ -122,8 +122,11 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
// into the type system.
assert_eq!(state.status, ComputeStatus::RefreshConfiguration);

if state.pspec.as_ref().map(|ps| ps.pageserver_connstr.clone())
== Some(pspec.pageserver_connstr.clone())
if state
.pspec
.as_ref()
.map(|ps| ps.pageserver_conninfo.clone())
== Some(pspec.pageserver_conninfo.clone())
{
info!(
"Refresh configuration: Retrieved spec is the same as the current spec. Waiting for control plane to update the spec before attempting reconfiguration."

@@ -139,6 +139,15 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/LfcPrewarmState"
delete:
tags:
- Prewarm
summary: Cancel ongoing LFC prewarm
description: ""
operationId: cancelLfcPrewarm
responses:
202:
description: Prewarm cancelled

/lfc/offload:
post:
@@ -636,7 +645,7 @@ components:
properties:
status:
description: LFC offload status
enum: [not_offloaded, offloading, completed, failed]
enum: [not_offloaded, offloading, completed, skipped, failed]
type: string
error:
description: LFC offload error, if any

@@ -46,3 +46,8 @@ pub(in crate::http) async fn offload(compute: Compute) -> Response {
)
}
}

pub(in crate::http) async fn cancel_prewarm(compute: Compute) -> StatusCode {
compute.cancel_prewarm();
StatusCode::ACCEPTED
}

@@ -99,7 +99,12 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
);

let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
.route(
"/lfc/prewarm",
get(lfc::prewarm_state)
.post(lfc::prewarm)
.delete(lfc::cancel_prewarm),
)
.route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
.route("/promote", post(promote::promote))
.route("/check_writability", post(check_writability::is_writable))
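
Taken together with the OpenAPI change above, cancellation becomes a DELETE on
the same path that starts and polls a prewarm. A hedged client-side sketch
(reqwest assumed; base URL and token are placeholders):

async fn cancel_prewarm(base: &str, token: &str) -> anyhow::Result<()> {
    let status = reqwest::Client::new()
        .delete(format!("{base}/lfc/prewarm"))
        .bearer_auth(token)
        .send()
        .await?
        .status();
    // The handler replies 202 Accepted; the cancellation itself completes
    // asynchronously and is observable via GET /lfc/prewarm.
    anyhow::ensure!(status == reqwest::StatusCode::ACCEPTED, "unexpected status {status}");
    Ok(())
}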

@@ -19,7 +19,7 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>, PostgresError> {
.query(
"SELECT datname FROM pg_catalog.pg_database
WHERE datallowconn
AND datconnlimit <> - 2
AND datconnlimit OPERATOR(pg_catalog.<>) (OPERATOR(pg_catalog.-) 2::pg_catalog.int4)
LIMIT 500",
&[],
)
@@ -67,7 +67,7 @@ pub async fn get_installed_extensions(

let extensions: Vec<(String, String, i32)> = client
.query(
"SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
"SELECT extname, extversion, extowner::pg_catalog.int4 FROM pg_catalog.pg_extension",
&[],
)
.await?

@@ -4,14 +4,13 @@ use std::thread;
use std::time::{Duration, SystemTime};

use anyhow::{Result, bail};
use compute_api::spec::{ComputeMode, PageserverProtocol};
use itertools::Itertools as _;
use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverProtocol};
use pageserver_page_api as page_api;
use postgres::{NoTls, SimpleQueryMessage};
use tracing::{info, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::shard::{ShardCount, ShardNumber, TenantShardId};
use utils::shard::TenantShardId;

use crate::compute::ComputeNode;

@@ -78,17 +77,16 @@ fn acquire_lsn_lease_with_retry(

loop {
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
let (connstrings, auth) = {
let (conninfo, auth) = {
let state = compute.state.lock().unwrap();
let spec = state.pspec.as_ref().expect("spec must be set");
(
spec.pageserver_connstr.clone(),
spec.pageserver_conninfo.clone(),
spec.storage_auth_token.clone(),
)
};

let result =
try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn);
let result = try_acquire_lsn_lease(conninfo, auth.as_deref(), tenant_id, timeline_id, lsn);
match result {
Ok(Some(res)) => {
return Ok(res);
@@ -112,35 +110,44 @@ fn acquire_lsn_lease_with_retry(

/// Tries to acquire LSN leases on all Pageserver shards.
fn try_acquire_lsn_lease(
connstrings: &str,
conninfo: PageserverConnectionInfo,
auth: Option<&str>,
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<Option<SystemTime>> {
let connstrings = connstrings.split(',').collect_vec();
let shard_count = connstrings.len();
let mut leases = Vec::new();

for (shard_number, &connstring) in connstrings.iter().enumerate() {
let tenant_shard_id = match shard_count {
0 | 1 => TenantShardId::unsharded(tenant_id),
shard_count => TenantShardId {
tenant_id,
shard_number: ShardNumber(shard_number as u8),
shard_count: ShardCount::new(shard_count as u8),
},
for (shard_index, shard) in conninfo.shards.into_iter() {
let tenant_shard_id = TenantShardId {
tenant_id,
shard_number: shard_index.shard_number,
shard_count: shard_index.shard_count,
};

let lease = match PageserverProtocol::from_connstring(connstring)? {
PageserverProtocol::Libpq => {
acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)?
}
PageserverProtocol::Grpc => {
acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)?
}
};
leases.push(lease);
// XXX: If there is more than one pageserver for a shard, do we need to get a
// lease on all of them? Currently, that's what we assume, but this is hypothetical
// as of this writing, as we never pass the info for more than one pageserver per
// shard.
for pageserver in shard.pageservers {
let lease = match conninfo.prefer_protocol {
PageserverProtocol::Grpc => acquire_lsn_lease_grpc(
&pageserver.grpc_url.unwrap(),
auth,
tenant_shard_id,
timeline_id,
lsn,
)?,
PageserverProtocol::Libpq => acquire_lsn_lease_libpq(
&pageserver.libpq_url.unwrap(),
auth,
tenant_shard_id,
timeline_id,
lsn,
)?,
};
leases.push(lease);
}
}

Ok(leases.into_iter().min().flatten())
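
The final line is doing real work: Option<T> orders None before Some(_), so the
minimum over the per-shard results is None as soon as any shard failed to grant
a lease, and otherwise the earliest expiry, which is the lease that actually
constrains the tenant. A small hedged illustration:

use std::time::{Duration, SystemTime};

let now = SystemTime::now();
let leases = vec![Some(now + Duration::from_secs(60)), Some(now + Duration::from_secs(30))];
// Earliest expiry wins:
assert_eq!(leases.into_iter().min().flatten(), Some(now + Duration::from_secs(30)));
// Any missing lease poisons the result:
let with_failure = vec![Some(now), None];
assert_eq!(with_failure.into_iter().min().flatten(), None);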

@@ -76,7 +76,7 @@ impl<'m> MigrationRunner<'m> {
self.client
.simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")
.await?;
self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?;
self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key pg_catalog.int4 NOT NULL PRIMARY KEY, id pg_catalog.int8 NOT NULL DEFAULT 0)").await?;
self.client
.simple_query(
"INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING",

@@ -15,17 +15,17 @@ DO $$
DECLARE
role_name text;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, '{privileged_role_name}', 'member')
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', pg_catalog.quote_ident(role_name);
EXECUTE pg_catalog.format('ALTER ROLE %I INHERIT;', role_name);
END LOOP;

FOR role_name IN SELECT rolname FROM pg_roles
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles
WHERE
NOT pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT starts_with(rolname, 'pg_')
NOT pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT pg_catalog.starts_with(rolname, 'pg_')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', pg_catalog.quote_ident(role_name);
EXECUTE pg_catalog.format('ALTER ROLE %I NOBYPASSRLS;', role_name);
END LOOP;
END $$;
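
The migrations above also swap string concatenation with quote_ident() for
pg_catalog.format() with %I, which quotes the identifier in place and keeps the
statement template in one readable piece. A hedged sketch of the same pattern
driven from Rust (illustrative role name and timeout; `client` stands in for
the tokio_postgres client used by the migration runner):

client
    .simple_query(
        "DO $$ BEGIN \
           EXECUTE pg_catalog.format( \
             'ALTER ROLE %I SET statement_timeout = %L', \
             'some_role', '60s'); \
         END $$;",
    )
    .await?;
// %I quotes identifiers (like quote_ident), %L quotes literals, and the
// pg_catalog prefix keeps even format() itself immune to search_path tricks.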

@@ -1,6 +1,6 @@
DO $$
BEGIN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}';
END IF;
END $$;

@@ -5,9 +5,9 @@ DO $$
DECLARE
role_name TEXT;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE rolreplication IS TRUE
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', pg_catalog.quote_ident(role_name);
EXECUTE pg_catalog.format('ALTER ROLE %I NOREPLICATION;', role_name);
END LOOP;
END $$;

@@ -1,6 +1,6 @@
DO $$
BEGIN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name OPERATOR(pg_catalog.=) 'server_version_num'::pg_catalog.text) THEN
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}';
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}';
END IF;

@@ -2,7 +2,7 @@ DO $$
DECLARE
bypassrls boolean;
BEGIN
SELECT rolbypassrls INTO bypassrls FROM pg_roles WHERE rolname = 'neon_superuser';
SELECT rolbypassrls INTO bypassrls FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser';
IF NOT bypassrls THEN
RAISE EXCEPTION 'neon_superuser cannot bypass RLS';
END IF;

@@ -4,8 +4,8 @@ DECLARE
BEGIN
FOR role IN
SELECT rolname AS name, rolinherit AS inherit
FROM pg_roles
WHERE pg_has_role(rolname, 'neon_superuser', 'member')
FROM pg_catalog.pg_roles
WHERE pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member')
LOOP
IF NOT role.inherit THEN
RAISE EXCEPTION '% cannot inherit', quote_ident(role.name);
@@ -14,12 +14,12 @@ BEGIN

FOR role IN
SELECT rolname AS name, rolbypassrls AS bypassrls
FROM pg_roles
WHERE NOT pg_has_role(rolname, 'neon_superuser', 'member')
AND NOT starts_with(rolname, 'pg_')
FROM pg_catalog.pg_roles
WHERE NOT pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member')
AND NOT pg_catalog.starts_with(rolname, 'pg_')
LOOP
IF role.bypassrls THEN
RAISE EXCEPTION '% can bypass RLS', quote_ident(role.name);
RAISE EXCEPTION '% can bypass RLS', pg_catalog.quote_ident(role.name);
END IF;
END LOOP;
END $$;

@@ -1,10 +1,10 @@
DO $$
BEGIN
IF (SELECT current_setting('server_version_num')::numeric < 160000) THEN
IF (SELECT pg_catalog.current_setting('server_version_num')::pg_catalog.numeric < 160000) THEN
RETURN;
END IF;

IF NOT (SELECT pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN
IF NOT (SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN
RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription';
END IF;
END $$;

@@ -2,12 +2,12 @@ DO $$
DECLARE
monitor record;
BEGIN
SELECT pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member,
SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member,
admin_option AS admin
INTO monitor
FROM pg_auth_members
WHERE roleid = 'pg_monitor'::regrole
AND member = 'neon_superuser'::regrole;
FROM pg_catalog.pg_auth_members
WHERE roleid = 'pg_monitor'::pg_catalog.regrole
AND member = 'neon_superuser'::pg_catalog.regrole;

IF monitor IS NULL THEN
RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';

@@ -2,11 +2,11 @@ DO $$
DECLARE
can_execute boolean;
BEGIN
SELECT bool_and(has_function_privilege('neon_superuser', oid, 'execute'))
SELECT pg_catalog.bool_and(pg_catalog.has_function_privilege('neon_superuser', oid, 'execute'))
INTO can_execute
FROM pg_proc
FROM pg_catalog.pg_proc
WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot')
AND pronamespace = 'pg_catalog'::regnamespace;
AND pronamespace = 'pg_catalog'::pg_catalog.regnamespace;
IF NOT can_execute THEN
RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot';
END IF;

@@ -2,9 +2,9 @@ DO $$
DECLARE
can_execute boolean;
BEGIN
SELECT has_function_privilege('neon_superuser', oid, 'execute')
SELECT pg_catalog.has_function_privilege('neon_superuser', oid, 'execute')
INTO can_execute
FROM pg_proc
FROM pg_catalog.pg_proc
WHERE proname = 'pg_show_replication_origin_status'
AND pronamespace = 'pg_catalog'::regnamespace;
IF NOT can_execute THEN

@@ -2,10 +2,10 @@ DO $$
DECLARE
signal_backend record;
BEGIN
SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
admin_option AS admin
INTO signal_backend
FROM pg_auth_members
FROM pg_catalog.pg_auth_members
WHERE roleid = 'pg_signal_backend'::regrole
AND member = 'neon_superuser'::regrole;


@@ -407,9 +407,9 @@ fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> {
// like `postgres_exporter` use it to query Postgres statistics.
// Use explicit 8 bytes type casts to match Rust types.
let stats = cli.query_one(
"SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time,
coalesce(sum(sessions), 0)::bigint AS total_sessions
FROM pg_stat_database
"SELECT pg_catalog.coalesce(pg_catalog.sum(active_time), 0.0)::pg_catalog.float8 AS total_active_time,
pg_catalog.coalesce(pg_catalog.sum(sessions), 0)::pg_catalog.bigint AS total_sessions
FROM pg_catalog.pg_stat_database
WHERE datname NOT IN (
'postgres',
'template0',
@@ -445,11 +445,11 @@ fn get_backends_state_change(cli: &mut Client) -> anyhow::Result<Option<DateTime
let mut last_active: Option<DateTime<Utc>> = None;
// Get all running client backends except ourself, use RFC3339 DateTime format.
let backends = cli.query(
"SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
"SELECT state, pg_catalog.to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"'::pg_catalog.text) AS state_change
FROM pg_stat_activity
WHERE backend_type = 'client backend'
AND pid != pg_backend_pid()
AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
WHERE backend_type OPERATOR(pg_catalog.=) 'client backend'::pg_catalog.text
AND pid OPERATOR(pg_catalog.!=) pg_catalog.pg_backend_pid()
AND usename OPERATOR(pg_catalog.!=) 'cloud_admin'::pg_catalog.name;", // XXX: find a better way to filter other monitors?
&[],
);


@@ -299,9 +299,9 @@ pub async fn get_existing_dbs_async(
.query_raw::<str, &String, &[String; 0]>(
"SELECT
datname AS name,
(SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner,
(SELECT rolname FROM pg_catalog.pg_roles WHERE oid OPERATOR(pg_catalog.=) datdba) AS owner,
NOT datallowconn AS restrict_conn,
datconnlimit = - 2 AS invalid
datconnlimit OPERATOR(pg_catalog.=) (OPERATOR(pg_catalog.-) 2) AS invalid
FROM
pg_catalog.pg_database;",
&[],

@@ -13,17 +13,19 @@ use tokio_postgres::Client;
use tokio_postgres::error::SqlState;
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};

use crate::compute::{ComputeNode, ComputeNodeParams, ComputeState};
use crate::compute::{ComputeNode, ComputeNodeParams, ComputeState, create_databricks_roles};
use crate::hadron_metrics::COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS;
use crate::pg_helpers::{
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
get_existing_roles_async,
};
use crate::spec_apply::ApplySpecPhase::{
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreatePgauditExtension,
AddDatabricksGrants, AlterDatabricksRoles, CreateAndAlterDatabases, CreateAndAlterRoles,
CreateAvailabilityCheck, CreateDatabricksMisc, CreateDatabricksRoles, CreatePgauditExtension,
CreatePgauditlogtofileExtension, CreatePrivilegedRole, CreateSchemaNeon,
DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
RunInEachDatabase,
HandleDatabricksAuthExtension, HandleNeonExtension, HandleOtherExtensions,
RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
};
use crate::spec_apply::PerDatabasePhase::{
ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions,
@@ -80,7 +82,7 @@ impl ComputeNode {
info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id);

drop_subscriptions_done = match
client.query("select 1 from neon.drop_subscriptions_done where timeline_id = $1", &[&timeline_id.to_string()]).await {
client.query("select 1 from neon.drop_subscriptions_done where timeline_id OPERATOR(pg_catalog.=) $1", &[&timeline_id.to_string()]).await {
Ok(result) => !result.is_empty(),
Err(e) =>
{
@@ -166,6 +168,7 @@ impl ComputeNode {
concurrency_token.clone(),
db,
[DropLogicalSubscriptions].to_vec(),
self.params.lakebase_mode,
);

Ok(tokio::spawn(fut))
@@ -186,15 +189,33 @@ impl ComputeNode {
};
}

for phase in [
CreatePrivilegedRole,
let phases = if self.params.lakebase_mode {
vec![
CreatePrivilegedRole,
// BEGIN_HADRON
CreateDatabricksRoles,
AlterDatabricksRoles,
// END_HADRON
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
RenameAndDeleteDatabases,
CreateAndAlterDatabases,
CreateSchemaNeon,
] {
]
} else {
vec![
CreatePrivilegedRole,
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
RenameAndDeleteDatabases,
CreateAndAlterDatabases,
CreateSchemaNeon,
]
};

for phase in phases {
info!("Applying phase {:?}", &phase);
apply_operations(
params.clone(),
@@ -203,6 +224,7 @@ impl ComputeNode {
jwks_roles.clone(),
phase,
|| async { Ok(&client) },
self.params.lakebase_mode,
)
.await?;
}
@@ -254,6 +276,7 @@ impl ComputeNode {
concurrency_token.clone(),
db,
phases,
self.params.lakebase_mode,
);

Ok(tokio::spawn(fut))
@@ -265,12 +288,28 @@ impl ComputeNode {
handle.await??;
}

let mut phases = vec![
let mut phases = if self.params.lakebase_mode {
vec![
HandleOtherExtensions,
HandleNeonExtension, // This step depends on CreateSchemaNeon
// BEGIN_HADRON
HandleDatabricksAuthExtension,
// END_HADRON
CreateAvailabilityCheck,
DropRoles,
// BEGIN_HADRON
AddDatabricksGrants,
CreateDatabricksMisc,
// END_HADRON
]
} else {
vec![
HandleOtherExtensions,
HandleNeonExtension, // This step depends on CreateSchemaNeon
CreateAvailabilityCheck,
DropRoles,
];
]
};

// This step depends on CreateSchemaNeon
if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
@@ -303,6 +342,7 @@ impl ComputeNode {
jwks_roles.clone(),
phase,
|| async { Ok(&client) },
self.params.lakebase_mode,
)
.await?;
}
@@ -328,6 +368,7 @@ impl ComputeNode {
concurrency_token: Arc<tokio::sync::Semaphore>,
db: DB,
subphases: Vec<PerDatabasePhase>,
lakebase_mode: bool,
) -> Result<()> {
let _permit = concurrency_token.acquire().await?;

@@ -355,6 +396,7 @@ impl ComputeNode {
let client = client_conn.as_ref().unwrap();
Ok(client)
},
lakebase_mode,
)
.await?;
}
@@ -477,6 +519,10 @@ pub enum PerDatabasePhase {
#[derive(Clone, Debug)]
pub enum ApplySpecPhase {
CreatePrivilegedRole,
// BEGIN_HADRON
CreateDatabricksRoles,
AlterDatabricksRoles,
// END_HADRON
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
@@ -489,7 +535,14 @@ pub enum ApplySpecPhase {
DisablePostgresDBPgAudit,
HandleOtherExtensions,
HandleNeonExtension,
// BEGIN_HADRON
HandleDatabricksAuthExtension,
// END_HADRON
CreateAvailabilityCheck,
// BEGIN_HADRON
AddDatabricksGrants,
CreateDatabricksMisc,
// END_HADRON
DropRoles,
FinalizeDropLogicalSubscriptions,
}
@@ -525,6 +578,7 @@ pub async fn apply_operations<'a, Fut, F>(
jwks_roles: Arc<HashSet<String>>,
apply_spec_phase: ApplySpecPhase,
client: F,
lakebase_mode: bool,
) -> Result<()>
where
F: FnOnce() -> Fut,
@@ -571,6 +625,23 @@ where
},
query
);
if !lakebase_mode {
return res;
}
// BEGIN HADRON
if let Err(e) = res.as_ref() {
if let Some(sql_state) = e.code() {
if sql_state.code() == "57014" {
// SQL State 57014 (ERRCODE_QUERY_CANCELED) is used for statement timeouts.
// Increment the counter whenever a statement timeout occurs. Timeouts on
// this configuration path can only occur due to PS connectivity problems that
// Postgres failed to recover from.
COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS.inc();
}
}
}
// END HADRON

res
}
.instrument(inspan)
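
As an aside on the error check above, tokio_postgres also exposes the SQLSTATE
as a typed constant, so an equivalent hedged formulation (a sketch; `res` stands
for the query result being inspected) avoids the string literal:

use tokio_postgres::error::SqlState;

if let Err(e) = res.as_ref() {
    // 57014 is ERRCODE_QUERY_CANCELED, raised when statement_timeout fires.
    if e.code() == Some(&SqlState::QUERY_CANCELED) {
        COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS.inc();
    }
}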
@@ -608,10 +679,44 @@ async fn get_operations<'a>(
ApplySpecPhase::CreatePrivilegedRole => Ok(Box::new(once(Operation {
query: format!(
include_str!("sql/create_privileged_role.sql"),
privileged_role_name = params.privileged_role_name
privileged_role_name = params.privileged_role_name,
privileges = if params.lakebase_mode {
"CREATEDB CREATEROLE NOLOGIN BYPASSRLS"
} else {
"CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS"
}
),
comment: None,
}))),
// BEGIN_HADRON
// New Hadron phase
ApplySpecPhase::CreateDatabricksRoles => {
let queries = create_databricks_roles();
let operations = queries.into_iter().map(|query| Operation {
query,
comment: None,
});
Ok(Box::new(operations))
}

// Backfill existing databricks_reader_* roles with statement timeout from GUC
ApplySpecPhase::AlterDatabricksRoles => {
let query = String::from(include_str!(
"sql/alter_databricks_reader_roles_timeout.sql"
));

let operations = once(Operation {
query,
comment: Some(
"Backfill existing databricks_reader_* roles with statement timeout"
.to_string(),
),
});

Ok(Box::new(operations))
}
// End of new Hadron Phase
// END_HADRON
ApplySpecPhase::DropInvalidDatabases => {
let mut ctx = ctx.write().await;
let databases = &mut ctx.dbs;
@@ -981,7 +1086,10 @@ async fn get_operations<'a>(
// N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
role_name = escaped_role,
outer_tag = outer_tag,
),
)
// HADRON change:
.replace("neon_superuser", &params.privileged_role_name),
// HADRON change end ,
comment: None,
},
// This now will only drop privileges of the role
@@ -1017,7 +1125,8 @@ async fn get_operations<'a>(
comment: None,
},
Operation {
query: String::from(include_str!("sql/default_grants.sql")),
query: String::from(include_str!("sql/default_grants.sql"))
.replace("neon_superuser", &params.privileged_role_name),
comment: None,
},
]
@@ -1033,7 +1142,9 @@ async fn get_operations<'a>(
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
if libs.contains("pg_stat_statements") {
return Ok(Box::new(once(Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"),
query: String::from(
"CREATE EXTENSION IF NOT EXISTS pg_stat_statements WITH SCHEMA public",
),
comment: Some(String::from("create system extensions")),
})));
}
@@ -1041,11 +1152,13 @@ async fn get_operations<'a>(
Ok(Box::new(empty()))
}
ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"),
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit WITH SCHEMA public"),
comment: Some(String::from("create pgaudit extensions")),
}))),
ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"),
query: String::from(
"CREATE EXTENSION IF NOT EXISTS pgauditlogtofile WITH SCHEMA public",
),
comment: Some(String::from("create pgauditlogtofile extensions")),
}))),
// Disable pgaudit logging for postgres database.
@@ -1069,7 +1182,7 @@ async fn get_operations<'a>(
},
Operation {
query: String::from(
"UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'",
"UPDATE pg_catalog.pg_extension SET extrelocatable = true WHERE extname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name AND extrelocatable OPERATOR(pg_catalog.=) false",
),
comment: Some(String::from("compat/fix: make neon relocatable")),
},
@@ -1086,6 +1199,28 @@ async fn get_operations<'a>(

Ok(Box::new(operations))
}
// BEGIN_HADRON
// Note: we may want to version the extension someday, but for now we just drop it and recreate it.
ApplySpecPhase::HandleDatabricksAuthExtension => {
let operations = vec![
Operation {
query: String::from("DROP EXTENSION IF EXISTS databricks_auth"),
comment: Some(String::from("dropping existing databricks_auth extension")),
},
Operation {
query: String::from("CREATE EXTENSION databricks_auth"),
comment: Some(String::from("creating databricks_auth extension")),
},
Operation {
query: String::from("GRANT SELECT ON databricks_auth_metrics TO pg_monitor"),
comment: Some(String::from("grant select on databricks auth counters")),
},
]
.into_iter();

Ok(Box::new(operations))
}
// END_HADRON
ApplySpecPhase::CreateAvailabilityCheck => Ok(Box::new(once(Operation {
query: String::from(include_str!("sql/add_availabilitycheck_tables.sql")),
comment: None,
@@ -1103,6 +1238,63 @@ async fn get_operations<'a>(

Ok(Box::new(operations))
}

// BEGIN_HADRON
// New Hadron phases
//
// Grants permissions to roles that are used by Databricks.
ApplySpecPhase::AddDatabricksGrants => {
let operations = vec![
Operation {
query: String::from("GRANT USAGE ON SCHEMA neon TO databricks_monitor"),
comment: Some(String::from(
"Permissions needed to execute neon.* functions (in the postgres database)",
)),
},
Operation {
query: String::from(
"GRANT SELECT, INSERT, UPDATE ON health_check TO databricks_monitor",
),
comment: Some(String::from("Permissions needed for read and write probes")),
},
Operation {
query: String::from(
"GRANT EXECUTE ON FUNCTION pg_ls_dir(text) TO databricks_monitor",
),
comment: Some(String::from(
"Permissions needed to monitor .snap file counts",
)),
},
Operation {
query: String::from(
"GRANT SELECT ON neon.neon_perf_counters TO databricks_monitor",
),
comment: Some(String::from(
"Permissions needed to access neon performance counters view",
)),
},
Operation {
query: String::from(
"GRANT EXECUTE ON FUNCTION neon.get_perf_counters() TO databricks_monitor",
),
comment: Some(String::from(
"Permissions needed to execute the underlying performance counters function",
)),
},
]
.into_iter();

Ok(Box::new(operations))
}
// Creates minor objects that are used by Databricks.
ApplySpecPhase::CreateDatabricksMisc => Ok(Box::new(once(Operation {
query: String::from(include_str!("sql/create_databricks_misc.sql")),
comment: Some(String::from(
"The function databricks_monitor uses to convert exception to 0 or 1",
)),
}))),
// End of new Hadron phases
// END_HADRON
ApplySpecPhase::FinalizeDropLogicalSubscriptions => Ok(Box::new(once(Operation {
query: String::from(include_str!("sql/finalize_drop_subscriptions.sql")),
comment: None,

@@ -3,16 +3,17 @@ BEGIN
IF NOT EXISTS(
SELECT 1
FROM pg_catalog.pg_tables
WHERE tablename = 'health_check'
WHERE tablename::pg_catalog.name OPERATOR(pg_catalog.=) 'health_check'::pg_catalog.name
AND schemaname::pg_catalog.name OPERATOR(pg_catalog.=) 'public'::pg_catalog.name
)
THEN
CREATE TABLE health_check (
id serial primary key,
updated_at timestamptz default now()
CREATE TABLE public.health_check (
id pg_catalog.int4 primary key generated by default as identity,
updated_at pg_catalog.timestamptz default pg_catalog.now()
);
INSERT INTO health_check VALUES (1, now())
INSERT INTO public.health_check VALUES (1, pg_catalog.now())
ON CONFLICT (id) DO UPDATE
SET updated_at = now();
SET updated_at = pg_catalog.now();
END IF;
END
$$
@@ -0,0 +1,25 @@
DO $$
DECLARE
reader_role RECORD;
timeout_value TEXT;
BEGIN
-- Get the current GUC setting for reader statement timeout
SELECT current_setting('databricks.reader_statement_timeout', true) INTO timeout_value;

-- Only proceed if timeout_value is not null/empty and not '0' (disabled)
IF timeout_value IS NOT NULL AND timeout_value != '' AND timeout_value != '0' THEN
-- Find all databricks_reader_* roles and update their statement_timeout
FOR reader_role IN
SELECT r.rolname
FROM pg_roles r
WHERE r.rolname ~ '^databricks_reader_\d+$'
LOOP
-- Apply the timeout setting to the role (will overwrite existing setting)
EXECUTE format('ALTER ROLE %I SET statement_timeout = %L',
reader_role.rolname, timeout_value);

RAISE LOG 'Updated statement_timeout = % for role %', timeout_value, reader_role.rolname;
END LOOP;
END IF;
END
$$;
@@ -1,12 +0,0 @@
DO $$
DECLARE
query varchar;
BEGIN
FOR query IN SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {db_owner};'
FROM pg_proc p
JOIN pg_namespace nsp ON p.pronamespace = nsp.oid
WHERE nsp.nspname = 'anon' LOOP
EXECUTE query;
END LOOP;
END
$$;
15
compute_tools/src/sql/create_databricks_misc.sql
Normal file
@@ -0,0 +1,15 @@
ALTER ROLE databricks_monitor SET statement_timeout = '60s';

CREATE OR REPLACE FUNCTION health_check_write_succeeds()
RETURNS INTEGER AS $$
BEGIN
INSERT INTO health_check VALUES (1, now())
ON CONFLICT (id) DO UPDATE
SET updated_at = now();

RETURN 1;
EXCEPTION WHEN OTHERS THEN
RAISE EXCEPTION '[DATABRICKS_SMGR] health_check failed: [%] %', SQLSTATE, SQLERRM;
RETURN 0;
END;
$$ LANGUAGE plpgsql;
@@ -1,8 +1,8 @@
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}')
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname OPERATOR(pg_catalog.=) '{privileged_role_name}'::pg_catalog.name)
THEN
CREATE ROLE {privileged_role_name} CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
CREATE ROLE {privileged_role_name} {privileges} IN ROLE pg_read_all_data, pg_write_all_data;
END IF;
END
$$;

@@ -4,14 +4,14 @@ $$
IF EXISTS(
SELECT nspname
FROM pg_catalog.pg_namespace
WHERE nspname = 'public'
WHERE nspname OPERATOR(pg_catalog.=) 'public'
) AND
current_setting('server_version_num')::int / 10000 >= 15
pg_catalog.current_setting('server_version_num')::int OPERATOR(pg_catalog./) 10000 OPERATOR(pg_catalog.>=) 15
THEN
IF EXISTS(
SELECT rolname
FROM pg_catalog.pg_roles
WHERE rolname = 'web_access'
WHERE rolname OPERATOR(pg_catalog.=) 'web_access'
)
THEN
GRANT CREATE ON SCHEMA public TO web_access;
@@ -20,7 +20,7 @@ $$
IF EXISTS(
SELECT nspname
FROM pg_catalog.pg_namespace
WHERE nspname = 'public'
WHERE nspname OPERATOR(pg_catalog.=) 'public'
)
THEN
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;

@@ -2,11 +2,17 @@ DO ${outer_tag}$
DECLARE
subname TEXT;
BEGIN
LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE;
FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP
EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname);
EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
LOCK TABLE pg_catalog.pg_subscription IN ACCESS EXCLUSIVE MODE;
FOR subname IN
SELECT pg_subscription.subname
FROM pg_catalog.pg_subscription
WHERE subdbid OPERATOR(pg_catalog.=) (
SELECT oid FROM pg_database WHERE datname OPERATOR(pg_catalog.=) {datname_str}::pg_catalog.name
)
LOOP
EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I DISABLE;', subname);
EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
EXECUTE pg_catalog.format('DROP SUBSCRIPTION %I;', subname);
END LOOP;
END;
${outer_tag}$;

@@ -3,19 +3,19 @@ BEGIN
IF NOT EXISTS(
SELECT 1
FROM pg_catalog.pg_tables
WHERE tablename = 'drop_subscriptions_done'
AND schemaname = 'neon'
WHERE tablename OPERATOR(pg_catalog.=) 'drop_subscriptions_done'::pg_catalog.name
AND schemaname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name
)
THEN
CREATE TABLE neon.drop_subscriptions_done
(id serial primary key, timeline_id text);
(id pg_catalog.int4 primary key generated by default as identity, timeline_id pg_catalog.text);
END IF;

-- preserve the timeline_id of the last drop_subscriptions run
-- to ensure that the cleanup of a timeline is executed only once.
-- use upsert to avoid the table bloat in case of cascade branching (branch of a branch)
INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id'))
INSERT INTO neon.drop_subscriptions_done VALUES (1, pg_catalog.current_setting('neon.timeline_id'))
ON CONFLICT (id) DO UPDATE
SET timeline_id = current_setting('neon.timeline_id');
SET timeline_id = pg_catalog.current_setting('neon.timeline_id')::pg_catalog.text;
END
$$

@@ -15,15 +15,15 @@ BEGIN
WHERE schema_name IN ('public')
LOOP
FOR grantor IN EXECUTE
format(
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s',
pg_catalog.format(
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee OPERATOR(pg_catalog.=) %s',
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
quote_literal({role_name})
)
LOOP
EXECUTE format('SET LOCAL ROLE %I', grantor);
EXECUTE pg_catalog.format('SET LOCAL ROLE %I', grantor);

revoke_query := format(
revoke_query := pg_catalog.format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I',
schema,
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`

@@ -5,17 +5,17 @@ DO ${outer_tag}$
IF EXISTS(
SELECT nspname
FROM pg_catalog.pg_namespace
WHERE nspname = 'public'
WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.name
)
THEN
SELECT nspowner::regrole::text
FROM pg_catalog.pg_namespace
WHERE nspname = 'public'
WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.text
INTO schema_owner;

IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
IF schema_owner OPERATOR(pg_catalog.=) 'cloud_admin'::pg_catalog.text OR schema_owner OPERATOR(pg_catalog.=) 'zenith_admin'::pg_catalog.text
THEN
EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner});
EXECUTE pg_catalog.format('ALTER SCHEMA public OWNER TO %I', {db_owner});
END IF;
END IF;
END

@@ -3,10 +3,10 @@ DO ${outer_tag}$
IF EXISTS(
SELECT 1
FROM pg_catalog.pg_database
WHERE datname = {datname}
WHERE datname OPERATOR(pg_catalog.=) {datname}::pg_catalog.name
)
THEN
EXECUTE format('ALTER DATABASE %I is_template false', {datname});
EXECUTE pg_catalog.format('ALTER DATABASE %I is_template false', {datname});
END IF;
END
${outer_tag}$;

@@ -19,6 +19,9 @@ use compute_api::requests::ComputeClaimsScope;
use compute_api::spec::{ComputeMode, PageserverProtocol};
use control_plane::broker::StorageBroker;
use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode};
use control_plane::endpoint::{
local_pageserver_conf_to_conn_info, tenant_locate_response_to_conn_info,
};
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
use control_plane::local_env;
use control_plane::local_env::{
@@ -44,7 +47,6 @@ use pageserver_api::models::{
};
use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId};
use safekeeper_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
@@ -52,7 +54,6 @@ use safekeeper_api::{
};
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
use tokio::task::JoinSet;
use url::Host;
use utils::auth::{Claims, Scope};
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
use utils::lsn::Lsn;
@@ -1547,62 +1548,41 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
)?;
}

let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
// Use gRPC if requested.
let pageserver = if endpoint.grpc {
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
let (host, port) = parse_host_port(grpc_addr)?;
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
(PageserverProtocol::Grpc, host, port)
} else {
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
let port = port.unwrap_or(5432);
(PageserverProtocol::Libpq, host, port)
};
// If caller is telling us what pageserver to use, this is not a tenant which is
// fully managed by storage controller, therefore not sharded.
(vec![pageserver], DEFAULT_STRIPE_SIZE)
let prefer_protocol = if endpoint.grpc {
PageserverProtocol::Grpc
} else {
PageserverProtocol::Libpq
};

let mut pageserver_conninfo = if let Some(ps_id) = pageserver_id {
let conf = env.get_pageserver_conf(ps_id).unwrap();
local_pageserver_conf_to_conn_info(conf)?
} else {
// Look up the currently attached location of the tenant, and its striping metadata,
// to pass these on to postgres.
let storage_controller = StorageController::from_env(env);
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
let pageservers = futures::future::try_join_all(
locate_result.shards.into_iter().map(|shard| async move {
if let ComputeMode::Static(lsn) = endpoint.mode {
// Initialize LSN leases for static computes.
assert!(!locate_result.shards.is_empty());

// Initialize LSN leases for static computes.
if let ComputeMode::Static(lsn) = endpoint.mode {
futures::future::try_join_all(locate_result.shards.iter().map(
|shard| async move {
let conf = env.get_pageserver_conf(shard.node_id).unwrap();
let pageserver = PageServerNode::from_env(env, conf);

pageserver
.http_client
.timeline_init_lsn_lease(shard.shard_id, endpoint.timeline_id, lsn)
.await?;
}
.await
},
))
.await?;
}

let pageserver = if endpoint.grpc {
(
PageserverProtocol::Grpc,
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))?,
shard.listen_grpc_port.expect("no gRPC port"),
)
} else {
(
PageserverProtocol::Libpq,
Host::parse(&shard.listen_pg_addr)?,
shard.listen_pg_port,
)
};
anyhow::Ok(pageserver)
}),
)
.await?;
let stripe_size = locate_result.shard_params.stripe_size;

(pageservers, stripe_size)
tenant_locate_response_to_conn_info(&locate_result)?
};
assert!(!pageservers.is_empty());
pageserver_conninfo.prefer_protocol = prefer_protocol;

let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
let auth_token = if matches!(
@@ -1635,9 +1615,8 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
endpoint_storage_addr,
safekeepers_generation,
safekeepers,
pageservers,
pageserver_conninfo,
remote_ext_base_url: remote_ext_base_url.clone(),
shard_stripe_size: stripe_size.0 as usize,
create_test_user: args.create_test_user,
start_timeout: args.start_timeout,
autoprewarm: args.autoprewarm,
@@ -1654,37 +1633,29 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
.endpoints
.get(endpoint_id.as_str())
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
let pageservers = match args.pageserver_id {
let prefer_protocol = if endpoint.grpc {
PageserverProtocol::Grpc
} else {
PageserverProtocol::Libpq
};
let mut pageserver_conninfo = match args.pageserver_id {
Some(pageserver_id) => {
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(pageserver_id)?);

vec![(
PageserverProtocol::Libpq,
pageserver.pg_connection_config.host().clone(),
pageserver.pg_connection_config.port(),
)]
let conf = env.get_pageserver_conf(pageserver_id)?;
local_pageserver_conf_to_conn_info(conf)?
}
None => {
let storage_controller = StorageController::from_env(env);
storage_controller
.tenant_locate(endpoint.tenant_id)
.await?
.shards
.into_iter()
.map(|shard| {
(
PageserverProtocol::Libpq,
Host::parse(&shard.listen_pg_addr)
.expect("Storage controller reported malformed host"),
shard.listen_pg_port,
)
})
.collect::<Vec<_>>()
let locate_result =
storage_controller.tenant_locate(endpoint.tenant_id).await?;

tenant_locate_response_to_conn_info(&locate_result)?
}
};
pageserver_conninfo.prefer_protocol = prefer_protocol;

endpoint.update_pageservers_in_config(pageservers).await?;
endpoint
.update_pageservers_in_config(&pageserver_conninfo)
.await?;
}
EndpointCmd::Reconfigure(args) => {
let endpoint_id = &args.endpoint_id;
@@ -1692,51 +1663,30 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
.endpoints
.get(endpoint_id.as_str())
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
let conf = env.get_pageserver_conf(ps_id)?;
// Use gRPC if requested.
let pageserver = if endpoint.grpc {
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
let (host, port) = parse_host_port(grpc_addr)?;
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
(PageserverProtocol::Grpc, host, port)
} else {
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
let port = port.unwrap_or(5432);
(PageserverProtocol::Libpq, host, port)
};
vec![pageserver]

let prefer_protocol = if endpoint.grpc {
PageserverProtocol::Grpc
} else {
let storage_controller = StorageController::from_env(env);
storage_controller
.tenant_locate(endpoint.tenant_id)
.await?
.shards
.into_iter()
.map(|shard| {
// Use gRPC if requested.
if endpoint.grpc {
(
PageserverProtocol::Grpc,
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))
.expect("bad hostname"),
shard.listen_grpc_port.expect("no gRPC port"),
)
} else {
(
PageserverProtocol::Libpq,
Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
shard.listen_pg_port,
)
}
})
.collect::<Vec<_>>()
PageserverProtocol::Libpq
};
let mut pageserver_conninfo = if let Some(ps_id) = args.endpoint_pageserver_id {
let conf = env.get_pageserver_conf(ps_id)?;
local_pageserver_conf_to_conn_info(conf)?
} else {
// Look up the currently attached location of the tenant, and its striping metadata,
// to pass these on to postgres.
let storage_controller = StorageController::from_env(env);
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;

tenant_locate_response_to_conn_info(&locate_result)?
};
pageserver_conninfo.prefer_protocol = prefer_protocol;

// If --safekeepers argument is given, use only the listed
// safekeeper nodes; otherwise all from the env.
let safekeepers = parse_safekeepers(&args.safekeepers)?;
endpoint
.reconfigure(Some(pageservers), None, safekeepers, None)
.reconfigure(Some(&pageserver_conninfo), safekeepers, None)
.await?;
}
EndpointCmd::RefreshConfiguration(args) => {
|
||||
@@ -48,8 +48,12 @@ use compute_api::responses::{
|
||||
};
|
||||
use compute_api::spec::{
|
||||
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
|
||||
PgIdent, RemoteExtSpec, Role,
|
||||
PageserverShardInfo, PgIdent, RemoteExtSpec, Role,
|
||||
};
|
||||
|
||||
// re-export these, because they're used in the reconfigure() function
|
||||
pub use compute_api::spec::{PageserverConnectionInfo, PageserverShardConnectionInfo};
|
||||
|
||||
use jsonwebtoken::jwk::{
|
||||
AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
|
||||
OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse, RSAKeyParameters, RSAKeyType,
|
||||
@@ -64,7 +68,7 @@ use serde::{Deserialize, Serialize};
|
||||
use sha2::{Digest, Sha256};
|
||||
use spki::der::Decode;
|
||||
use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::fmt::Display;
|
||||
use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream};
|
||||
use std::path::PathBuf;
|
||||
@@ -73,11 +77,13 @@ use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use tracing::debug;
|
||||
use url::Host;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::shard::ShardStripeSize;
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
use x509_parser::parse_x509_certificate;
|
||||
|
||||
use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT as DEFAULT_PAGESERVER_GRPC_PORT;
|
||||
use postgres_connection::parse_host_port;
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
|
||||
@@ -440,9 +446,8 @@ pub struct EndpointStartArgs {
|
||||
pub endpoint_storage_addr: String,
|
||||
pub safekeepers_generation: Option<SafekeeperGeneration>,
|
||||
pub safekeepers: Vec<NodeId>,
|
||||
pub pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
pub pageserver_conninfo: PageserverConnectionInfo,
|
||||
pub remote_ext_base_url: Option<String>,
|
||||
pub shard_stripe_size: usize,
|
||||
pub create_test_user: bool,
|
||||
pub start_timeout: Duration,
|
||||
pub autoprewarm: bool,
|
||||
@@ -715,14 +720,6 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
|
||||
pageservers
|
||||
.iter()
|
||||
.map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}"))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
}
|
||||
|
||||
/// Map safekeepers ids to the actual connection strings.
|
||||
fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
|
||||
let mut safekeeper_connstrings = Vec::new();
|
||||
@@ -768,9 +765,6 @@ impl Endpoint {
|
||||
std::fs::remove_dir_all(self.pgdata())?;
|
||||
}
|
||||
|
||||
let pageserver_connstring = Self::build_pageserver_connstr(&args.pageservers);
|
||||
assert!(!pageserver_connstring.is_empty());
|
||||
|
||||
let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?;
|
||||
|
||||
// check for file remote_extensions_spec.json
|
||||
@@ -785,6 +779,44 @@ impl Endpoint {
|
||||
remote_extensions = None;
|
||||
};
|
||||
|
||||
        // For the sake of backwards-compatibility, also fill in 'pageserver_connstring'
        //
        // XXX: I believe this is not really needed, except to make
        // test_forward_compatibility happy.
        //
        // Use a closure so that we can conveniently return None in the middle of the
        // loop.
        let pageserver_connstring: Option<String> = (|| {
            let num_shards = args.pageserver_conninfo.shard_count.count();
            let mut connstrings = Vec::new();
            for shard_no in 0..num_shards {
                let shard_index = ShardIndex {
                    shard_count: args.pageserver_conninfo.shard_count,
                    shard_number: ShardNumber(shard_no),
                };
                let shard = args
                    .pageserver_conninfo
                    .shards
                    .get(&shard_index)
                    .ok_or_else(|| {
                        anyhow!(
                            "shard {} not found in pageserver_connection_info",
                            shard_index
                        )
                    })?;
                let pageserver = shard
                    .pageservers
                    .first()
                    .ok_or(anyhow!("must have at least one pageserver"))?;
                if let Some(libpq_url) = &pageserver.libpq_url {
                    connstrings.push(libpq_url.clone());
                } else {
                    return Ok::<_, anyhow::Error>(None);
                }
            }
            Ok(Some(connstrings.join(",")))
        })()?;

        // Create config file
        let config = {
            let mut spec = ComputeSpec {
@@ -829,13 +861,14 @@ impl Endpoint {
                branch_id: None,
                endpoint_id: Some(self.endpoint_id.clone()),
                mode: self.mode,
                pageserver_connstring: Some(pageserver_connstring),
                pageserver_connection_info: Some(args.pageserver_conninfo.clone()),
                pageserver_connstring,
                safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()),
                safekeeper_connstrings,
                storage_auth_token: args.auth_token.clone(),
                remote_extensions,
                pgbouncer_settings: None,
                shard_stripe_size: Some(args.shard_stripe_size),
                shard_stripe_size: args.pageserver_conninfo.stripe_size, // redundant with pageserver_connection_info.stripe_size
                local_proxy_config: None,
                reconfigure_concurrency: self.reconfigure_concurrency,
                drop_subscriptions_before_start: self.drop_subscriptions_before_start,
@@ -1019,7 +1052,7 @@ impl Endpoint {
    // Update the pageservers in the spec file of the endpoint. This is useful to test the spec refresh scenario.
    pub async fn update_pageservers_in_config(
        &self,
        pageservers: Vec<(PageserverProtocol, Host, u16)>,
        pageserver_conninfo: &PageserverConnectionInfo,
    ) -> Result<()> {
        let config_path = self.endpoint_path().join("config.json");
        let mut config: ComputeConfig = {
@@ -1027,10 +1060,8 @@ impl Endpoint {
            serde_json::from_reader(file)?
        };

        let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
        assert!(!pageserver_connstring.is_empty());
        let mut spec = config.spec.unwrap();
        spec.pageserver_connstring = Some(pageserver_connstring);
        spec.pageserver_connection_info = Some(pageserver_conninfo.clone());
        config.spec = Some(spec);

        let file = std::fs::File::create(&config_path)?;
@@ -1073,8 +1104,7 @@ impl Endpoint {

    pub async fn reconfigure(
        &self,
        pageservers: Option<Vec<(PageserverProtocol, Host, u16)>>,
        stripe_size: Option<ShardStripeSize>,
        pageserver_conninfo: Option<&PageserverConnectionInfo>,
        safekeepers: Option<Vec<NodeId>>,
        safekeeper_generation: Option<SafekeeperGeneration>,
    ) -> Result<()> {
@@ -1089,15 +1119,15 @@ impl Endpoint {
        let postgresql_conf = self.read_postgresql_conf()?;
        spec.cluster.postgresql_conf = Some(postgresql_conf);

        // If pageservers are not specified, don't change them.
        if let Some(pageservers) = pageservers {
            anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");

            let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
            spec.pageserver_connstring = Some(pageserver_connstr);
            if stripe_size.is_some() {
                spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
            }
        if let Some(pageserver_conninfo) = pageserver_conninfo {
            // If pageservers are provided, we need to ensure that they are not empty.
            // This is a requirement for the compute_ctl configuration.
            anyhow::ensure!(
                !pageserver_conninfo.shards.is_empty(),
                "no pageservers provided"
            );
            spec.pageserver_connection_info = Some(pageserver_conninfo.clone());
            spec.shard_stripe_size = pageserver_conninfo.stripe_size;
        }

        // If safekeepers are not specified, don't change them.
@@ -1146,11 +1176,9 @@ impl Endpoint {

    pub async fn reconfigure_pageservers(
        &self,
        pageservers: Vec<(PageserverProtocol, Host, u16)>,
        stripe_size: Option<ShardStripeSize>,
        pageservers: &PageserverConnectionInfo,
    ) -> Result<()> {
        self.reconfigure(Some(pageservers), stripe_size, None, None)
            .await
        self.reconfigure(Some(pageservers), None, None).await
    }

    pub async fn reconfigure_safekeepers(
@@ -1158,7 +1186,7 @@ impl Endpoint {
        safekeepers: Vec<NodeId>,
        generation: SafekeeperGeneration,
    ) -> Result<()> {
        self.reconfigure(None, None, Some(safekeepers), Some(generation))
        self.reconfigure(None, Some(safekeepers), Some(generation))
            .await
    }

@@ -1241,3 +1269,84 @@ impl Endpoint {
        )
    }
}

/// If the caller tells us which pageserver to use, this tenant is not fully managed by
/// the storage controller, and therefore not sharded.
pub fn local_pageserver_conf_to_conn_info(
    conf: &crate::local_env::PageServerConf,
) -> Result<PageserverConnectionInfo> {
    let libpq_url = {
        let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
        let port = port.unwrap_or(5432);
        Some(format!("postgres://no_user@{host}:{port}"))
    };
    let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
        let (host, port) = parse_host_port(grpc_addr)?;
        let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
        Some(format!("grpc://no_user@{host}:{port}"))
    } else {
        None
    };
    let ps_conninfo = PageserverShardConnectionInfo {
        id: Some(conf.id),
        libpq_url,
        grpc_url,
    };

    let shard_info = PageserverShardInfo {
        pageservers: vec![ps_conninfo],
    };

    let shards: HashMap<_, _> = vec![(ShardIndex::unsharded(), shard_info)]
        .into_iter()
        .collect();
    Ok(PageserverConnectionInfo {
        shard_count: ShardCount::unsharded(),
        stripe_size: None,
        shards,
        prefer_protocol: PageserverProtocol::default(),
    })
}

pub fn tenant_locate_response_to_conn_info(
    response: &pageserver_api::controller_api::TenantLocateResponse,
) -> Result<PageserverConnectionInfo> {
    let mut shards = HashMap::new();
    for shard in response.shards.iter() {
        tracing::info!("parsing {}", shard.listen_pg_addr);
        let libpq_url = {
            let host = &shard.listen_pg_addr;
            let port = shard.listen_pg_port;
            Some(format!("postgres://no_user@{host}:{port}"))
        };
        let grpc_url = if let Some(grpc_addr) = &shard.listen_grpc_addr {
            let host = grpc_addr;
            let port = shard.listen_grpc_port.expect("no gRPC port");
            Some(format!("grpc://no_user@{host}:{port}"))
        } else {
            None
        };

        let shard_info = PageserverShardInfo {
            pageservers: vec![PageserverShardConnectionInfo {
                id: Some(shard.node_id),
                libpq_url,
                grpc_url,
            }],
        };

        shards.insert(shard.shard_id.to_index(), shard_info);
    }

    let stripe_size = if response.shard_params.count.is_unsharded() {
        None
    } else {
        Some(response.shard_params.stripe_size)
    };
    Ok(PageserverConnectionInfo {
        shard_count: response.shard_params.count,
        stripe_size,
        shards,
        prefer_protocol: PageserverProtocol::default(),
    })
}

246
docs/rfcs/2025-07-07-node-deletion-api-improvement.md
Normal file
246
docs/rfcs/2025-07-07-node-deletion-api-improvement.md
Normal file
@@ -0,0 +1,246 @@
# Node deletion API improvement

Created on 2025-07-07
Implemented on _TBD_

## Summary

This RFC describes improvements to the storage controller API for gracefully deleting pageserver
nodes.

## Motivation

The basic node deletion API introduced in [#8333](https://github.com/neondatabase/neon/issues/8333)
has several limitations:

- Deleted nodes can re-add themselves if they restart (e.g., a flaky node that keeps restarting and
  we cannot reach via SSH to stop the pageserver). This issue has been resolved by the tombstone
  mechanism in [#12036](https://github.com/neondatabase/neon/issues/12036).
- The process of node deletion is not graceful, i.e. it just imitates a node failure.

In this context, "graceful" node deletion means that users do not experience any disruption or
negative effects, provided the system remains in a healthy state (i.e., the remaining pageservers
can handle the workload and all requirements are met). To achieve this, the system must perform
live migration of all tenant shards from the node being deleted while the node is still running
and continue processing all incoming requests. The node is removed only after all tenant shards
have been safely migrated.

Although live migrations can be achieved with the drain functionality, it leads to incorrect shard
placement, such as not matching availability zones. This results in unnecessary work to optimize
the placement that was just recently performed.

If we delete a node before its tenant shards are fully moved, the new node won't have all the
needed data (e.g. heatmaps) ready. This means user requests to the new node will be much slower at
first. If there are many tenant shards, this slowdown affects a huge number of users.

Graceful node deletion is more complicated and can introduce new issues. It takes longer because
live migration of each tenant shard can last several minutes. Using non-blocking accessors may
also cause deletion to wait if other processes are holding the inner state lock. It also gets
trickier because we need to handle other requests, like drain and fill, at the same time.

## Impacted components (e.g. pageserver, safekeeper, console, etc)

- storage controller
- pageserver (indirectly)

## Proposed implementation

### Tombstones

To resolve the problem of deleted nodes re-adding themselves, a tombstone mechanism was introduced
as part of the node's stored information. Each node has a separate `NodeLifecycle` field with two
possible states: `Active` and `Deleted`. When node deletion completes, the database row is not
deleted but instead has its `NodeLifecycle` column switched to `Deleted`. Nodes with a `Deleted`
lifecycle are treated as if the row is absent for most handlers, with several exceptions: reattach
and register functionality must be aware of tombstones. Additionally, new debug handlers are
available for listing and deleting tombstones via the `/debug/v1/tombstone` path.

### Gracefulness

The problem of making node deletion graceful is complex and involves several challenges:

- **Cancellable**: The operation must be cancellable to allow administrators to abort the process
  if needed, e.g. if run by mistake.
- **Non-blocking**: We don't want to block deployment operations like draining/filling on the node
  deletion process. We need clear policies for handling concurrent operations: what happens when a
  drain/fill request arrives while deletion is in progress, and what happens when a delete request
  arrives while drain/fill is in progress.
- **Persistent**: If the storage controller restarts during this long-running operation, we must
  preserve progress and automatically resume the deletion process after the storage controller
  restarts.
- **Migrated correctly**: We cannot simply use the existing drain mechanism for nodes scheduled
  for deletion, as this would move shards to irrelevant locations. The drain process expects the
  node to return, so it only moves shards to backup locations, not to their preferred AZs. It also
  leaves secondary locations unmoved. This could result in unnecessary load on the storage
  controller and inefficient resource utilization.
- **Force option**: Administrators need the ability to force immediate, non-graceful deletion when
  time constraints or emergency situations require it, bypassing the normal graceful migration
  process.

See below for a detailed breakdown of the proposed changes and mechanisms.

#### Node lifecycle

A new `NodeLifecycle` enum and a matching database field with these values:
- `Active`: The normal state. All operations are allowed.
- `ScheduledForDeletion`: The node is marked to be deleted soon. Deletion may be in progress or
  will happen later, but the node will eventually be removed. All operations are allowed.
- `Deleted`: The node is fully deleted. No operations are allowed, and the node cannot be brought
  back. The only action left is to remove its record from the database. Any attempt to register a
  node in this state will fail.

This state persists across storage controller restarts.
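
For concreteness, a minimal Rust sketch of the lifecycle type; only the three variants come from
this RFC, the derive list and the helper are illustrative:

```
/// Sketch of the node lifecycle described above.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum NodeLifecycle {
    /// Normal state; all operations are allowed.
    Active,
    /// Marked for deletion; the node will eventually be removed.
    ScheduledForDeletion,
    /// Fully deleted; only the tombstone row remains.
    Deleted,
}

impl NodeLifecycle {
    /// Whether a `node_register` call may bring this node back (hypothetical helper).
    pub fn allows_register(self) -> bool {
        !matches!(self, NodeLifecycle::Deleted)
    }
}
```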

**State transition**
```
             +--------------------+
   +---------|       Active       |<--------------------+
   |         +--------------------+                     |
   |                   ^                                |
   | start_node_delete | cancel_node_delete             |
   v                   |                                |
+----------------------------------+                    |
|       ScheduledForDeletion       |                    |
+----------------------------------+                    |
   |                                                    |
   |                                      node_register |
   |                                                    |
   | delete_node (at the finish)                        |
   |                                                    |
   v                                                    |
+---------+      tombstone_delete       +----------+    |
| Deleted |---------------------------->|  no row  |----+
+---------+                             +----------+
```

#### NodeSchedulingPolicy::Deleting

A `Deleting` variant is added to the `NodeSchedulingPolicy` enum. It means the deletion function
is running for the node right now. Only one node can have the `Deleting` policy at a time.

The `NodeSchedulingPolicy::Deleting` state is persisted in the database. However, after a storage
controller restart, any node previously marked as `Deleting` will have its scheduling policy reset
to `Pause`. The policy will only transition back to `Deleting` when the deletion operation is
actively started again, as triggered by the node's `NodeLifecycle::ScheduledForDeletion` state.

`NodeSchedulingPolicy` transition details (a sketch follows this list):
1. When `node_delete` begins, set the policy to `NodeSchedulingPolicy::Deleting`.
2. If `node_delete` is cancelled (for example, due to a concurrent drain operation), revert the
   policy to its previous value. The policy is persisted in the storcon DB.
3. After `node_delete` completes, the final value of the scheduling policy is irrelevant, since
   `NodeLifecycle::Deleted` prevents any further access to this field.

The deletion process cannot be initiated for nodes currently undergoing deployment-related
operations (`Draining`, `Filling`, or `PauseForRestart` policies). Deletion will only be triggered
once the node transitions to either the `Active` or `Pause` state.
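
A sketch of the bookkeeping in steps 1 and 2, with an in-memory stand-in for the persisted policy
column; all names besides the `Deleting` variant are illustrative:

```
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum NodeSchedulingPolicy {
    Active,
    Pause,
    Deleting,
    // Draining, Filling, PauseForRestart elided in this sketch
}

/// Steps 1-2 above: set `Deleting` for the duration of the attempt and restore
/// the previous (persisted) policy if the attempt is cancelled or fails.
fn with_deleting_policy(
    persisted_policy: &mut NodeSchedulingPolicy,
    delete_attempt: impl FnOnce() -> Result<(), String>,
) -> Result<(), String> {
    let previous = *persisted_policy;
    *persisted_policy = NodeSchedulingPolicy::Deleting;
    let result = delete_attempt();
    if result.is_err() {
        // Cancelled: revert to the previously persisted value (step 2).
        *persisted_policy = previous;
    }
    result
}
```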

#### OperationTracker

A replacement for `Option<OperationHandler> ongoing_operation`, the `OperationTracker` is a
dedicated service state object responsible for managing all long-running node operations (drain,
fill, delete) with robust concurrency control.

Key responsibilities:
- Orchestrates the execution of operations
- Supports cancellation of currently running operations
- Enforces operation constraints, e.g. allowing only a single drain/fill operation at a time
- Persists deletion state, enabling recovery of pending deletions across restarts
- Ensures thread safety across concurrent requests
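
A minimal sketch of the tracker's shape, assuming a mutex-guarded single slot; the real object
would also carry cancellation tokens and persistence hooks:

```
use std::sync::Mutex;

/// Illustrative stand-in for the long-running operations the tracker manages.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ActiveOperation {
    Drain(u64),  // node id
    Fill(u64),
    Delete(u64),
}

/// Only one long-running operation may be active at a time.
struct OperationTracker {
    active: Mutex<Option<ActiveOperation>>,
}

impl OperationTracker {
    /// Claim the slot, or report which operation is already running.
    fn try_start(&self, op: ActiveOperation) -> Result<(), String> {
        let mut active = self.active.lock().unwrap();
        match *active {
            Some(existing) => Err(format!("{existing:?} is already running")),
            None => {
                *active = Some(op);
                Ok(())
            }
        }
    }

    /// Release the slot when an operation finishes or is cancelled.
    fn finish(&self) {
        *self.active.lock().unwrap() = None;
    }
}
```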

#### Attached tenant shard processing

When deleting a node, handle each attached tenant shard as follows:

1. Pick the best node to become the new attached (the candidate).
2. If the candidate already has this shard as a secondary:
   - Create a new secondary for the shard on another suitable node.
   Otherwise:
   - Create a secondary for the shard on the candidate node.
3. Wait until all secondaries are ready and pre-warmed.
4. Promote the candidate's secondary to attached.
5. Remove the secondary from the node being deleted.

This process safely moves all attached shards before deleting the node.
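
The same order of operations as a sketch over a toy in-memory model; the `Shard` type, plain node
ids, and the pre-chosen `candidate`/`spare` nodes are illustrative, and readiness waits are elided:

```
/// Toy model: one shard's placement, identified by plain node ids.
#[derive(Debug)]
struct Shard {
    attached: u64,
    secondaries: Vec<u64>,
}

fn migrate_attached(shard: &mut Shard, deleting: u64, candidate: u64, spare: u64) {
    // 2. Ensure the candidate will hold a warm secondary; if it already has
    //    one, create a replacement secondary elsewhere so redundancy is kept.
    if shard.secondaries.contains(&candidate) {
        shard.secondaries.push(spare);
    } else {
        shard.secondaries.push(candidate);
    }
    // 3. (elided) wait until all secondaries are ready and pre-warmed.
    // 4. Promote the candidate's secondary to attached; the old attached
    //    location on the deleting node is demoted to a secondary.
    shard.secondaries.retain(|&n| n != candidate);
    shard.secondaries.push(deleting);
    shard.attached = candidate;
    // 5. Remove the secondary from the node being deleted.
    shard.secondaries.retain(|&n| n != deleting);
}
```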

#### Secondary tenant shard processing

When deleting a node, handle each secondary tenant shard as follows:

1. Choose the best node to become the new secondary.
2. Create a secondary for the shard on that node.
3. Wait until the new secondary is ready.
4. Remove the secondary from the node being deleted.

This ensures all secondary shards are safely moved before deleting the node.
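
In the same toy model as above, the secondary flow is a strict subset of the attached one:

```
/// Same toy `Shard` model as above; `new_secondary` is the chosen node.
fn migrate_secondary(shard: &mut Shard, deleting: u64, new_secondary: u64) {
    // 1-2. Create a secondary on the chosen node.
    shard.secondaries.push(new_secondary);
    // 3. (elided) wait until the new secondary is ready.
    // 4. Remove the secondary from the node being deleted.
    shard.secondaries.retain(|&n| n != deleting);
}
```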

### Reliability, failure modes and corner cases

In case of a storage controller failure and subsequent restart, the system behavior depends on
the `NodeLifecycle` state:

- If `NodeLifecycle` is `Active`: No action is taken for this node.
- If `NodeLifecycle` is `Deleted`: The node will not be re-added.
- If `NodeLifecycle` is `ScheduledForDeletion`: A deletion background task will be launched for
  this node.
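
A sketch of this restart pass, reusing the `NodeLifecycle` sketch from above; `spawn_deletion_task`
is a stand-in for launching the background operation:

```
fn resume_after_restart(nodes: &[(u64, NodeLifecycle)]) {
    for &(node_id, lifecycle) in nodes {
        match lifecycle {
            NodeLifecycle::Active => {}  // healthy node: nothing to do
            NodeLifecycle::Deleted => {} // tombstone: never re-added
            NodeLifecycle::ScheduledForDeletion => spawn_deletion_task(node_id),
        }
    }
}

fn spawn_deletion_task(_node_id: u64) {
    // Stand-in: kick off the graceful deletion for this node in the background.
}
```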

In case of a pageserver node failure during deletion, the behavior depends on the `force` flag:
- If `force` is set: The node deletion will proceed regardless of the node's availability.
- If `force` is not set: The deletion will be retried a limited number of times. If the node
  remains unavailable, the deletion process will pause and automatically resume when the node
  becomes healthy again.

### Operations concurrency

The following sections describe the behavior when different types of requests arrive at the
storage controller and how they interact with ongoing operations.

#### Delete request

Handler: `PUT /control/v1/node/:node_id/delete`

1. If node lifecycle is `NodeLifecycle::ScheduledForDeletion`:
   - Return `200 OK`: there is already an ongoing deletion request for this node
2. Update & persist lifecycle to `NodeLifecycle::ScheduledForDeletion`
3. Persist current scheduling policy
4. If there is no active operation (drain/fill/delete):
   - Run deletion process for this node
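
Steps 1-4 as a sketch, combining the lifecycle and tracker types from the earlier sketches; the
status code is the RFC's, persistence calls are elided:

```
/// Idempotent delete handler sketch; returns an HTTP status code.
fn handle_delete_request(
    lifecycle: &mut NodeLifecycle,
    tracker: &OperationTracker,
    node_id: u64,
) -> u16 {
    if *lifecycle == NodeLifecycle::ScheduledForDeletion {
        return 200; // step 1: already scheduled, nothing more to do
    }
    *lifecycle = NodeLifecycle::ScheduledForDeletion; // step 2 (persisted)
    // step 3: the current scheduling policy is persisted so that a later
    // cancel request can restore it.
    if tracker.try_start(ActiveOperation::Delete(node_id)).is_ok() {
        spawn_deletion_task(node_id); // step 4: nothing else is running
    }
    200
}
```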

#### Cancel delete request

Handler: `DELETE /control/v1/node/:node_id/delete`

1. If node lifecycle is not `NodeLifecycle::ScheduledForDeletion`:
   - Return `404 Not Found`: there is no current deletion request for this node
2. If the active operation is deleting this node, cancel it
3. Update & persist lifecycle to `NodeLifecycle::Active`
4. Restore the last scheduling policy from persistence

#### Drain/fill request

1. If there are already ongoing drain/fill processes:
   - Return `409 Conflict`: queueing of drain/fill processes is not supported
2. If there is an ongoing delete process:
   - Cancel it and wait until it is cancelled
3. Run the drain/fill process
4. After the drain/fill process is cancelled or finished:
   - Try to find another candidate to delete and run the deletion process for that node

#### Drain/fill cancel request

1. If the active operation is not the related process:
   - Return `400 Bad Request`: the cancellation request does not match the active operation
2. Cancel the active operation
3. Try to find another candidate to delete and run the deletion process for that node

## Definition of Done

- [x] Fix flaky node scenario and introduce related debug handlers
- [ ] Node deletion intent is persistent - a node will eventually be deleted after a deletion
  request regardless of draining/filling requests and restarts
- [ ] Node deletion can be graceful - deletion completes only after moving all tenant shards to
  recommended locations
- [ ] Deploying does not break due to long deletions - drain/fill operations override the deletion
  process and deletion resumes after drain/fill completes
- [ ] `force` flag is implemented and provides fast, failure-tolerant node removal (e.g., when a
  pageserver node does not respond)
- [ ] Legacy delete handler code is removed from storage_controller, test_runner, and storcon_cli
@@ -68,11 +68,15 @@ pub enum LfcPrewarmState {
    /// We tried to fetch the corresponding LFC state from the endpoint storage,
    /// but received `Not Found 404`. This should normally happen only during the
    /// first endpoint start after creation with `autoprewarm: true`.
    /// This may also happen if LFC is turned off or not initialized.
    ///
    /// During the orchestrated prewarm via API, when a caller explicitly
    /// provides the LFC state key to prewarm from, it's the caller's responsibility
    /// to handle this status as an error state in this case.
    Skipped,
    /// LFC prewarm was cancelled. Some pages in the LFC cache may be prewarmed if the query
    /// started working before cancellation.
    Cancelled,
}

impl Display for LfcPrewarmState {
@@ -83,6 +87,7 @@ impl Display for LfcPrewarmState {
            LfcPrewarmState::Completed => f.write_str("Completed"),
            LfcPrewarmState::Skipped => f.write_str("Skipped"),
            LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
            LfcPrewarmState::Cancelled => f.write_str("Cancelled"),
        }
    }
}
@@ -97,6 +102,7 @@ pub enum LfcOffloadState {
    Failed {
        error: String,
    },
    Skipped,
}

#[derive(Serialize, Debug, Clone, PartialEq)]

@@ -12,8 +12,9 @@ use regex::Regex;
use remote_storage::RemotePath;
use serde::{Deserialize, Serialize};
use url::Url;
use utils::id::{TenantId, TimelineId};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize};

use crate::responses::TlsConfig;

@@ -105,8 +106,27 @@ pub struct ComputeSpec {
    // updated to fill these fields, we can make these non optional.
    pub tenant_id: Option<TenantId>,
    pub timeline_id: Option<TimelineId>,

    /// Pageserver information can be passed in three different ways:
    /// 1. Here in `pageserver_connection_info`.
    /// 2. In the `pageserver_connstring` field.
    /// 3. In `cluster.settings`.
    ///
    /// The goal is to use method 1 everywhere. But for backwards-compatibility with old
    /// versions of the control plane, `compute_ctl` will check 2 and 3 if the
    /// `pageserver_connection_info` field is missing.
    ///
    /// If both `pageserver_connection_info` and `pageserver_connstring`+`shard_stripe_size` are
    /// given, they must contain the same information.
    pub pageserver_connection_info: Option<PageserverConnectionInfo>,

    pub pageserver_connstring: Option<String>,

    /// Stripe size for pageserver sharding, in pages. This is set together with the legacy
    /// `pageserver_connstring` field. When the modern `pageserver_connection_info` field is used,
    /// the stripe size is stored in `pageserver_connection_info.stripe_size` instead.
    pub shard_stripe_size: Option<ShardStripeSize>,

    // More neon ids that we expose to the compute_ctl
    // and to postgres as neon extension GUCs.
    pub project_id: Option<String>,
@@ -139,10 +159,6 @@ pub struct ComputeSpec {

    pub pgbouncer_settings: Option<IndexMap<String, String>>,

    // Stripe size for pageserver sharding, in pages
    #[serde(default)]
    pub shard_stripe_size: Option<usize>,

    /// Local Proxy configuration used for JWT authentication
    #[serde(default)]
    pub local_proxy_config: Option<LocalProxySpec>,
@@ -217,6 +233,140 @@ pub enum ComputeFeature {
    UnknownFeature,
}

#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub struct PageserverConnectionInfo {
    /// NB: 0 for unsharded tenants, 1 for sharded tenants with 1 shard, following storage
    pub shard_count: ShardCount,

    /// INVARIANT: null if shard_count is 0, otherwise non-null and immutable
    pub stripe_size: Option<ShardStripeSize>,

    pub shards: HashMap<ShardIndex, PageserverShardInfo>,

    /// If the compute supports both protocols, this indicates which one it should use. The compute
    /// may use other available protocols too, if it doesn't support the preferred one. The URLs
    /// for the protocol specified here must be present for all shards, i.e. do not mark a protocol
    /// as preferred if it cannot actually be used with all the pageservers.
    #[serde(default)]
    pub prefer_protocol: PageserverProtocol,
}

/// Extract PageserverConnectionInfo from a comma-separated list of libpq connection strings.
///
/// This is used for backwards-compatibility, to parse the legacy
/// [ComputeSpec::pageserver_connstring] field, or the 'neon.pageserver_connstring' GUC. Nowadays,
/// the 'pageserver_connection_info' field should be used instead.
impl PageserverConnectionInfo {
    pub fn from_connstr(
        connstr: &str,
        stripe_size: Option<ShardStripeSize>,
    ) -> Result<PageserverConnectionInfo, anyhow::Error> {
        let shard_infos: Vec<_> = connstr
            .split(',')
            .map(|connstr| PageserverShardInfo {
                pageservers: vec![PageserverShardConnectionInfo {
                    id: None,
                    libpq_url: Some(connstr.to_string()),
                    grpc_url: None,
                }],
            })
            .collect();

        match shard_infos.len() {
            0 => anyhow::bail!("empty connection string"),
            1 => {
                // We assume that if there's only one connection string, it means "unsharded",
                // rather than a sharded system with just a single shard. The latter is
                // possible in principle, but we never do it.
                let shard_count = ShardCount::unsharded();
                let only_shard = shard_infos.first().unwrap().clone();
                let shards = vec![(ShardIndex::unsharded(), only_shard)];
                Ok(PageserverConnectionInfo {
                    shard_count,
                    stripe_size: None,
                    shards: shards.into_iter().collect(),
                    prefer_protocol: PageserverProtocol::Libpq,
                })
            }
            n => {
                if stripe_size.is_none() {
                    anyhow::bail!("{n} shards but no stripe_size");
                }
                let shard_count = ShardCount(n.try_into()?);
                let shards = shard_infos
                    .into_iter()
                    .enumerate()
                    .map(|(idx, shard_info)| {
                        (
                            ShardIndex {
                                shard_count,
                                shard_number: ShardNumber(
                                    idx.try_into().expect("shard number fits in u8"),
                                ),
                            },
                            shard_info,
                        )
                    })
                    .collect();
                Ok(PageserverConnectionInfo {
                    shard_count,
                    stripe_size,
                    shards,
                    prefer_protocol: PageserverProtocol::Libpq,
                })
            }
        }
    }

    /// Convenience routine to get the connection string for a shard.
    pub fn shard_url(
        &self,
        shard_number: ShardNumber,
        protocol: PageserverProtocol,
    ) -> anyhow::Result<&str> {
        let shard_index = ShardIndex {
            shard_number,
            shard_count: self.shard_count,
        };
        let shard = self.shards.get(&shard_index).ok_or_else(|| {
            anyhow::anyhow!("shard connection info missing for shard {}", shard_index)
        })?;

        // Just use the first pageserver in the list. That's good enough for this
        // convenience routine; if you need more control, like round robin policy or
        // failover support, roll your own. (As of this writing, we never have more than
        // one pageserver per shard anyway, but that will change in the future.)
        let pageserver = shard
            .pageservers
            .first()
            .ok_or(anyhow::anyhow!("must have at least one pageserver"))?;

        let result = match protocol {
            PageserverProtocol::Grpc => pageserver
                .grpc_url
                .as_ref()
                .ok_or(anyhow::anyhow!("no grpc_url for shard {shard_index}"))?,
            PageserverProtocol::Libpq => pageserver
                .libpq_url
                .as_ref()
                .ok_or(anyhow::anyhow!("no libpq_url for shard {shard_index}"))?,
        };
        Ok(result)
    }
}

#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub struct PageserverShardInfo {
    pub pageservers: Vec<PageserverShardConnectionInfo>,
}

#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub struct PageserverShardConnectionInfo {
    pub id: Option<NodeId>,
    pub libpq_url: Option<String>,
    pub grpc_url: Option<String>,
}

#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct RemoteExtSpec {
    pub public_extensions: Option<Vec<String>>,
@@ -334,6 +484,12 @@ impl ComputeMode {
    }
}

impl Display for ComputeMode {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.to_type_str())
    }
}

/// Log level for audit logging
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
pub enum ComputeAudit {
@@ -470,13 +626,15 @@ pub struct JwksSettings {
    pub jwt_audience: Option<String>,
}

/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
/// Protocol used to connect to a Pageserver.
#[derive(Clone, Copy, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
pub enum PageserverProtocol {
    /// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
    #[default]
    #[serde(rename = "libpq")]
    Libpq,
    /// A newer, gRPC-based protocol. Uses grpc:// scheme.
    #[serde(rename = "grpc")]
    Grpc,
}

@@ -9,9 +9,7 @@ regex.workspace = true
bytes.workspace = true
anyhow.workspace = true
crc32c.workspace = true
criterion.workspace = true
once_cell.workspace = true
log.workspace = true
pprof.workspace = true
thiserror.workspace = true
serde.workspace = true
@@ -21,6 +19,7 @@ tracing.workspace = true
postgres_versioninfo.workspace = true

[dev-dependencies]
criterion.workspace = true
env_logger.workspace = true
postgres.workspace = true

@@ -4,12 +4,11 @@
use crate::pg_constants;
use crate::transaction_id_precedes;
use bytes::BytesMut;
use log::*;

use super::bindings::MultiXactId;

pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
    trace!(
    tracing::trace!(
        "handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)",
        status
    );

@@ -14,7 +14,6 @@ use super::xlog_utils::*;
use crate::WAL_SEGMENT_SIZE;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use crc32c::*;
use log::*;
use std::cmp::min;
use std::num::NonZeroU32;
use utils::lsn::Lsn;
@@ -236,7 +235,7 @@ impl WalStreamDecoderHandler for WalStreamDecoder {
        // XLOG_SWITCH records are special. If we see one, we need to skip
        // to the next WAL segment.
        let next_lsn = if xlogrec.is_xlog_switch_record() {
            trace!("saw xlog switch record at {}", self.lsn);
            tracing::trace!("saw xlog switch record at {}", self.lsn);
            self.lsn + self.lsn.calc_padding(WAL_SEGMENT_SIZE as u64)
        } else {
            // Pad to an 8-byte boundary

@@ -23,8 +23,6 @@ use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
use bytes::BytesMut;
use bytes::{Buf, Bytes};

use log::*;

use serde::Serialize;
use std::ffi::{CString, OsStr};
use std::fs::File;
@@ -235,7 +233,7 @@ pub fn find_end_of_wal(
    let mut curr_lsn = start_lsn;
    let mut buf = [0u8; XLOG_BLCKSZ];
    let pg_version = MY_PGVERSION;
    debug!("find_end_of_wal PG_VERSION: {}", pg_version);
    tracing::debug!("find_end_of_wal PG_VERSION: {}", pg_version);

    let mut decoder = WalStreamDecoder::new(start_lsn, pg_version);

@@ -247,7 +245,7 @@ pub fn find_end_of_wal(
        match open_wal_segment(&seg_file_path)? {
            None => {
                // no more segments
                debug!(
                tracing::debug!(
                    "find_end_of_wal reached end at {:?}, segment {:?} doesn't exist",
                    result, seg_file_path
                );
@@ -260,7 +258,7 @@ pub fn find_end_of_wal(
            while curr_lsn.segment_number(wal_seg_size) == segno {
                let bytes_read = segment.read(&mut buf)?;
                if bytes_read == 0 {
                    debug!(
                    tracing::debug!(
                        "find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}",
                        result,
                        seg_file_path,
@@ -276,7 +274,7 @@ pub fn find_end_of_wal(
                match decoder.poll_decode() {
                    Ok(Some(record)) => result = record.0,
                    Err(e) => {
                        debug!(
                        tracing::debug!(
                            "find_end_of_wal reached end at {:?}, decode error: {:?}",
                            result, e
                        );

@@ -9,7 +9,7 @@ use postgres_protocol2::message::backend::{ErrorFields, ErrorResponseBody};
pub use self::sqlstate::*;

#[allow(clippy::unreadable_literal)]
mod sqlstate;
pub mod sqlstate;

/// The severity of a Postgres error or notice.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]

@@ -34,13 +34,16 @@ macro_rules! critical {

#[macro_export]
macro_rules! critical_timeline {
    ($tenant_shard_id:expr, $timeline_id:expr, $($arg:tt)*) => {{
    ($tenant_shard_id:expr, $timeline_id:expr, $corruption_detected:expr, $($arg:tt)*) => {{
        if cfg!(debug_assertions) {
            panic!($($arg)*);
        }
        // Increment both metrics
        $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
        $crate::logging::HADRON_CRITICAL_STORAGE_EVENT_COUNT_METRIC.inc(&$tenant_shard_id.to_string(), &$timeline_id.to_string());
        if let Some(c) = $corruption_detected.as_ref() {
            c.store(true, std::sync::atomic::Ordering::Relaxed);
        }
        let backtrace = std::backtrace::Backtrace::capture();
        tracing::error!("CRITICAL: [tenant_shard_id: {}, timeline_id: {}] {}\n{backtrace}",
            $tenant_shard_id, $timeline_id, format!($($arg)*));

@@ -32,6 +32,9 @@ pub struct PageserverFeedback {
    pub replytime: SystemTime,
    /// Used to track feedbacks from different shards. Always zero for unsharded tenants.
    pub shard_number: u32,
    /// If true, the pageserver has detected corruption and the safekeeper and postgres
    /// should stop sending WAL.
    pub corruption_detected: bool,
}

impl PageserverFeedback {
@@ -43,6 +46,7 @@ impl PageserverFeedback {
            disk_consistent_lsn: Lsn::INVALID,
            replytime: *PG_EPOCH,
            shard_number: 0,
            corruption_detected: false,
        }
    }

@@ -101,6 +105,13 @@ impl PageserverFeedback {
            buf.put_u32(self.shard_number);
        }

        if self.corruption_detected {
            nkeys += 1;
            buf.put_slice(b"corruption_detected\0");
            buf.put_i32(1);
            buf.put_u8(1);
        }

        buf[buf_ptr] = nkeys;
    }

@@ -147,6 +158,11 @@ impl PageserverFeedback {
                    assert_eq!(len, 4);
                    rf.shard_number = buf.get_u32();
                }
                b"corruption_detected" => {
                    let len = buf.get_i32();
                    assert_eq!(len, 1);
                    rf.corruption_detected = buf.get_u8() != 0;
                }
                _ => {
                    let len = buf.get_i32();
                    warn!(
@@ -206,6 +222,26 @@ mod tests {
        assert_eq!(rf, rf_parsed);
    }

    // Test that databricks-specific fields added to the PageserverFeedback message are serialized
    // and deserialized correctly, in addition to the existing fields from upstream.
    #[test]
    fn test_replication_feedback_databricks_fields() {
        let mut rf = PageserverFeedback::empty();
        rf.current_timeline_size = 12345678;
        rf.last_received_lsn = Lsn(23456789);
        rf.disk_consistent_lsn = Lsn(34567890);
        rf.remote_consistent_lsn = Lsn(45678901);
        rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
        rf.shard_number = 1;
        rf.corruption_detected = true;

        let mut data = BytesMut::new();
        rf.serialize(&mut data);

        let rf_parsed = PageserverFeedback::parse(data.freeze());
        assert_eq!(rf, rf_parsed);
    }

    #[test]
    fn test_replication_feedback_unknown_key() {
        let mut rf = PageserverFeedback::empty();

@@ -59,6 +59,10 @@ impl ShardCount {
    pub const MAX: Self = Self(u8::MAX);
    pub const MIN: Self = Self(0);

    pub fn unsharded() -> Self {
        ShardCount(0)
    }

    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
    /// legacy format for TenantShardId that excludes the shard suffix", also known
    /// as [`TenantShardId::unsharded`].

@@ -341,6 +341,34 @@ extern "C-unwind" fn log_internal(
    }
}

/* BEGIN_HADRON */
extern "C" fn reset_safekeeper_statuses_for_metrics(wp: *mut WalProposer, num_safekeepers: u32) {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
        if api.is_null() {
            return;
        }
        (*api).reset_safekeeper_statuses_for_metrics(&mut (*wp), num_safekeepers);
    }
}

extern "C" fn update_safekeeper_status_for_metrics(
    wp: *mut WalProposer,
    sk_index: u32,
    status: u8,
) {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
        if api.is_null() {
            return;
        }
        (*api).update_safekeeper_status_for_metrics(&mut (*wp), sk_index, status);
    }
}
/* END_HADRON */

#[derive(Debug, PartialEq)]
pub enum Level {
    Debug5,
@@ -414,6 +442,10 @@ pub(crate) fn create_api() -> walproposer_api {
        finish_sync_safekeepers: Some(finish_sync_safekeepers),
        process_safekeeper_feedback: Some(process_safekeeper_feedback),
        log_internal: Some(log_internal),
        /* BEGIN_HADRON */
        reset_safekeeper_statuses_for_metrics: Some(reset_safekeeper_statuses_for_metrics),
        update_safekeeper_status_for_metrics: Some(update_safekeeper_status_for_metrics),
        /* END_HADRON */
    }
}

@@ -426,6 +458,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
        remote_consistent_lsn: 0,
        replytime: 0,
        shard_number: 0,
        corruption_detected: false,
    };

    let empty_wal_rate_limiter = crate::bindings::WalRateLimiter {
@@ -450,6 +483,8 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
        replica_promote: false,
        min_ps_feedback: empty_feedback,
        wal_rate_limiter: empty_wal_rate_limiter,
        num_safekeepers: 0,
        safekeeper_status: [0; 32],
    }
}

@@ -159,6 +159,21 @@ pub trait ApiImpl {
    fn after_election(&self, _wp: &mut WalProposer) {
        todo!()
    }

    /* BEGIN_HADRON */
    fn reset_safekeeper_statuses_for_metrics(&self, _wp: &mut WalProposer, _num_safekeepers: u32) {
        // Do nothing for testing purposes.
    }

    fn update_safekeeper_status_for_metrics(
        &self,
        _wp: &mut WalProposer,
        _sk_index: u32,
        _status: u8,
    ) {
        // Do nothing for testing purposes.
    }
    /* END_HADRON */
}

#[derive(Debug)]

@@ -14,9 +14,9 @@ use utils::logging::warn_slow;

use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
use crate::retry::Retry;
use crate::split::GetPageSplitter;
use compute_api::spec::PageserverProtocol;
use pageserver_page_api as page_api;
use pageserver_page_api::GetPageSplitter;
use utils::id::{TenantId, TimelineId};
use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize};

@@ -230,16 +230,14 @@ impl PageserverClient {
    ) -> tonic::Result<page_api::GetPageResponse> {
        // Fast path: request is for a single shard.
        if let Some(shard_id) =
            GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size)
                .map_err(|err| tonic::Status::internal(err.to_string()))?
            GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size)?
        {
            return Self::get_page_with_shard(req, shards.get(shard_id)?).await;
        }

        // Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and
        // reassemble the responses.
        let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size)
            .map_err(|err| tonic::Status::internal(err.to_string()))?;
        let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size)?;

        let mut shard_requests = FuturesUnordered::new();
        for (shard_id, shard_req) in splitter.drain_requests() {
@@ -249,14 +247,10 @@ impl PageserverClient {
        }

        while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? {
            splitter
                .add_response(shard_id, shard_response)
                .map_err(|err| tonic::Status::internal(err.to_string()))?;
            splitter.add_response(shard_id, shard_response)?;
        }

        splitter
            .get_response()
            .map_err(|err| tonic::Status::internal(err.to_string()))
        Ok(splitter.collect_response()?)
    }

    /// Fetches pages on the given shard. Does not retry internally.

@@ -1,6 +1,5 @@
mod client;
mod pool;
mod retry;
mod split;

pub use client::{PageserverClient, ShardSpec};

@@ -19,7 +19,9 @@ pub mod proto {
}

mod client;
pub use client::Client;
mod model;
mod split;

pub use client::Client;
pub use model::*;
pub use split::{GetPageSplitter, SplitError};

@@ -1,20 +1,19 @@
use std::collections::HashMap;

use anyhow::anyhow;
use bytes::Bytes;

use crate::model::*;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::shard::key_to_shard_number;
use pageserver_page_api as page_api;
use utils::shard::{ShardCount, ShardIndex, ShardStripeSize};

/// Splits GetPageRequests that straddle shard boundaries and assembles the responses.
/// TODO: add tests for this.
pub struct GetPageSplitter {
    /// Split requests by shard index.
    requests: HashMap<ShardIndex, page_api::GetPageRequest>,
    requests: HashMap<ShardIndex, GetPageRequest>,
    /// The response being assembled. Preallocated with empty pages, to be filled in.
    response: page_api::GetPageResponse,
    response: GetPageResponse,
    /// Maps the offset in `request.block_numbers` and `response.pages` to the owning shard. Used
    /// to assemble the response pages in the same order as the original request.
    block_shards: Vec<ShardIndex>,
@@ -24,22 +23,22 @@ impl GetPageSplitter {
    /// Checks if the given request only touches a single shard, and returns the shard ID. This is
    /// the common case, so we check first in order to avoid unnecessary allocations and overhead.
    pub fn for_single_shard(
        req: &page_api::GetPageRequest,
        req: &GetPageRequest,
        count: ShardCount,
        stripe_size: Option<ShardStripeSize>,
    ) -> anyhow::Result<Option<ShardIndex>> {
    ) -> Result<Option<ShardIndex>, SplitError> {
        // Fast path: unsharded tenant.
        if count.is_unsharded() {
            return Ok(Some(ShardIndex::unsharded()));
        }

        let Some(stripe_size) = stripe_size else {
            return Err(anyhow!("stripe size must be given for sharded tenants"));
            return Err("stripe size must be given for sharded tenants".into());
        };

        // Find the first page's shard, for comparison.
        let Some(&first_page) = req.block_numbers.first() else {
            return Err(anyhow!("no block numbers in request"));
            return Err("no block numbers in request".into());
        };
        let key = rel_block_to_key(req.rel, first_page);
        let shard_number = key_to_shard_number(count, stripe_size, &key);
@@ -57,10 +56,10 @@ impl GetPageSplitter {

    /// Splits the given request.
    pub fn split(
        req: page_api::GetPageRequest,
        req: GetPageRequest,
        count: ShardCount,
        stripe_size: Option<ShardStripeSize>,
    ) -> anyhow::Result<Self> {
    ) -> Result<Self, SplitError> {
        // The caller should make sure we don't split requests unnecessarily.
        debug_assert!(
            Self::for_single_shard(&req, count, stripe_size)?.is_none(),
@@ -68,10 +67,10 @@ impl GetPageSplitter {
        );

        if count.is_unsharded() {
            return Err(anyhow!("unsharded tenant, no point in splitting request"));
            return Err("unsharded tenant, no point in splitting request".into());
        }
        let Some(stripe_size) = stripe_size else {
            return Err(anyhow!("stripe size must be given for sharded tenants"));
            return Err("stripe size must be given for sharded tenants".into());
        };

        // Split the requests by shard index.
@@ -84,7 +83,7 @@ impl GetPageSplitter {

            requests
                .entry(shard_id)
                .or_insert_with(|| page_api::GetPageRequest {
                .or_insert_with(|| GetPageRequest {
                    request_id: req.request_id,
                    request_class: req.request_class,
                    rel: req.rel,
@@ -98,16 +97,16 @@ impl GetPageSplitter {

        // Construct a response to be populated by shard responses. Preallocate empty page slots
        // with the expected block numbers.
        let response = page_api::GetPageResponse {
        let response = GetPageResponse {
            request_id: req.request_id,
            status_code: page_api::GetPageStatusCode::Ok,
            status_code: GetPageStatusCode::Ok,
            reason: None,
            rel: req.rel,
            pages: req
                .block_numbers
                .into_iter()
                .map(|block_number| {
                    page_api::Page {
                    Page {
                        block_number,
                        image: Bytes::new(), // empty page slot to be filled in
                    }
@@ -123,43 +122,38 @@ impl GetPageSplitter {
    }

    /// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations.
    pub fn drain_requests(
        &mut self,
    ) -> impl Iterator<Item = (ShardIndex, page_api::GetPageRequest)> {
    pub fn drain_requests(&mut self) -> impl Iterator<Item = (ShardIndex, GetPageRequest)> {
        self.requests.drain()
    }

    /// Adds a response from the given shard. The response must match the request ID and have an OK
    /// status code. A response must not already exist for the given shard ID.
    #[allow(clippy::result_large_err)]
    pub fn add_response(
        &mut self,
        shard_id: ShardIndex,
        response: page_api::GetPageResponse,
    ) -> anyhow::Result<()> {
        response: GetPageResponse,
    ) -> Result<(), SplitError> {
        // The caller should already have converted status codes into tonic::Status.
        if response.status_code != page_api::GetPageStatusCode::Ok {
            return Err(anyhow!(
        if response.status_code != GetPageStatusCode::Ok {
            return Err(SplitError(format!(
                "unexpected non-OK response for shard {shard_id}: {} {}",
                response.status_code,
                response.reason.unwrap_or_default()
            ));
            )));
        }

        if response.request_id != self.response.request_id {
            return Err(anyhow!(
            return Err(SplitError(format!(
                "response ID mismatch for shard {shard_id}: expected {}, got {}",
                self.response.request_id,
                response.request_id
            ));
                self.response.request_id, response.request_id
            )));
        }
        // Place the shard response pages into the assembled response, in request order.
@@ -171,26 +165,27 @@ impl GetPageSplitter {
            }

            let Some(slot) = self.response.pages.get_mut(i) else {
                return Err(anyhow!("no block_shards slot {i} for shard {shard_id}"));
                return Err(SplitError(format!(
                    "no block_shards slot {i} for shard {shard_id}"
                )));
            };
            let Some(page) = pages.next() else {
                return Err(anyhow!(
                return Err(SplitError(format!(
                    "missing page {} in shard {shard_id} response",
                    slot.block_number
                ));
                )));
            };
            if page.block_number != slot.block_number {
                return Err(anyhow!(
                return Err(SplitError(format!(
                    "shard {shard_id} returned wrong page at index {i}, expected {} got {}",
                    slot.block_number,
                    page.block_number
                ));
                    slot.block_number, page.block_number
                )));
            }
            if !slot.image.is_empty() {
                return Err(anyhow!(
                return Err(SplitError(format!(
                    "shard {shard_id} returned duplicate page {} at index {i}",
                    slot.block_number
                ));
                )));
            }

            *slot = page;
@@ -198,32 +193,54 @@ impl GetPageSplitter {

        // Make sure we've consumed all pages from the shard response.
        if let Some(extra_page) = pages.next() {
            return Err(anyhow!(
            return Err(SplitError(format!(
                "shard {shard_id} returned extra page: {}",
                extra_page.block_number
            ));
            )));
        }

        Ok(())
    }

    /// Fetches the final, assembled response.
    #[allow(clippy::result_large_err)]
    pub fn get_response(self) -> anyhow::Result<page_api::GetPageResponse> {
    /// Collects the final, assembled response.
    pub fn collect_response(self) -> Result<GetPageResponse, SplitError> {
        // Check that the response is complete.
        for (i, page) in self.response.pages.iter().enumerate() {
            if page.image.is_empty() {
                return Err(anyhow!(
                return Err(SplitError(format!(
                    "missing page {} for shard {}",
                    page.block_number,
                    self.block_shards
                        .get(i)
                        .map(|s| s.to_string())
                        .unwrap_or_else(|| "?".to_string())
                ));
                )));
            }
        }

        Ok(self.response)
    }
}

/// A GetPageSplitter error.
#[derive(Debug, thiserror::Error)]
#[error("{0}")]
pub struct SplitError(String);

impl From<&str> for SplitError {
    fn from(err: &str) -> Self {
        SplitError(err.to_string())
    }
}

impl From<String> for SplitError {
    fn from(err: String) -> Self {
        SplitError(err)
    }
}

impl From<SplitError> for tonic::Status {
    fn from(err: SplitError) -> Self {
        tonic::Status::internal(err.0)
    }
}
@@ -16,7 +16,8 @@ use anyhow::{Context as _, bail};
use bytes::{Buf as _, BufMut as _, BytesMut};
use chrono::Utc;
use futures::future::BoxFuture;
use futures::{FutureExt, Stream};
use futures::stream::FuturesUnordered;
use futures::{FutureExt, Stream, StreamExt as _};
use itertools::Itertools;
use jsonwebtoken::TokenData;
use once_cell::sync::OnceCell;
@@ -35,8 +36,8 @@ use pageserver_api::pagestream_api::{
};
use pageserver_api::reltag::SlruKind;
use pageserver_api::shard::TenantShardId;
use pageserver_page_api as page_api;
use pageserver_page_api::proto;
use pageserver_page_api::{self as page_api, GetPageSplitter};
use postgres_backend::{
    AuthType, PostgresBackend, PostgresBackendReader, QueryError, is_expected_io_error,
};
@@ -466,13 +467,6 @@ impl TimelineHandles {
        self.handles
            .get(timeline_id, shard_selector, &self.wrapper)
            .await
            .map_err(|e| match e {
                timeline::handle::GetError::TenantManager(e) => e,
                timeline::handle::GetError::PerTimelineStateShutDown => {
                    trace!("per-timeline state shut down");
                    GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
                }
            })
    }

    fn tenant_id(&self) -> Option<TenantId> {
@@ -488,11 +482,9 @@ pub(crate) struct TenantManagerWrapper {
    tenant_id: once_cell::sync::OnceCell<TenantId>,
}

#[derive(Debug)]
pub(crate) struct TenantManagerTypes;

impl timeline::handle::Types for TenantManagerTypes {
    type TenantManagerError = GetActiveTimelineError;
    type TenantManager = TenantManagerWrapper;
    type Timeline = TenantManagerCacheItem;
}
@@ -3432,18 +3424,6 @@ impl GrpcPageServiceHandler {
        Ok(CancellableTask { task, cancel })
    }

    /// Errors if the request is executed on a non-zero shard. Only shard 0 has a complete view of
    /// relations and their sizes, as well as SLRU segments and similar data.
    #[allow(clippy::result_large_err)]
    fn ensure_shard_zero(timeline: &Handle<TenantManagerTypes>) -> Result<(), tonic::Status> {
        match timeline.get_shard_index().shard_number.0 {
            0 => Ok(()),
            shard => Err(tonic::Status::invalid_argument(format!(
                "request must execute on shard zero (is shard {shard})",
            ))),
        }
    }

    /// Generates a PagestreamRequest header from a ReadLsn and request ID.
    fn make_hdr(
        read_lsn: page_api::ReadLsn,
@@ -3458,30 +3438,72 @@ impl GrpcPageServiceHandler {
        }
    }

    /// Acquires a timeline handle for the given request.
    /// Acquires a timeline handle for the given request. The shard index must match a local shard.
    ///
    /// TODO: during shard splits, the compute may still be sending requests to the parent shard
    /// until the entire split is committed and the compute is notified. Consider installing a
    /// temporary shard router from the parent to the children while the split is in progress.
    ///
    /// TODO: consider moving this to a middleware layer; all requests need it. Needs to manage
    /// the TimelineHandles lifecycle.
    ///
    /// TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to avoid
    /// the unnecessary overhead.
    /// NB: this will fail during shard splits, see comment on [`Self::maybe_split_get_page`].
    async fn get_request_timeline(
        &self,
        req: &tonic::Request<impl Any>,
    ) -> Result<Handle<TenantManagerTypes>, GetActiveTimelineError> {
        let ttid = *extract::<TenantTimelineId>(req);
        let TenantTimelineId {
            tenant_id,
            timeline_id,
        } = *extract::<TenantTimelineId>(req);
        let shard_index = *extract::<ShardIndex>(req);
        let shard_selector = ShardSelector::Known(shard_index);

        // TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to
        // avoid the unnecessary overhead.
        TimelineHandles::new(self.tenant_manager.clone())
            .get(ttid.tenant_id, ttid.timeline_id, shard_selector)
            .get(tenant_id, timeline_id, ShardSelector::Known(shard_index))
            .await
    }

    /// Acquires a timeline handle for the given request, which must be for shard zero. Most
    /// metadata requests are only valid on shard zero.
    ///
    /// NB: during an ongoing shard split, the compute will keep talking to the parent shard until
    /// the split is committed, but the parent shard may have been removed in the meanwhile. In that
    /// case, we reroute the request to the new child shard. See [`Self::maybe_split_get_page`].
    ///
    /// TODO: revamp the split protocol to avoid this child routing.
    async fn get_request_timeline_shard_zero(
        &self,
        req: &tonic::Request<impl Any>,
    ) -> Result<Handle<TenantManagerTypes>, tonic::Status> {
        let TenantTimelineId {
            tenant_id,
            timeline_id,
        } = *extract::<TenantTimelineId>(req);
        let shard_index = *extract::<ShardIndex>(req);

        if shard_index.shard_number.0 != 0 {
            return Err(tonic::Status::invalid_argument(format!(
                "request only valid on shard zero (requested shard {shard_index})",
            )));
        }

        // TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to
        // avoid the unnecessary overhead.
        let mut handles = TimelineHandles::new(self.tenant_manager.clone());
        match handles
            .get(tenant_id, timeline_id, ShardSelector::Known(shard_index))
            .await
        {
            Ok(timeline) => Ok(timeline),
            Err(err) => {
                // We may be in the middle of a shard split. Try to find a child shard 0.
                if let Ok(timeline) = handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await
                    && timeline.get_shard_index().shard_count > shard_index.shard_count
                {
                    return Ok(timeline);
                }
                Err(err.into())
            }
        }
    }
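The child-shard fallback above reduces to one comparison: a timeline can stand in for the requested shard only if it belongs to a newer shard generation, i.e. a larger shard count. A minimal sketch of that predicate, with simplified stand-ins for the shard types (not the actual ShardIndex definition):

// Simplified stand-ins for the pageserver's shard types.
#[derive(Clone, Copy, PartialEq, PartialOrd)]
struct ShardCount(u8);

struct ShardIndex {
    shard_number: u8,
    shard_count: ShardCount,
}

/// True if `candidate` can serve a request addressed to `requested` after a
/// split: it must come from a later split generation (larger shard count).
fn is_split_child(requested: &ShardIndex, candidate: &ShardIndex) -> bool {
    candidate.shard_count > requested.shard_count
}

fn main() {
    let requested = ShardIndex { shard_number: 0, shard_count: ShardCount(4) };
    let child = ShardIndex { shard_number: 0, shard_count: ShardCount(8) };
    assert!(is_split_child(&requested, &child));
}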
    /// Starts a SmgrOpTimer at received_at, throttles the request, and records execution start.
    /// Only errors if the timeline is shutting down.
    ///
@@ -3511,28 +3533,22 @@ impl GrpcPageServiceHandler {
    /// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
    /// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
    /// split them up in the client or server.
    #[instrument(skip_all, fields(req_id, rel, blkno, blks, req_lsn, mod_lsn))]
    #[instrument(skip_all, fields(
        req_id = %req.request_id,
        rel = %req.rel,
        blkno = %req.block_numbers[0],
        blks = %req.block_numbers.len(),
        lsn = %req.read_lsn,
    ))]
    async fn get_page(
        ctx: &RequestContext,
        timeline: &WeakHandle<TenantManagerTypes>,
        req: proto::GetPageRequest,
        timeline: Handle<TenantManagerTypes>,
        req: page_api::GetPageRequest,
        io_concurrency: IoConcurrency,
    ) -> Result<proto::GetPageResponse, tonic::Status> {
        let received_at = Instant::now();
        let timeline = timeline.upgrade()?;
        received_at: Instant,
    ) -> Result<page_api::GetPageResponse, tonic::Status> {
        let ctx = ctx.with_scope_page_service_pagestream(&timeline);

        // Validate the request, decorate the span, and convert it to a Pagestream request.
        let req = page_api::GetPageRequest::try_from(req)?;

        span_record!(
            req_id = %req.request_id,
            rel = %req.rel,
            blkno = %req.block_numbers[0],
            blks = %req.block_numbers.len(),
            lsn = %req.read_lsn,
        );

        for &blkno in &req.block_numbers {
            let shard = timeline.get_shard_identity();
            let key = rel_block_to_key(req.rel, blkno);
@@ -3620,7 +3636,89 @@ impl GrpcPageServiceHandler {
            };
        }

        Ok(resp.into())
        Ok(resp)
    }

    /// Processes a GetPage request when there is a potential shard split in progress. We have to
    /// reroute the request to any local child shards, and split batch requests that straddle
    /// multiple child shards.
    ///
    /// Parent shards are split and removed incrementally (there may be many parent shards when
    /// splitting an already-sharded tenant), but the compute is only notified once the overall
    /// split commits, which can take several minutes. In the meanwhile, the compute will be sending
    /// requests to the parent shards.
    ///
    /// TODO: add test infrastructure to provoke this situation frequently and for long periods of
    /// time, to properly exercise it.
    ///
    /// TODO: revamp the split protocol to avoid this, e.g.:
    /// * Keep the parent shard until the split commits and the compute is notified.
    /// * Notify the compute about each subsplit.
    /// * Return an error that updates the compute's shard map.
    #[instrument(skip_all)]
    #[allow(clippy::too_many_arguments)]
    async fn maybe_split_get_page(
        ctx: &RequestContext,
        handles: &mut TimelineHandles,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        parent: ShardIndex,
        req: page_api::GetPageRequest,
        io_concurrency: IoConcurrency,
        received_at: Instant,
    ) -> Result<page_api::GetPageResponse, tonic::Status> {
        // Check the first page to see if we have any child shards at all. Otherwise, the compute is
        // just talking to the wrong Pageserver. If the parent has been split, the shard now owning
        // the page must have a higher shard count.
        let timeline = handles
            .get(
                tenant_id,
                timeline_id,
                ShardSelector::Page(rel_block_to_key(req.rel, req.block_numbers[0])),
            )
            .await?;

        let shard_id = timeline.get_shard_identity();
        if shard_id.count <= parent.shard_count {
            return Err(HandleUpgradeError::ShutDown.into()); // emulate original error
        }

        // Fast path: the request fits in a single shard.
        if let Some(shard_index) =
            GetPageSplitter::for_single_shard(&req, shard_id.count, Some(shard_id.stripe_size))?
        {
            // We got the shard ID from the first page, so these must be equal.
            assert_eq!(shard_index.shard_number, shard_id.number);
            assert_eq!(shard_index.shard_count, shard_id.count);
            return Self::get_page(ctx, timeline, req, io_concurrency, received_at).await;
        }

        // The request spans multiple shards; split it and dispatch parallel requests. All pages
        // were originally in the parent shard, and during a split all children are local, so we
        // expect to find local shards for all pages.
        let mut splitter = GetPageSplitter::split(req, shard_id.count, Some(shard_id.stripe_size))?;

        let mut shard_requests = FuturesUnordered::new();
        for (shard_index, shard_req) in splitter.drain_requests() {
            let timeline = handles
                .get(tenant_id, timeline_id, ShardSelector::Known(shard_index))
                .await?;
            let future = Self::get_page(
                ctx,
                timeline,
                shard_req,
                io_concurrency.clone(),
                received_at,
            )
            .map(move |result| result.map(|resp| (shard_index, resp)));
            shard_requests.push(future);
        }

        while let Some((shard_index, shard_response)) = shard_requests.next().await.transpose()? {
            splitter.add_response(shard_index, shard_response)?;
        }

        Ok(splitter.collect_response()?)
    }
}
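For intuition, here is a rough sketch of the block-to-shard grouping that GetPageSplitter performs when a batch straddles children. The striping rule used here, (block / stripe_size) % shard_count, is an illustrative simplification and not the pageserver's actual key-to-shard mapping:

use std::collections::BTreeMap;

/// Group a batch of block numbers by the shard assumed to own them, under
/// simple round-robin striping of `stripe_size` consecutive blocks.
fn split_by_shard(blocks: &[u32], shard_count: u32, stripe_size: u32) -> BTreeMap<u32, Vec<u32>> {
    let mut per_shard: BTreeMap<u32, Vec<u32>> = BTreeMap::new();
    for &blkno in blocks {
        let shard = (blkno / stripe_size) % shard_count;
        per_shard.entry(shard).or_default().push(blkno);
    }
    per_shard
}

fn main() {
    // With 2 shards and a stripe of 4 blocks, blocks 0..=3 land on shard 0 and
    // 4..=7 on shard 1. A batch straddling the stripe boundary is split into
    // one sub-request per shard, mirroring drain_requests() above.
    let split = split_by_shard(&[2, 3, 4, 5], 2, 4);
    assert_eq!(split[&0], vec![2, 3]);
    assert_eq!(split[&1], vec![4, 5]);
}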
@@ -3649,11 +3747,10 @@ impl proto::PageService for GrpcPageServiceHandler {
        // to be the sweet spot where throughput is saturated.
        const CHUNK_SIZE: usize = 256 * 1024;

        let timeline = self.get_request_timeline(&req).await?;
        let timeline = self.get_request_timeline_shard_zero(&req).await?;
        let ctx = self.ctx.with_scope_timeline(&timeline);

        // Validate the request and decorate the span.
        Self::ensure_shard_zero(&timeline)?;
        if timeline.is_archived() == Some(true) {
            return Err(tonic::Status::failed_precondition("timeline is archived"));
        }
@@ -3769,11 +3866,10 @@ impl proto::PageService for GrpcPageServiceHandler {
        req: tonic::Request<proto::GetDbSizeRequest>,
    ) -> Result<tonic::Response<proto::GetDbSizeResponse>, tonic::Status> {
        let received_at = extract::<ReceivedAt>(&req).0;
        let timeline = self.get_request_timeline(&req).await?;
        let timeline = self.get_request_timeline_shard_zero(&req).await?;
        let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);

        // Validate the request, decorate the span, and convert it to a Pagestream request.
        Self::ensure_shard_zero(&timeline)?;
        let req: page_api::GetDbSizeRequest = req.into_inner().try_into()?;

        span_record!(db_oid=%req.db_oid, lsn=%req.read_lsn);
@@ -3802,14 +3898,29 @@ impl proto::PageService for GrpcPageServiceHandler {
        req: tonic::Request<tonic::Streaming<proto::GetPageRequest>>,
    ) -> Result<tonic::Response<Self::GetPagesStream>, tonic::Status> {
        // Extract the timeline from the request and check that it exists.
        let ttid = *extract::<TenantTimelineId>(&req);
        //
        // NB: during shard splits, the compute may still send requests to the parent shard. We'll
        // reroute requests to the child shards below, but we also detect the common cases here
        // where either the shard exists or no shards exist at all. If we have a child shard, we
        // can't acquire a weak handle because we don't know which child shard to use yet.
        let TenantTimelineId {
            tenant_id,
            timeline_id,
        } = *extract::<TenantTimelineId>(&req);
        let shard_index = *extract::<ShardIndex>(&req);
        let shard_selector = ShardSelector::Known(shard_index);

        let mut handles = TimelineHandles::new(self.tenant_manager.clone());
        handles
            .get(ttid.tenant_id, ttid.timeline_id, shard_selector)
            .await?;
        let timeline = match handles
            .get(tenant_id, timeline_id, ShardSelector::Known(shard_index))
            .await
        {
            // The timeline shard exists. Keep a weak handle to reuse for each request.
            Ok(timeline) => Some(timeline.downgrade()),
            // The shard doesn't exist, but a child shard does. We'll reroute requests later.
            Err(_) if self.tenant_manager.has_child_shard(tenant_id, shard_index) => None,
            // Failed to fetch the timeline, and no child shard exists. Error out.
            Err(err) => return Err(err.into()),
        };

        // Spawn an IoConcurrency sidecar, if enabled.
        let gate_guard = self
@@ -3826,11 +3937,9 @@ impl proto::PageService for GrpcPageServiceHandler {
        let mut reqs = req.into_inner();

        let resps = async_stream::try_stream! {
            let timeline = handles
                .get(ttid.tenant_id, ttid.timeline_id, shard_selector)
                .await?
                .downgrade();
            loop {
                // Wait for the next client request.
                //
                // NB: Tonic considers the entire stream to be an in-flight request and will wait
                // for it to complete before shutting down. React to cancellation between requests.
                let req = tokio::select! {
@@ -3843,16 +3952,44 @@ impl proto::PageService for GrpcPageServiceHandler {
                    Err(err) => Err(err),
                },
            }?;

            let received_at = Instant::now();
            let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default();
            let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())

            // Process the request, using a closure to capture errors.
            let process_request = async || {
                let req = page_api::GetPageRequest::try_from(req)?;

                // Fast path: use the pre-acquired timeline handle.
                if let Some(Ok(timeline)) = timeline.as_ref().map(|t| t.upgrade()) {
                    return Self::get_page(&ctx, timeline, req, io_concurrency.clone(), received_at)
                        .instrument(span.clone()) // propagate request span
                        .await
                }

                // The timeline handle is stale. During shard splits, the compute may still be
                // sending requests to the parent shard. Try to re-route requests to the child
                // shards, and split any batch requests that straddle multiple child shards.
                Self::maybe_split_get_page(
                    &ctx,
                    &mut handles,
                    tenant_id,
                    timeline_id,
                    shard_index,
                    req,
                    io_concurrency.clone(),
                    received_at,
                )
                .instrument(span.clone()) // propagate request span
                .await;
            yield match result {
                Ok(resp) => resp,
                // Convert per-request errors to GetPageResponses as appropriate, or terminate
                // the stream with a tonic::Status. Log the error regardless, since
                // ObservabilityLayer can't automatically log stream errors.
                .await
            };

            // Return the response. Convert per-request errors to GetPageResponses if
            // appropriate, or terminate the stream with a tonic::Status.
            yield match process_request().await {
                Ok(resp) => resp.into(),
                Err(status) => {
                    // Log the error, since ObservabilityLayer won't see stream errors.
                    // TODO: it would be nice if we could propagate the get_page() fields here.
                    span.in_scope(|| {
                        warn!("request failed with {:?}: {}", status.code(), status.message());
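The process_request pattern above — run a fallible async step per incoming item inside async_stream::try_stream! and let errors either become responses or terminate the stream — can be sketched in isolation like this (tokio and async_stream assumed as dependencies; the real code uses an async closure, an async block is used here for brevity):

use async_stream::try_stream;
use futures::{Stream, StreamExt as _, pin_mut};

/// Turn a sequence of fallible per-request steps into a response stream.
/// Each item is processed by a fallible async block; an Err ends the stream.
fn responses(reqs: Vec<u32>) -> impl Stream<Item = Result<String, String>> {
    try_stream! {
        for req in reqs {
            // Stand-in for process_request(): validate, then produce a response.
            let process_request = async {
                if req == 0 {
                    return Err("invalid request".to_string());
                }
                Ok(format!("page {req}"))
            };
            yield process_request.await?;
        }
    }
}

#[tokio::main]
async fn main() {
    let stream = responses(vec![1, 2]);
    pin_mut!(stream);
    while let Some(resp) = stream.next().await {
        println!("{resp:?}");
    }
}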
@@ -3872,11 +4009,10 @@ impl proto::PageService for GrpcPageServiceHandler {
        req: tonic::Request<proto::GetRelSizeRequest>,
    ) -> Result<tonic::Response<proto::GetRelSizeResponse>, tonic::Status> {
        let received_at = extract::<ReceivedAt>(&req).0;
        let timeline = self.get_request_timeline(&req).await?;
        let timeline = self.get_request_timeline_shard_zero(&req).await?;
        let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);

        // Validate the request, decorate the span, and convert it to a Pagestream request.
        Self::ensure_shard_zero(&timeline)?;
        let req: page_api::GetRelSizeRequest = req.into_inner().try_into()?;
        let allow_missing = req.allow_missing;

@@ -3909,11 +4045,10 @@ impl proto::PageService for GrpcPageServiceHandler {
        req: tonic::Request<proto::GetSlruSegmentRequest>,
    ) -> Result<tonic::Response<proto::GetSlruSegmentResponse>, tonic::Status> {
        let received_at = extract::<ReceivedAt>(&req).0;
        let timeline = self.get_request_timeline(&req).await?;
        let timeline = self.get_request_timeline_shard_zero(&req).await?;
        let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);

        // Validate the request, decorate the span, and convert it to a Pagestream request.
        Self::ensure_shard_zero(&timeline)?;
        let req: page_api::GetSlruSegmentRequest = req.into_inner().try_into()?;

        span_record!(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn);
@@ -3943,6 +4078,10 @@ impl proto::PageService for GrpcPageServiceHandler {
        &self,
        req: tonic::Request<proto::LeaseLsnRequest>,
    ) -> Result<tonic::Response<proto::LeaseLsnResponse>, tonic::Status> {
        // TODO: this won't work during shard splits, as the request is directed at a specific shard
        // but the parent shard is removed before the split commits and the compute is notified
        // (which can take several minutes for large tenants). That's also the case for the libpq
        // implementation, so we keep the behavior for now.
        let timeline = self.get_request_timeline(&req).await?;
        let ctx = self.ctx.with_scope_timeline(&timeline);

@@ -826,6 +826,18 @@ impl TenantManager {
        peek_slot.is_some()
    }

    /// Returns whether a local shard exists that's a child of the given tenant shard. Note that
    /// this just checks for any shard with a larger shard count, and it may not be a direct child
    /// of the given shard (their keyspace may not overlap).
    pub(crate) fn has_child_shard(&self, tenant_id: TenantId, shard_index: ShardIndex) -> bool {
        match &*self.tenants.read().unwrap() {
            TenantsMap::Initializing => false,
            TenantsMap::Open(slots) | TenantsMap::ShuttingDown(slots) => slots
                .range(TenantShardId::tenant_range(tenant_id))
                .any(|(tsid, _)| tsid.shard_count > shard_index.shard_count),
        }
    }
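A standalone sketch of the range scan behind has_child_shard, using a plain BTreeMap whose key order groups one tenant's shards together (the tuple key here is an assumption standing in for TenantShardId):

use std::collections::BTreeMap;

/// Key: (tenant_id, shard_count, shard_number), so that all shards of one
/// tenant sort into a contiguous range. Value: placeholder slot state.
type ShardKey = (u32, u8, u8);

fn has_child_shard(slots: &BTreeMap<ShardKey, ()>, tenant_id: u32, shard_count: u8) -> bool {
    // Scan only this tenant's range and look for any shard with a larger
    // shard count, i.e. one created by a later split.
    slots
        .range((tenant_id, 0, 0)..=(tenant_id, u8::MAX, u8::MAX))
        .any(|(&(_, count, _), _)| count > shard_count)
}

fn main() {
    let mut slots = BTreeMap::new();
    slots.insert((7, 8, 0), ()); // tenant 7 already split into 8 shards
    assert!(has_child_shard(&slots, 7, 4));
    assert!(!has_child_shard(&slots, 7, 8));
}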
    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
    pub(crate) async fn upsert_location(
        &self,
@@ -1522,6 +1534,13 @@ impl TenantManager {
        self.resources.deletion_queue_client.flush_advisory();

        // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
        //
        // TODO: keeping the parent as InProgress while spawning the children causes read
        // unavailability, as we can't acquire a new timeline handle for it (existing handles appear
        // to still work though, even downgraded ones). The parent should be available for reads
        // until the children are ready -- potentially until *all* subsplits across all parent
        // shards are complete and the compute has been notified. See:
        // <https://databricks.atlassian.net/browse/LKB-672>.
        drop(tenant);
        let mut parent_slot_guard =
            self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;

@@ -397,6 +397,11 @@ pub struct Timeline {
    /// If true, the last compaction failed.
    compaction_failed: AtomicBool,

    /// Begin Hadron: If true, the pageserver has likely detected data corruption in the timeline.
    /// We need to feed this information back to the Safekeeper and postgres for them to take the
    /// appropriate action.
    corruption_detected: AtomicBool,

    /// Notifies the tenant compaction loop that there is pending L0 compaction work.
    l0_compaction_trigger: Arc<Notify>,
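The new flag is a plain relaxed atomic: detection sites set it, and the safekeeper feedback path reads it on every status update. A minimal sketch of that pattern:

use std::sync::atomic::{AtomicBool, Ordering};

struct Timeline {
    corruption_detected: AtomicBool,
}

impl Timeline {
    fn report_corruption(&self) {
        // Relaxed is sufficient: the flag is advisory and carries no
        // ordering obligations with surrounding data.
        self.corruption_detected.store(true, Ordering::Relaxed);
    }

    fn status_flag(&self) -> bool {
        self.corruption_detected.load(Ordering::Relaxed)
    }
}

fn main() {
    let tl = Timeline { corruption_detected: AtomicBool::new(false) };
    tl.report_corruption();
    assert!(tl.status_flag());
}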
@@ -3310,6 +3315,7 @@ impl Timeline {

            compaction_lock: tokio::sync::Mutex::default(),
            compaction_failed: AtomicBool::default(),
            corruption_detected: AtomicBool::default(),
            l0_compaction_trigger: resources.l0_compaction_trigger,
            gc_lock: tokio::sync::Mutex::default(),

@@ -6004,6 +6010,17 @@ impl Timeline {
            )))
        });

        // Begin Hadron
        //
        fail_point!("create-image-layer-fail-simulated-corruption", |_| {
            self.corruption_detected
                .store(true, std::sync::atomic::Ordering::Relaxed);
            Err(CreateImageLayersError::Other(anyhow::anyhow!(
                "failpoint create-image-layer-fail-simulated-corruption"
            )))
        });
        // End Hadron

        let io_concurrency = IoConcurrency::spawn_from_conf(
            self.conf.get_vectored_concurrent_io,
            self.gate
@@ -7149,6 +7166,7 @@ impl Timeline {
            critical_timeline!(
                self.tenant_shard_id,
                self.timeline_id,
                Some(&self.corruption_detected),
                "walredo failure during page reconstruction: {err:?}"
            );
        }

@@ -1397,6 +1397,7 @@ impl Timeline {
            critical_timeline!(
                self.tenant_shard_id,
                self.timeline_id,
                Some(&self.corruption_detected),
                "missing key during compaction: {err:?}"
            );
        }
@@ -1441,6 +1442,7 @@ impl Timeline {
            critical_timeline!(
                self.tenant_shard_id,
                self.timeline_id,
                Some(&self.corruption_detected),
                "could not compact, repartitioning keyspace failed: {e:?}"
            );
        }

@@ -224,11 +224,11 @@ use tracing::{instrument, trace};
use utils::id::TimelineId;
use utils::shard::{ShardIndex, ShardNumber};

use crate::tenant::mgr::ShardSelector;
use crate::page_service::GetActiveTimelineError;
use crate::tenant::GetTimelineError;
use crate::tenant::mgr::{GetActiveTenantError, ShardSelector};

/// The requirement for Debug is so that #[derive(Debug)] works in some places.
pub(crate) trait Types: Sized + std::fmt::Debug {
    type TenantManagerError: Sized + std::fmt::Debug;
pub(crate) trait Types: Sized {
    type TenantManager: TenantManager<Self> + Sized;
    type Timeline: Timeline<Self> + Sized;
}
@@ -307,12 +307,11 @@ impl<T: Types> Default for PerTimelineState<T> {
/// Abstract view of [`crate::tenant::mgr`], for testability.
pub(crate) trait TenantManager<T: Types> {
    /// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
    /// Errors are returned as [`GetError::TenantManager`].
    async fn resolve(
        &self,
        timeline_id: TimelineId,
        shard_selector: ShardSelector,
    ) -> Result<T::Timeline, T::TenantManagerError>;
    ) -> Result<T::Timeline, GetActiveTimelineError>;
}

/// Abstract view of an [`Arc<Timeline>`], for testability.
@@ -322,13 +321,6 @@ pub(crate) trait Timeline<T: Types> {
    fn per_timeline_state(&self) -> &PerTimelineState<T>;
}

/// Errors returned by [`Cache::get`].
#[derive(Debug)]
pub(crate) enum GetError<T: Types> {
    TenantManager(T::TenantManagerError),
    PerTimelineStateShutDown,
}

/// Internal type used in [`Cache::get`].
enum RoutingResult<T: Types> {
    FastPath(Handle<T>),
@@ -345,7 +337,7 @@ impl<T: Types> Cache<T> {
        timeline_id: TimelineId,
        shard_selector: ShardSelector,
        tenant_manager: &T::TenantManager,
    ) -> Result<Handle<T>, GetError<T>> {
    ) -> Result<Handle<T>, GetActiveTimelineError> {
        const GET_MAX_RETRIES: usize = 10;
        const RETRY_BACKOFF: Duration = Duration::from_millis(100);
        let mut attempt = 0;
@@ -356,7 +348,11 @@ impl<T: Types> Cache<T> {
                .await
            {
                Ok(handle) => return Ok(handle),
                Err(e) => {
                Err(
                    e @ GetActiveTimelineError::Tenant(GetActiveTenantError::WaitForActiveTimeout {
                        ..
                    }),
                ) => {
                    // Retry on tenant manager error to handle tenant split more gracefully
                    if attempt < GET_MAX_RETRIES {
                        tokio::time::sleep(RETRY_BACKOFF).await;
@@ -370,6 +366,7 @@ impl<T: Types> Cache<T> {
                        return Err(e);
                    }
                }
                Err(err) => return Err(err),
            }
        }
    }
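The retry loop above retries only one error shape, a bounded number of times, with a fixed backoff. A self-contained tokio sketch of the same structure (resolve() is a stand-in that succeeds on the third call):

use std::time::Duration;

#[derive(Debug)]
#[allow(dead_code)]
enum GetError {
    WaitForActiveTimeout,
    Other(&'static str),
}

/// Pretend resolver: times out for the first couple of calls.
async fn resolve(calls_so_far: usize) -> Result<u32, GetError> {
    if calls_so_far < 2 {
        Err(GetError::WaitForActiveTimeout)
    } else {
        Ok(42)
    }
}

#[tokio::main]
async fn main() -> Result<(), GetError> {
    const GET_MAX_RETRIES: usize = 10;
    const RETRY_BACKOFF: Duration = Duration::from_millis(100);

    let mut attempt = 0;
    let value = loop {
        match resolve(attempt).await {
            Ok(v) => break v,
            // Retry only the activation-timeout case (e.g. a tenant mid-split);
            // any other error fails fast.
            Err(GetError::WaitForActiveTimeout) if attempt < GET_MAX_RETRIES => {
                attempt += 1;
                tokio::time::sleep(RETRY_BACKOFF).await;
            }
            Err(e) => return Err(e),
        }
    };
    assert_eq!(value, 42);
    Ok(())
}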
@@ -388,7 +385,7 @@ impl<T: Types> Cache<T> {
        timeline_id: TimelineId,
        shard_selector: ShardSelector,
        tenant_manager: &T::TenantManager,
    ) -> Result<Handle<T>, GetError<T>> {
    ) -> Result<Handle<T>, GetActiveTimelineError> {
        // terminates because when every iteration we remove an element from the map
        let miss: ShardSelector = loop {
            let routing_state = self.shard_routing(timeline_id, shard_selector);
@@ -468,60 +465,50 @@ impl<T: Types> Cache<T> {
        timeline_id: TimelineId,
        shard_selector: ShardSelector,
        tenant_manager: &T::TenantManager,
    ) -> Result<Handle<T>, GetError<T>> {
        match tenant_manager.resolve(timeline_id, shard_selector).await {
            Ok(timeline) => {
                let key = timeline.shard_timeline_id();
                match &shard_selector {
                    ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
                    ShardSelector::Page(_) => (), // gotta trust tenant_manager
                    ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
                }

                trace!("creating new HandleInner");
                let timeline = Arc::new(timeline);
                let handle_inner_arc =
                    Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline))));
                let handle_weak = WeakHandle {
                    inner: Arc::downgrade(&handle_inner_arc),
                };
                let handle = handle_weak
                    .upgrade()
                    .ok()
                    .expect("we just created it and it's not linked anywhere yet");
                {
                    let mut lock_guard = timeline
                        .per_timeline_state()
                        .handles
                        .lock()
                        .expect("mutex poisoned");
                    match &mut *lock_guard {
                        Some(per_timeline_state) => {
                            let replaced =
                                per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc));
                            assert!(replaced.is_none(), "some earlier code left a stale handle");
                            match self.map.entry(key) {
                                hash_map::Entry::Occupied(_o) => {
                                    // This cannot happen because
                                    // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
                                    // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
                                    // while we were waiting for the tenant manager.
                                    unreachable!()
                                }
                                hash_map::Entry::Vacant(v) => {
                                    v.insert(handle_weak);
                                }
                            }
                        }
                        None => {
                            return Err(GetError::PerTimelineStateShutDown);
                        }
                    }
                }
                Ok(handle)
            }
            Err(e) => Err(GetError::TenantManager(e)),
    ) -> Result<Handle<T>, GetActiveTimelineError> {
        let timeline = tenant_manager.resolve(timeline_id, shard_selector).await?;
        let key = timeline.shard_timeline_id();
        match &shard_selector {
            ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
            ShardSelector::Page(_) => (), // gotta trust tenant_manager
            ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
        }

        trace!("creating new HandleInner");
        let timeline = Arc::new(timeline);
        let handle_inner_arc = Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline))));
        let handle_weak = WeakHandle {
            inner: Arc::downgrade(&handle_inner_arc),
        };
        let handle = handle_weak
            .upgrade()
            .ok()
            .expect("we just created it and it's not linked anywhere yet");
        let mut lock_guard = timeline
            .per_timeline_state()
            .handles
            .lock()
            .expect("mutex poisoned");
        let Some(per_timeline_state) = &mut *lock_guard else {
            return Err(GetActiveTimelineError::Timeline(
                GetTimelineError::ShuttingDown,
            ));
        };
        let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc));
        assert!(replaced.is_none(), "some earlier code left a stale handle");
        match self.map.entry(key) {
            hash_map::Entry::Occupied(_o) => {
                // This cannot happen because
                // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
                // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
                // while we were waiting for the tenant manager.
                unreachable!()
            }
            hash_map::Entry::Vacant(v) => {
                v.insert(handle_weak);
            }
        }
        Ok(handle)
    }
}

@@ -655,7 +642,8 @@ mod tests {
    use pageserver_api::models::ShardParameters;
    use pageserver_api::reltag::RelTag;
    use pageserver_api::shard::DEFAULT_STRIPE_SIZE;
    use utils::shard::ShardCount;
    use utils::id::TenantId;
    use utils::shard::{ShardCount, TenantShardId};
    use utils::sync::gate::GateGuard;

    use super::*;
@@ -665,7 +653,6 @@ mod tests {
    #[derive(Debug)]
    struct TestTypes;
    impl Types for TestTypes {
        type TenantManagerError = anyhow::Error;
        type TenantManager = StubManager;
        type Timeline = Entered;
    }
@@ -716,40 +703,48 @@ mod tests {
            &self,
            timeline_id: TimelineId,
            shard_selector: ShardSelector,
        ) -> anyhow::Result<Entered> {
        ) -> Result<Entered, GetActiveTimelineError> {
            fn enter_gate(
                timeline: &StubTimeline,
            ) -> Result<Arc<GateGuard>, GetActiveTimelineError> {
                Ok(Arc::new(timeline.gate.enter().map_err(|_| {
                    GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
                })?))
            }

            for timeline in &self.shards {
                if timeline.id == timeline_id {
                    let enter_gate = || {
                        let gate_guard = timeline.gate.enter()?;
                        let gate_guard = Arc::new(gate_guard);
                        anyhow::Ok(gate_guard)
                    };
                    match &shard_selector {
                        ShardSelector::Zero if timeline.shard.is_shard_zero() => {
                            return Ok(Entered {
                                timeline: Arc::clone(timeline),
                                gate_guard: enter_gate()?,
                                gate_guard: enter_gate(timeline)?,
                            });
                        }
                        ShardSelector::Zero => continue,
                        ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
                            return Ok(Entered {
                                timeline: Arc::clone(timeline),
                                gate_guard: enter_gate()?,
                                gate_guard: enter_gate(timeline)?,
                            });
                        }
                        ShardSelector::Page(_) => continue,
                        ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
                            return Ok(Entered {
                                timeline: Arc::clone(timeline),
                                gate_guard: enter_gate()?,
                                gate_guard: enter_gate(timeline)?,
                            });
                        }
                        ShardSelector::Known(_) => continue,
                    }
                }
            }
            anyhow::bail!("not found")
            Err(GetActiveTimelineError::Timeline(
                GetTimelineError::NotFound {
                    tenant_id: TenantShardId::unsharded(TenantId::from([0; 16])),
                    timeline_id,
                },
            ))
        }
    }
@@ -365,6 +365,7 @@ pub(super) async fn handle_walreceiver_connection(
            critical_timeline!(
                timeline.tenant_shard_id,
                timeline.timeline_id,
                Some(&timeline.corruption_detected),
                "{msg}"
            );
            return Err(WalReceiverError::Other(anyhow!(msg)));
@@ -382,6 +383,7 @@ pub(super) async fn handle_walreceiver_connection(
            critical_timeline!(
                timeline.tenant_shard_id,
                timeline.timeline_id,
                Some(&timeline.corruption_detected),
                "{msg}"
            );
            return Err(WalReceiverError::Other(anyhow!(msg)));
@@ -455,6 +457,7 @@ pub(super) async fn handle_walreceiver_connection(
            critical_timeline!(
                timeline.tenant_shard_id,
                timeline.timeline_id,
                Some(&timeline.corruption_detected),
                "{err:?}"
            );
        }
@@ -586,6 +589,9 @@ pub(super) async fn handle_walreceiver_connection(
            remote_consistent_lsn,
            replytime: ts,
            shard_number: timeline.tenant_shard_id.shard_number.0 as u32,
            corruption_detected: timeline
                .corruption_detected
                .load(std::sync::atomic::Ordering::Relaxed),
        };

        debug!("neon_status_update {status_update:?}");

@@ -23,6 +23,7 @@

use std::backtrace::Backtrace;
use std::collections::HashMap;
use std::sync::atomic::AtomicBool;
use std::sync::{Arc, OnceLock};
use std::time::{Duration, Instant, SystemTime};

@@ -422,6 +423,8 @@ impl WalIngest {
            critical_timeline!(
                modification.tline.tenant_shard_id,
                modification.tline.timeline_id,
                // Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it.
                None::<&AtomicBool>,
                "clear_vm_bits for unknown VM relation {vm_rel}"
            );
            return Ok(());
@@ -431,6 +434,8 @@ impl WalIngest {
            critical_timeline!(
                modification.tline.tenant_shard_id,
                modification.tline.timeline_id,
                // Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it.
                None::<&AtomicBool>,
                "new_vm_blk {blknum} not in {vm_rel} of size {vm_size}"
            );
            new_vm_blk = None;
@@ -441,6 +446,8 @@ impl WalIngest {
            critical_timeline!(
                modification.tline.tenant_shard_id,
                modification.tline.timeline_id,
                // Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it.
                None::<&AtomicBool>,
                "old_vm_blk {blknum} not in {vm_rel} of size {vm_size}"
            );
            old_vm_blk = None;

@@ -49,6 +49,7 @@
#include "neon.h"
#include "neon_lwlsncache.h"
#include "neon_perf_counters.h"
#include "neon_utils.h"
#include "pagestore_client.h"
#include "communicator.h"

@@ -673,8 +674,19 @@ lfc_get_state(size_t max_entries)
    {
        if (GET_STATE(entry, j) != UNAVAILABLE)
        {
            BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j);
            n_pages += 1;
            /* Validate the buffer tag before including it */
            BufferTag test_tag = entry->key;
            test_tag.blockNum += j;

            if (BufferTagIsValid(&test_tag))
            {
                BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j);
                n_pages += 1;
            }
            else
            {
                elog(ERROR, "LFC: Skipping invalid buffer tag during cache state capture: blockNum=%u", test_tag.blockNum);
            }
        }
    }
    if (++i == n_entries)
@@ -683,7 +695,7 @@ lfc_get_state(size_t max_entries)
    Assert(i == n_entries);
    fcs->n_pages = n_pages;
    Assert(pg_popcount((char*)bitmap, ((n_entries << lfc_chunk_size_log) + 7)/8) == n_pages);
    elog(LOG, "LFC: save state of %d chunks %d pages", (int)n_entries, (int)n_pages);
    elog(LOG, "LFC: save state of %d chunks %d pages (validated)", (int)n_entries, (int)n_pages);
}

LWLockRelease(lfc_lock);
@@ -702,6 +714,7 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
    size_t n_entries;
    size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size);
    size_t fcs_size;
    uint32_t max_prefetch_pages;
    dsm_segment *seg;
    BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS];

@@ -746,6 +759,11 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
    n_entries = Min(fcs->n_chunks, lfc_prewarm_limit);
    Assert(n_entries != 0);

    max_prefetch_pages = n_entries << fcs_chunk_size_log;
    if (fcs->n_pages > max_prefetch_pages) {
        elog(ERROR, "LFC: Number of pages in file cache state (%d) is more than the limit (%d)", fcs->n_pages, max_prefetch_pages);
    }

    LWLockAcquire(lfc_lock, LW_EXCLUSIVE);

    /* Do not prewarm more entries than LFC limit */
@@ -898,6 +916,11 @@ lfc_prewarm_main(Datum main_arg)
    {
        tag = fcs->chunks[snd_idx >> fcs_chunk_size_log];
        tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1);

        if (!BufferTagIsValid(&tag)) {
            elog(ERROR, "LFC: Invalid buffer tag: %u", tag.blockNum);
        }

        if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum))
        {
            (void)communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
@@ -1832,125 +1855,46 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
    LWLockRelease(lfc_lock);
}

typedef struct
/*
 * Return metrics about the LFC.
 *
 * The return format is a palloc'd array of LfcStatsEntrys. The size
 * of the returned array is returned in *num_entries.
 */
LfcStatsEntry *
lfc_get_stats(size_t *num_entries)
{
    TupleDesc tupdesc;
} NeonGetStatsCtx;
    LfcStatsEntry *entries;
    size_t n = 0;

#define NUM_NEON_GET_STATS_COLS 2
#define MAX_ENTRIES 10
    entries = palloc(sizeof(LfcStatsEntry) * MAX_ENTRIES);

PG_FUNCTION_INFO_V1(neon_get_lfc_stats);
Datum
neon_get_lfc_stats(PG_FUNCTION_ARGS)
{
    FuncCallContext *funcctx;
    NeonGetStatsCtx *fctx;
    MemoryContext oldcontext;
    TupleDesc tupledesc;
    Datum result;
    HeapTuple tuple;
    char const *key;
    uint64 value = 0;
    Datum values[NUM_NEON_GET_STATS_COLS];
    bool nulls[NUM_NEON_GET_STATS_COLS];
    entries[n++] = (LfcStatsEntry) {"file_cache_chunk_size_pages", lfc_ctl == NULL,
                                    lfc_ctl ? lfc_blocks_per_chunk : 0 };
    entries[n++] = (LfcStatsEntry) {"file_cache_misses", lfc_ctl == NULL,
                                    lfc_ctl ? lfc_ctl->misses : 0};
    entries[n++] = (LfcStatsEntry) {"file_cache_hits", lfc_ctl == NULL,
                                    lfc_ctl ? lfc_ctl->hits : 0 };
    entries[n++] = (LfcStatsEntry) {"file_cache_used", lfc_ctl == NULL,
                                    lfc_ctl ? lfc_ctl->used : 0 };
    entries[n++] = (LfcStatsEntry) {"file_cache_writes", lfc_ctl == NULL,
                                    lfc_ctl ? lfc_ctl->writes : 0 };
    entries[n++] = (LfcStatsEntry) {"file_cache_size", lfc_ctl == NULL,
                                    lfc_ctl ? lfc_ctl->size : 0 };
    entries[n++] = (LfcStatsEntry) {"file_cache_used_pages", lfc_ctl == NULL,
                                    lfc_ctl ? lfc_ctl->used_pages : 0 };
    entries[n++] = (LfcStatsEntry) {"file_cache_evicted_pages", lfc_ctl == NULL,
                                    lfc_ctl ? lfc_ctl->evicted_pages : 0 };
    entries[n++] = (LfcStatsEntry) {"file_cache_limit", lfc_ctl == NULL,
                                    lfc_ctl ? lfc_ctl->limit : 0 };
    entries[n++] = (LfcStatsEntry) {"file_cache_chunks_pinned", lfc_ctl == NULL,
                                    lfc_ctl ? lfc_ctl->pinned : 0 };
    Assert(n <= MAX_ENTRIES);
#undef MAX_ENTRIES

    if (SRF_IS_FIRSTCALL())
    {
        funcctx = SRF_FIRSTCALL_INIT();

        /* Switch context when allocating stuff to be used in later calls */
        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

        /* Create a user function context for cross-call persistence */
        fctx = (NeonGetStatsCtx *) palloc(sizeof(NeonGetStatsCtx));

        /* Construct a tuple descriptor for the result rows. */
        tupledesc = CreateTemplateTupleDesc(NUM_NEON_GET_STATS_COLS);

        TupleDescInitEntry(tupledesc, (AttrNumber) 1, "lfc_key",
                           TEXTOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 2, "lfc_value",
                           INT8OID, -1, 0);

        fctx->tupdesc = BlessTupleDesc(tupledesc);
        funcctx->user_fctx = fctx;

        /* Return to original context when allocating transient memory */
        MemoryContextSwitchTo(oldcontext);
    }

    funcctx = SRF_PERCALL_SETUP();

    /* Get the saved state */
    fctx = (NeonGetStatsCtx *) funcctx->user_fctx;

    switch (funcctx->call_cntr)
    {
        case 0:
            key = "file_cache_misses";
            if (lfc_ctl)
                value = lfc_ctl->misses;
            break;
        case 1:
            key = "file_cache_hits";
            if (lfc_ctl)
                value = lfc_ctl->hits;
            break;
        case 2:
            key = "file_cache_used";
            if (lfc_ctl)
                value = lfc_ctl->used;
            break;
        case 3:
            key = "file_cache_writes";
            if (lfc_ctl)
                value = lfc_ctl->writes;
            break;
        case 4:
            key = "file_cache_size";
            if (lfc_ctl)
                value = lfc_ctl->size;
            break;
        case 5:
            key = "file_cache_used_pages";
            if (lfc_ctl)
                value = lfc_ctl->used_pages;
            break;
        case 6:
            key = "file_cache_evicted_pages";
            if (lfc_ctl)
                value = lfc_ctl->evicted_pages;
            break;
        case 7:
            key = "file_cache_limit";
            if (lfc_ctl)
                value = lfc_ctl->limit;
            break;
        case 8:
            key = "file_cache_chunk_size_pages";
            value = lfc_blocks_per_chunk;
            break;
        case 9:
            key = "file_cache_chunks_pinned";
            if (lfc_ctl)
                value = lfc_ctl->pinned;
            break;
        default:
            SRF_RETURN_DONE(funcctx);
    }
    values[0] = PointerGetDatum(cstring_to_text(key));
    nulls[0] = false;
    if (lfc_ctl)
    {
        nulls[1] = false;
        values[1] = Int64GetDatum(value);
    }
    else
        nulls[1] = true;

    tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
    result = HeapTupleGetDatum(tuple);
    SRF_RETURN_NEXT(funcctx, result);
    *num_entries = n;
    return entries;
}

@@ -1958,193 +1902,86 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 * Function returning data from the local file cache
 * relation node/tablespace/database/blocknum and access_counter
 */
PG_FUNCTION_INFO_V1(local_cache_pages);

/*
 * Record structure holding the to be exposed cache data.
 */
typedef struct
LocalCachePagesRec *
lfc_local_cache_pages(size_t *num_entries)
{
    uint32 pageoffs;
    Oid relfilenode;
    Oid reltablespace;
    Oid reldatabase;
    ForkNumber forknum;
    BlockNumber blocknum;
    uint16 accesscount;
} LocalCachePagesRec;
    HASH_SEQ_STATUS status;
    FileCacheEntry *entry;
    size_t n_pages;
    size_t n;
    LocalCachePagesRec *result;

/*
 * Function context for data persisting over repeated calls.
 */
typedef struct
{
    TupleDesc tupdesc;
    LocalCachePagesRec *record;
} LocalCachePagesContext;


#define NUM_LOCALCACHE_PAGES_ELEM 7

Datum
local_cache_pages(PG_FUNCTION_ARGS)
{
    FuncCallContext *funcctx;
    Datum result;
    MemoryContext oldcontext;
    LocalCachePagesContext *fctx; /* User function context. */
    TupleDesc tupledesc;
    TupleDesc expected_tupledesc;
    HeapTuple tuple;

    if (SRF_IS_FIRSTCALL())
    if (!lfc_ctl)
    {
        HASH_SEQ_STATUS status;
        FileCacheEntry *entry;
        uint32 n_pages = 0;
        *num_entries = 0;
        return NULL;
    }

        funcctx = SRF_FIRSTCALL_INIT();
    LWLockAcquire(lfc_lock, LW_SHARED);
    if (!LFC_ENABLED())
    {
        LWLockRelease(lfc_lock);
        *num_entries = 0;
        return NULL;
    }

        /* Switch context when allocating stuff to be used in later calls */
        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

        /* Create a user function context for cross-call persistence */
        fctx = (LocalCachePagesContext *) palloc(sizeof(LocalCachePagesContext));

        /*
         * To smoothly support upgrades from version 1.0 of this extension
         * transparently handle the (non-)existence of the pinning_backends
         * column. We unfortunately have to get the result type for that -
         * we can't use the result type determined by the function definition
         * without potentially crashing when somebody uses the old (or even
         * wrong) function definition though.
         */
        if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
            neon_log(ERROR, "return type must be a row type");

        if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM)
            neon_log(ERROR, "incorrect number of output arguments");

        /* Construct a tuple descriptor for the result rows. */
        tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
        TupleDescInitEntry(tupledesc, (AttrNumber) 1, "pageoffs",
                           INT8OID, -1, 0);
#if PG_MAJORVERSION_NUM < 16
        TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
                           OIDOID, -1, 0);
#else
        TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenumber",
                           OIDOID, -1, 0);
#endif
        TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
                           INT2OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
                           INT8OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 7, "accesscount",
                           INT4OID, -1, 0);

        fctx->tupdesc = BlessTupleDesc(tupledesc);

        if (lfc_ctl)
    /* Count the pages first */
    n_pages = 0;
    hash_seq_init(&status, lfc_hash);
    while ((entry = hash_seq_search(&status)) != NULL)
    {
        /* Skip hole tags */
        if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
        {
            LWLockAcquire(lfc_lock, LW_SHARED);
            for (int i = 0; i < lfc_blocks_per_chunk; i++)
                n_pages += GET_STATE(entry, i) == AVAILABLE;
        }
    }

            if (LFC_ENABLED())
    if (n_pages == 0)
    {
        LWLockRelease(lfc_lock);
        *num_entries = 0;
        return NULL;
    }

    result = (LocalCachePagesRec *)
        MemoryContextAllocHuge(CurrentMemoryContext,
                               sizeof(LocalCachePagesRec) * n_pages);

    /*
     * Scan through all the cache entries, saving the relevant fields
     * in the result structure.
     */
    n = 0;
    hash_seq_init(&status, lfc_hash);
    while ((entry = hash_seq_search(&status)) != NULL)
    {
        for (int i = 0; i < lfc_blocks_per_chunk; i++)
        {
            if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
            {
                hash_seq_init(&status, lfc_hash);
                while ((entry = hash_seq_search(&status)) != NULL)
                if (GET_STATE(entry, i) == AVAILABLE)
                {
                    /* Skip hole tags */
                    if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
                    {
                        for (int i = 0; i < lfc_blocks_per_chunk; i++)
                            n_pages += GET_STATE(entry, i) == AVAILABLE;
                    }
                    result[n].pageoffs = entry->offset * lfc_blocks_per_chunk + i;
                    result[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
                    result[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
                    result[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
                    result[n].forknum = entry->key.forkNum;
                    result[n].blocknum = entry->key.blockNum + i;
                    result[n].accesscount = entry->access_count;
                    n += 1;
                }
            }
        }
        fctx->record = (LocalCachePagesRec *)
            MemoryContextAllocHuge(CurrentMemoryContext,
                                   sizeof(LocalCachePagesRec) * n_pages);

        /* Set max calls and remember the user function context. */
        funcctx->max_calls = n_pages;
        funcctx->user_fctx = fctx;

        /* Return to original context when allocating transient memory */
        MemoryContextSwitchTo(oldcontext);

        if (n_pages != 0)
        {
            /*
             * Scan through all the cache entries, saving the relevant fields
             * in the fctx->record structure.
             */
            uint32 n = 0;

            hash_seq_init(&status, lfc_hash);
            while ((entry = hash_seq_search(&status)) != NULL)
            {
                for (int i = 0; i < lfc_blocks_per_chunk; i++)
                {
                    if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
                    {
                        if (GET_STATE(entry, i) == AVAILABLE)
                        {
                            fctx->record[n].pageoffs = entry->offset * lfc_blocks_per_chunk + i;
                            fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
                            fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
                            fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
                            fctx->record[n].forknum = entry->key.forkNum;
                            fctx->record[n].blocknum = entry->key.blockNum + i;
                            fctx->record[n].accesscount = entry->access_count;
                            n += 1;
                        }
                    }
                }
            }
            Assert(n_pages == n);
        }
        if (lfc_ctl)
            LWLockRelease(lfc_lock);
    }
    Assert(n_pages == n);
    LWLockRelease(lfc_lock);

    funcctx = SRF_PERCALL_SETUP();

    /* Get the saved state */
    fctx = funcctx->user_fctx;

    if (funcctx->call_cntr < funcctx->max_calls)
    {
        uint32 i = funcctx->call_cntr;
        Datum values[NUM_LOCALCACHE_PAGES_ELEM];
        bool nulls[NUM_LOCALCACHE_PAGES_ELEM] = {
            false, false, false, false, false, false, false
        };

        values[0] = Int64GetDatum((int64) fctx->record[i].pageoffs);
        values[1] = ObjectIdGetDatum(fctx->record[i].relfilenode);
        values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
        values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
        values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
        values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
        values[6] = Int32GetDatum(fctx->record[i].accesscount);

        /* Build and return the tuple. */
        tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
        result = HeapTupleGetDatum(tuple);

        SRF_RETURN_NEXT(funcctx, result);
    }
    else
        SRF_RETURN_DONE(funcctx);
    *num_entries = n_pages;
    return result;
}


/*
 * Internal implementation of the approximate_working_set_size_seconds()
 * function.
@@ -47,6 +47,26 @@ extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blk
extern FileCacheState* lfc_get_state(size_t max_entries);
extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers);

typedef struct LfcStatsEntry
{
    const char *metric_name;
    bool isnull;
    uint64 value;
} LfcStatsEntry;
extern LfcStatsEntry *lfc_get_stats(size_t *num_entries);

typedef struct
{
    uint32 pageoffs;
    Oid relfilenode;
    Oid reltablespace;
    Oid reldatabase;
    ForkNumber forknum;
    BlockNumber blocknum;
    uint16 accesscount;
} LocalCachePagesRec;
extern LocalCachePagesRec *lfc_local_cache_pages(size_t *num_entries);

extern int32 lfc_approximate_working_set_size_seconds(time_t duration, bool reset);

@@ -71,7 +71,7 @@ char *neon_project_id;
char *neon_branch_id;
char *neon_endpoint_id;
int32 max_cluster_size;
char *page_server_connstring;
char *pageserver_connstring;
char *neon_auth_token;

int readahead_buffer_size = 128;
@@ -1440,7 +1440,6 @@ check_neon_id(char **newval, void **extra, GucSource source)
    return **newval == '\0' || HexDecodeString(id, *newval, 16);
}

void
PagestoreShmemInit(void)
{
@@ -1454,7 +1453,7 @@ PagestoreShmemInit(void)
        pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0);
        pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
        memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
        AssignPageserverConnstring(page_server_connstring, NULL);
        AssignPageserverConnstring(pageserver_connstring, NULL);
    }
}
@@ -1473,7 +1472,7 @@ pg_init_libpagestore(void)
    DefineCustomStringVariable("neon.pageserver_connstring",
                               "connection string to the page server",
                               NULL,
                               &page_server_connstring,
                               &pageserver_connstring,
                               "",
                               PGC_SIGHUP,
                               0, /* no flags required */
@@ -1644,7 +1643,7 @@ pg_init_libpagestore(void)
    if (neon_auth_token)
        neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");

    if (page_server_connstring && page_server_connstring[0])
    if (pageserver_connstring[0])
    {
        neon_log(PageStoreTrace, "set neon_smgr hook");
        smgr_hook = smgr_neon;
pgxn/neon/neon.c
@@ -51,6 +51,7 @@ void _PG_init(void);
bool lakebase_mode = false;

static int running_xacts_overflow_policy;
static emit_log_hook_type prev_emit_log_hook;
static bool monitor_query_exec_time = false;

static ExecutorStart_hook_type prev_ExecutorStart = NULL;
@@ -81,6 +82,8 @@ uint32 WAIT_EVENT_NEON_PS_READ;
uint32 WAIT_EVENT_NEON_WAL_DL;
#endif

int databricks_test_hook = 0;

enum RunningXactsOverflowPolicies {
    OP_IGNORE,
    OP_SKIP,
@@ -445,6 +448,20 @@ ReportSearchPath(void)
static int neon_pgstat_file_size_limit;
#endif

static void DatabricksSqlErrorHookImpl(ErrorData *edata) {
    if (prev_emit_log_hook != NULL) {
        prev_emit_log_hook(edata);
    }

    if (edata->sqlerrcode == ERRCODE_DATA_CORRUPTED) {
        pg_atomic_fetch_add_u32(&databricks_metrics_shared->data_corruption_count, 1);
    } else if (edata->sqlerrcode == ERRCODE_INDEX_CORRUPTED) {
        pg_atomic_fetch_add_u32(&databricks_metrics_shared->index_corruption_count, 1);
    } else if (edata->sqlerrcode == ERRCODE_INTERNAL_ERROR) {
        pg_atomic_fetch_add_u32(&databricks_metrics_shared->internal_error_count, 1);
    }
}

void
_PG_init(void)
{
@@ -456,6 +473,11 @@ _PG_init(void)
    load_file("$libdir/neon_rmgr", false);
#endif

    if (lakebase_mode) {
        prev_emit_log_hook = emit_log_hook;
        emit_log_hook = DatabricksSqlErrorHookImpl;
    }

    /*
     * Initializing a pre-loaded Postgres extension happens in three stages:
     *
@@ -594,6 +616,19 @@ _PG_init(void)
                            0,
                            NULL, NULL, NULL);

    // A test hook used in sql regress to trigger specific behaviors
    // to test features easily.
    DefineCustomIntVariable(
                            "databricks.test_hook",
                            "The test hook used in sql regress tests only",
                            NULL,
                            &databricks_test_hook,
                            0,
                            0, INT32_MAX,
                            PGC_SUSET,
                            0,
                            NULL, NULL, NULL);

    /*
     * Important: This must happen after other parts of the extension are
     * loaded, otherwise any settings to GUCs that were set before the
@@ -625,11 +660,15 @@ _PG_init(void)
    ExecutorEnd_hook = neon_ExecutorEnd;
}

/* Various functions exposed at SQL level */

PG_FUNCTION_INFO_V1(pg_cluster_size);
PG_FUNCTION_INFO_V1(backpressure_lsns);
PG_FUNCTION_INFO_V1(backpressure_throttling_time);
PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
PG_FUNCTION_INFO_V1(approximate_working_set_size);
PG_FUNCTION_INFO_V1(neon_get_lfc_stats);
PG_FUNCTION_INFO_V1(local_cache_pages);

Datum
pg_cluster_size(PG_FUNCTION_ARGS)
@@ -704,6 +743,76 @@ approximate_working_set_size(PG_FUNCTION_ARGS)
    PG_RETURN_INT32(dc);
}

Datum
neon_get_lfc_stats(PG_FUNCTION_ARGS)
{
#define NUM_NEON_GET_STATS_COLS 2
    ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
    LfcStatsEntry *entries;
    size_t num_entries;

    InitMaterializedSRF(fcinfo, 0);

    /* lfc_get_stats() does all the heavy lifting */
    entries = lfc_get_stats(&num_entries);

    /* Convert the LfcStatsEntrys to a result set */
    for (size_t i = 0; i < num_entries; i++)
    {
        LfcStatsEntry *entry = &entries[i];
        Datum values[NUM_NEON_GET_STATS_COLS];
        bool nulls[NUM_NEON_GET_STATS_COLS];

        values[0] = CStringGetTextDatum(entry->metric_name);
        nulls[0] = false;
        values[1] = Int64GetDatum(entry->isnull ? 0 : entry->value);
        nulls[1] = entry->isnull;
        tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
    }
    PG_RETURN_VOID();

#undef NUM_NEON_GET_STATS_COLS
}

Datum
local_cache_pages(PG_FUNCTION_ARGS)
{
#define NUM_LOCALCACHE_PAGES_COLS 7
    ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
    LocalCachePagesRec *entries;
    size_t num_entries;

    InitMaterializedSRF(fcinfo, 0);

    /* lfc_local_cache_pages() does all the heavy lifting */
    entries = lfc_local_cache_pages(&num_entries);

    /* Convert the LocalCachePagesRec structs to a result set */
    for (size_t i = 0; i < num_entries; i++)
    {
        LocalCachePagesRec *entry = &entries[i];
        Datum values[NUM_LOCALCACHE_PAGES_COLS];
        bool nulls[NUM_LOCALCACHE_PAGES_COLS] = {
            false, false, false, false, false, false, false
        };

        values[0] = Int64GetDatum((int64) entry->pageoffs);
        values[1] = ObjectIdGetDatum(entry->relfilenode);
        values[2] = ObjectIdGetDatum(entry->reltablespace);
        values[3] = ObjectIdGetDatum(entry->reldatabase);
        values[4] = ObjectIdGetDatum(entry->forknum);
        values[5] = Int64GetDatum((int64) entry->blocknum);
        values[6] = Int32GetDatum(entry->accesscount);

        /* Build and return the tuple. */
        tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
    }

    PG_RETURN_VOID();

#undef NUM_LOCALCACHE_PAGES_COLS
}

/*
 * Initialization stage 2: make requests for the amount of shared memory we
 * will need.
@@ -742,6 +851,9 @@ neon_shmem_startup_hook(void)

    LfcShmemInit();
    NeonPerfCountersShmemInit();
    if (lakebase_mode) {
        DatabricksMetricsShmemInit();
    }
    PagestoreShmemInit();
    RelsizeCacheShmemInit();
    WalproposerShmemInit();
Some files were not shown because too many files have changed in this diff.