Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-18 02:42:56 +00:00)

Compare commits: relkind_ca...test-proxy (27 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | d3e1024770 |  |
|  | 2c317dad14 |  |
|  | 056056bef0 |  |
|  | e989e0da78 |  |
|  | b3c1aecd11 |  |
|  | 1dce2a9e74 |  |
|  | ca88521653 |  |
|  | 07c3cfd2a0 |  |
|  | 7cd0066212 |  |
|  | bf3a1529bf |  |
|  | 65d1be6e90 |  |
|  | 16eb8dda3d |  |
|  | bb32f1b3d0 |  |
|  | 5585c32cee |  |
|  | 0ffdc98e20 |  |
|  | 62d844e657 |  |
|  | 1bb434ab74 |  |
|  | dbde37c53a |  |
|  | 5e3cb2ab07 |  |
|  | 61f267d8f9 |  |
|  | e2411818ef |  |
|  | 58327cbba8 |  |
|  | 568927a8a0 |  |
|  | 1ed7252950 |  |
|  | 30b57334ef |  |
|  | d487ba2b9b |  |
|  | e7a1d5de94 |  |
@@ -146,7 +146,9 @@ jobs:
with:
file: build-tools/Dockerfile
context: .
provenance: false
attests: |
type=provenance,mode=max
type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
push: true
pull: true
build-args: |

12 .github/workflows/build_and_test.yml vendored
@@ -634,7 +634,9 @@ jobs:
DEBIAN_VERSION=bookworm
secrets: |
SUBZERO_ACCESS_TOKEN=${{ secrets.CI_ACCESS_TOKEN }}
provenance: false
attests: |
type=provenance,mode=max
type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
push: true
pull: true
file: Dockerfile
@@ -747,7 +749,9 @@ jobs:
PG_VERSION=${{ matrix.version.pg }}
BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }}
DEBIAN_VERSION=${{ matrix.version.debian }}
provenance: false
attests: |
type=provenance,mode=max
type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
push: true
pull: true
file: compute/compute-node.Dockerfile
@@ -766,7 +770,9 @@ jobs:
PG_VERSION=${{ matrix.version.pg }}
BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }}
DEBIAN_VERSION=${{ matrix.version.debian }}
provenance: false
attests: |
type=provenance,mode=max
type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
push: true
pull: true
file: compute/compute-node.Dockerfile

3 .github/workflows/pg-clients.yml vendored
@@ -72,9 +72,10 @@ jobs:
options: --init --user root
services:
clickhouse:
image: clickhouse/clickhouse-server:24.8
image: clickhouse/clickhouse-server:25.6
env:
CLICKHOUSE_PASSWORD: ${{ needs.generate-ch-tmppw.outputs.tmp_val }}
PGSSLCERT: /tmp/postgresql.crt
ports:
- 9000:9000
- 8123:8123

69 .github/workflows/proxy-benchmark.yml vendored
@@ -13,6 +13,12 @@ on:
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
# - cron: '0 5 * * *' # Runs at 5 UTC once a day
workflow_dispatch: # adds an ability to run this manually
inputs:
commit_hash:
type: string
description: 'The long neon repo commit hash for the system under test (proxy) to be tested.'
required: false
default: ''

defaults:
run:
@@ -33,20 +39,14 @@ jobs:
contents: write
pull-requests: write
runs-on: [ self-hosted, unit-perf-aws-arm ]
timeout-minutes: 60 # 1h timeout
container:
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
options: --init
timeout-minutes: 60 # 1h timeout
steps:
- name: Checkout proxy-bench Repo
uses: actions/checkout@v4
with:
repository: neondatabase/proxy-bench
path: proxy-bench

- name: Set up the environment which depends on $RUNNER_TEMP on nvme drive
id: set-env
shell: bash -euxo pipefail {0}
@@ -54,19 +54,64 @@ jobs:
PROXY_BENCH_PATH=$(realpath ./proxy-bench)
{
echo "PROXY_BENCH_PATH=$PROXY_BENCH_PATH"
echo "NEON_DIR=${RUNNER_TEMP}/neon"
echo "NEON_PROXY_PATH=${RUNNER_TEMP}/neon/bin/proxy"
echo "NEON_DIR=${GITHUB_WORKSPACE}/"
echo "NEON_PROXY_PATH=${GITHUB_WORKSPACE}/bin/proxy"
echo "TEST_OUTPUT=${PROXY_BENCH_PATH}/test_output"
echo "DOCKER_COMPOSE_FILE=${PROXY_BENCH_PATH}/docker-compose.yml"
echo ""
} >> "$GITHUB_ENV"

- name: Determine commit hash
id: commit_hash
shell: bash -euxo pipefail {0}
env:
INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
run: |
if [[ -z "${INPUT_COMMIT_HASH}" ]]; then
COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')
echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
else
COMMIT_HASH="${INPUT_COMMIT_HASH}"
echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
fi
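The step above picks the commit under test: an explicitly supplied hash, or otherwise the latest commit on main fetched with curl and jq from the GitHub API. Purely as an illustration (not part of the workflow), the same lookup could be written as a small Rust helper; the reqwest and serde_json crates and the feature flags named in the comment are assumptions.

```rust
use anyhow::{Context, Result};

// Illustrative only: assumes reqwest with the "blocking" and "json" features enabled.
fn latest_main_commit() -> Result<String> {
    let client = reqwest::blocking::Client::builder()
        .user_agent("proxy-bench") // the GitHub API rejects requests without a User-Agent
        .build()?;
    let body: serde_json::Value = client
        .get("https://api.github.com/repos/neondatabase/neon/commits/main")
        .send()?
        .error_for_status()?
        .json()?;
    body["sha"]
        .as_str()
        .map(str::to_owned)
        .context("response did not contain a 'sha' field")
}
```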
- name: Checkout the neon repository at given commit hash
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: ${{ steps.commit_hash.outputs.commit_hash }}

- name: Print GITHUB_WORKSPACE
run: echo "$GITHUB_WORKSPACE"

- name: List parent dir of workspace
run: ls -la /__w/neon

- name: List all env vars
run: env

- name: Checkout proxy-bench Repo
uses: actions/checkout@v4
with:
repository: neondatabase/proxy-bench
path: proxy-bench

- name: Cache poetry deps
uses: actions/cache@v4
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

- name: DEBUG List files for debugging
working-directory: ${{ env.NEON_DIR }}
run: |
pwd
ls -la

- name: Install Python deps
working-directory: ${{ env.NEON_DIR }}
shell: bash -euxo pipefail {0}
run: ./scripts/pysync
@@ -77,14 +122,14 @@ jobs:

- name: Run proxy-bench
working-directory: ${{ env.PROXY_BENCH_PATH }}
run: ./run.sh --with-grafana --bare-metal
run: ./run.sh --bare-metal

- name: Ingest Bench Results
if: always()
working-directory: ${{ env.NEON_DIR }}
run: |
mkdir -p $TEST_OUTPUT
python $NEON_DIR/scripts/proxy_bench_results_ingest.py --out $TEST_OUTPUT
python3 ./scripts/proxy_bench_results_ingest.py --out $TEST_OUTPUT

- name: Push Metrics to Proxy perf database
shell: bash -euxo pipefail {0}
@@ -109,4 +154,4 @@ jobs:
fi
if [[ -d "${PROXY_BENCH_PATH}/test_output" ]]; then
rm -rf ${PROXY_BENCH_PATH}/test_output
fi
fi

3 Cargo.lock generated
@@ -5077,8 +5077,6 @@ dependencies = [
"crc32c",
"criterion",
"env_logger",
"log",
"memoffset 0.9.0",
"once_cell",
"postgres",
"postgres_ffi_types",
@@ -5519,6 +5517,7 @@ dependencies = [
"workspace_hack",
"x509-cert",
"zerocopy 0.8.24",
"zeroize",
]

[[package]]
@@ -135,7 +135,6 @@ lock_api = "0.4.13"
md5 = "0.7.0"
measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
memoffset = "0.9"
moka = { version = "0.12", features = ["sync"] }
nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] }
# Do not update to >= 7.0.0, at least. The update will have a significant impact
@@ -234,9 +233,10 @@ uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"
rustls-native-certs = "0.8"
whoami = "1.5.1"
zerocopy = { version = "0.8", features = ["derive", "simd"] }
json-structural-diff = { version = "0.2.0" }
x509-cert = { version = "0.2.5" }
zerocopy = { version = "0.8", features = ["derive", "simd"] }
zeroize = "1.8"

## TODO replace this with tracing
env_logger = "0.11"
@@ -103,7 +103,7 @@ RUN --mount=type=secret,uid=1000,id=SUBZERO_ACCESS_TOKEN \
&& if [ -s /run/secrets/SUBZERO_ACCESS_TOKEN ]; then \
export CARGO_FEATURES="rest_broker"; \
fi \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo auditable build \
--features $CARGO_FEATURES \
--bin pg_sni_router \
--bin pageserver \
@@ -299,6 +299,7 @@ WORKDIR /home/nonroot
ENV RUSTC_VERSION=1.88.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
ARG CARGO_AUDITABLE_VERSION=0.7.0
ARG RUSTFILT_VERSION=0.2.1
ARG CARGO_HAKARI_VERSION=0.9.36
ARG CARGO_DENY_VERSION=0.18.2
@@ -314,14 +315,16 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
. "$HOME/.cargo/env" && \
cargo --version && rustup --version && \
rustup component add llvm-tools rustfmt clippy && \
cargo install rustfilt --locked --version "${RUSTFILT_VERSION}" && \
cargo install cargo-hakari --locked --version "${CARGO_HAKARI_VERSION}" && \
cargo install cargo-deny --locked --version "${CARGO_DENY_VERSION}" && \
cargo install cargo-hack --locked --version "${CARGO_HACK_VERSION}" && \
cargo install cargo-nextest --locked --version "${CARGO_NEXTEST_VERSION}" && \
cargo install cargo-chef --locked --version "${CARGO_CHEF_VERSION}" && \
cargo install diesel_cli --locked --version "${CARGO_DIESEL_CLI_VERSION}" \
--features postgres-bundled --no-default-features && \
cargo install cargo-auditable --locked --version "${CARGO_AUDITABLE_VERSION}" && \
cargo auditable install cargo-auditable --locked --version "${CARGO_AUDITABLE_VERSION}" --force && \
cargo auditable install rustfilt --version "${RUSTFILT_VERSION}" && \
cargo auditable install cargo-hakari --locked --version "${CARGO_HAKARI_VERSION}" && \
cargo auditable install cargo-deny --locked --version "${CARGO_DENY_VERSION}" && \
cargo auditable install cargo-hack --locked --version "${CARGO_HACK_VERSION}" && \
cargo auditable install cargo-nextest --locked --version "${CARGO_NEXTEST_VERSION}" && \
cargo auditable install cargo-chef --locked --version "${CARGO_CHEF_VERSION}" && \
cargo auditable install diesel_cli --locked --version "${CARGO_DIESEL_CLI_VERSION}" \
--features postgres-bundled --no-default-features && \
rm -rf /home/nonroot/.cargo/registry && \
rm -rf /home/nonroot/.cargo/git
@@ -1,5 +1,11 @@
|
||||
commit 5eb393810cf7c7bafa4e394dad2e349e2a8cb2cb
|
||||
Author: Alexey Masterov <alexey.masterov@databricks.com>
|
||||
Date: Mon Jul 28 18:11:02 2025 +0200
|
||||
|
||||
Patch for pg_repack
|
||||
|
||||
diff --git a/regress/Makefile b/regress/Makefile
|
||||
index bf6edcb..89b4c7f 100644
|
||||
index bf6edcb..110e734 100644
|
||||
--- a/regress/Makefile
|
||||
+++ b/regress/Makefile
|
||||
@@ -17,7 +17,7 @@ INTVERSION := $(shell echo $$(($$(echo $(VERSION).0 | sed 's/\([[:digit:]]\{1,\}
|
||||
@@ -7,18 +13,36 @@ index bf6edcb..89b4c7f 100644
|
||||
#
|
||||
|
||||
-REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper tablespace get_order_by trigger
|
||||
+REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger
|
||||
+REGRESS := init-extension noautovacuum repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger autovacuum
|
||||
|
||||
USE_PGXS = 1 # use pgxs if not in contrib directory
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
diff --git a/regress/expected/init-extension.out b/regress/expected/init-extension.out
|
||||
index 9f2e171..f6e4f8d 100644
|
||||
--- a/regress/expected/init-extension.out
|
||||
+++ b/regress/expected/init-extension.out
|
||||
@@ -1,3 +1,2 @@
|
||||
SET client_min_messages = warning;
|
||||
CREATE EXTENSION pg_repack;
|
||||
-RESET client_min_messages;
|
||||
diff --git a/regress/expected/autovacuum.out b/regress/expected/autovacuum.out
|
||||
new file mode 100644
|
||||
index 0000000..e7f2363
|
||||
--- /dev/null
|
||||
+++ b/regress/expected/autovacuum.out
|
||||
@@ -0,0 +1,7 @@
|
||||
+ALTER SYSTEM SET autovacuum='on';
|
||||
+SELECT pg_reload_conf();
|
||||
+ pg_reload_conf
|
||||
+----------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
diff --git a/regress/expected/noautovacuum.out b/regress/expected/noautovacuum.out
|
||||
new file mode 100644
|
||||
index 0000000..fc7978e
|
||||
--- /dev/null
|
||||
+++ b/regress/expected/noautovacuum.out
|
||||
@@ -0,0 +1,7 @@
|
||||
+ALTER SYSTEM SET autovacuum='off';
|
||||
+SELECT pg_reload_conf();
|
||||
+ pg_reload_conf
|
||||
+----------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
diff --git a/regress/expected/nosuper.out b/regress/expected/nosuper.out
|
||||
index 8d0a94e..63b68bf 100644
|
||||
--- a/regress/expected/nosuper.out
|
||||
@@ -50,14 +74,22 @@ index 8d0a94e..63b68bf 100644
|
||||
INFO: repacking table "public.tbl_cluster"
|
||||
ERROR: query failed: ERROR: current transaction is aborted, commands ignored until end of transaction block
|
||||
DETAIL: query was: RESET lock_timeout
|
||||
diff --git a/regress/sql/init-extension.sql b/regress/sql/init-extension.sql
|
||||
index 9f2e171..f6e4f8d 100644
|
||||
--- a/regress/sql/init-extension.sql
|
||||
+++ b/regress/sql/init-extension.sql
|
||||
@@ -1,3 +1,2 @@
|
||||
SET client_min_messages = warning;
|
||||
CREATE EXTENSION pg_repack;
|
||||
-RESET client_min_messages;
|
||||
diff --git a/regress/sql/autovacuum.sql b/regress/sql/autovacuum.sql
|
||||
new file mode 100644
|
||||
index 0000000..a8eda63
|
||||
--- /dev/null
|
||||
+++ b/regress/sql/autovacuum.sql
|
||||
@@ -0,0 +1,2 @@
|
||||
+ALTER SYSTEM SET autovacuum='on';
|
||||
+SELECT pg_reload_conf();
|
||||
diff --git a/regress/sql/noautovacuum.sql b/regress/sql/noautovacuum.sql
|
||||
new file mode 100644
|
||||
index 0000000..13d4836
|
||||
--- /dev/null
|
||||
+++ b/regress/sql/noautovacuum.sql
|
||||
@@ -0,0 +1,2 @@
|
||||
+ALTER SYSTEM SET autovacuum='off';
|
||||
+SELECT pg_reload_conf();
|
||||
diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql
|
||||
index 072f0fa..dbe60f8 100644
|
||||
--- a/regress/sql/nosuper.sql
|
||||
|
||||
@@ -82,6 +82,15 @@ struct Cli {
|
||||
#[arg(long, default_value_t = 3081)]
|
||||
pub internal_http_port: u16,
|
||||
|
||||
/// Backwards-compatible --http-port for Hadron deployments. Functionally the
|
||||
/// same as --external-http-port.
|
||||
#[arg(
|
||||
long,
|
||||
conflicts_with = "external_http_port",
|
||||
conflicts_with = "internal_http_port"
|
||||
)]
|
||||
pub http_port: Option<u16>,
|
||||
|
||||
#[arg(short = 'D', long, value_name = "DATADIR")]
|
||||
pub pgdata: String,
|
||||
|
||||
@@ -181,6 +190,26 @@ impl Cli {
|
||||
}
|
||||
}
|
||||
|
||||
// Hadron helpers to get compatible compute_ctl http ports from Cli. The old `--http-port`
|
||||
// arg is used and acts the same as `--external-http-port`. The internal http port is defined
|
||||
// to be http_port + 1. Hadron runs in the dblet environment which uses the host network, so
|
||||
// we need to be careful with the ports to choose.
|
||||
fn get_external_http_port(cli: &Cli) -> u16 {
|
||||
if cli.lakebase_mode {
|
||||
return cli.http_port.unwrap_or(cli.external_http_port);
|
||||
}
|
||||
cli.external_http_port
|
||||
}
|
||||
fn get_internal_http_port(cli: &Cli) -> u16 {
|
||||
if cli.lakebase_mode {
|
||||
return cli
|
||||
.http_port
|
||||
.map(|p| p + 1)
|
||||
.unwrap_or(cli.internal_http_port);
|
||||
}
|
||||
cli.internal_http_port
|
||||
}
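The comment above describes the Hadron port convention: in lakebase mode the legacy --http-port value acts as the external HTTP port and the internal HTTP port is derived as http_port + 1, with the explicit flags used otherwise. A minimal standalone sketch of just that rule (a simplified illustration, not the real Cli struct):

```rust
// Hypothetical, trimmed-down illustration of the port rules described above:
// in lakebase mode, --http-port acts as the external port and internal = http_port + 1.
fn derive_ports(lakebase_mode: bool, http_port: Option<u16>, external: u16, internal: u16) -> (u16, u16) {
    if lakebase_mode {
        (
            http_port.unwrap_or(external),
            http_port.map(|p| p + 1).unwrap_or(internal),
        )
    } else {
        (external, internal)
    }
}

fn main() {
    // --http-port=8080 in lakebase mode: external 8080, internal 8081.
    assert_eq!(derive_ports(true, Some(8080), 3080, 3081), (8080, 8081));
    // No --http-port: the explicit defaults are used unchanged.
    assert_eq!(derive_ports(true, None, 3080, 3081), (3080, 3081));
}
```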
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
@@ -205,13 +234,18 @@ fn main() -> Result<()> {
|
||||
// enable core dumping for all child processes
|
||||
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
|
||||
|
||||
installed_extensions::initialize_metrics();
|
||||
hadron_metrics::initialize_metrics();
|
||||
if cli.lakebase_mode {
|
||||
installed_extensions::initialize_metrics();
|
||||
hadron_metrics::initialize_metrics();
|
||||
}
|
||||
|
||||
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
|
||||
|
||||
let config = get_config(&cli)?;
|
||||
|
||||
let external_http_port = get_external_http_port(&cli);
|
||||
let internal_http_port = get_internal_http_port(&cli);
|
||||
|
||||
let compute_node = ComputeNode::new(
|
||||
ComputeNodeParams {
|
||||
compute_id: cli.compute_id,
|
||||
@@ -220,8 +254,8 @@ fn main() -> Result<()> {
|
||||
pgdata: cli.pgdata.clone(),
|
||||
pgbin: cli.pgbin.clone(),
|
||||
pgversion: get_pg_version_string(&cli.pgbin),
|
||||
external_http_port: cli.external_http_port,
|
||||
internal_http_port: cli.internal_http_port,
|
||||
external_http_port,
|
||||
internal_http_port,
|
||||
remote_ext_base_url: cli.remote_ext_base_url.clone(),
|
||||
resize_swap_on_bind: cli.resize_swap_on_bind,
|
||||
set_disk_quota_for_fs: cli.set_disk_quota_for_fs,
|
||||
|
||||
@@ -6,7 +6,8 @@ use compute_api::responses::{
|
||||
LfcPrewarmState, PromoteState, TlsConfig,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, GenericOption,
|
||||
PageserverConnectionInfo, PageserverProtocol, PgIdent, Role,
|
||||
};
|
||||
use futures::StreamExt;
|
||||
use futures::future::join_all;
|
||||
@@ -37,7 +38,7 @@ use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
use utils::pid_file;
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
use utils::shard::{ShardIndex, ShardNumber, ShardStripeSize};
|
||||
|
||||
use crate::configurator::launch_configurator;
|
||||
use crate::disk_quota::set_disk_quota;
|
||||
@@ -248,7 +249,7 @@ pub struct ParsedSpec {
|
||||
pub spec: ComputeSpec,
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub pageserver_connstr: String,
|
||||
pub pageserver_conninfo: PageserverConnectionInfo,
|
||||
pub safekeeper_connstrings: Vec<String>,
|
||||
pub storage_auth_token: Option<String>,
|
||||
/// k8s dns name and port
|
||||
@@ -296,25 +297,47 @@ impl ParsedSpec {
|
||||
}
|
||||
|
||||
impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
type Error = String;
|
||||
fn try_from(spec: ComputeSpec) -> Result<Self, String> {
|
||||
type Error = anyhow::Error;
|
||||
fn try_from(spec: ComputeSpec) -> Result<Self, anyhow::Error> {
|
||||
// Extract the options from the spec file that are needed to connect to
|
||||
// the storage system.
|
||||
//
|
||||
// For backwards-compatibility, the top-level fields in the spec file
|
||||
// may be empty. In that case, we need to dig them from the GUCs in the
|
||||
// cluster.settings field.
|
||||
let pageserver_connstr = spec
|
||||
.pageserver_connstring
|
||||
.clone()
|
||||
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
|
||||
.ok_or("pageserver connstr should be provided")?;
|
||||
// In compute specs generated by old control plane versions, the spec file might
|
||||
// be missing the `pageserver_connection_info` field. In that case, we need to dig
|
||||
// the pageserver connection info from the `pageserver_connstr` field instead, or
|
||||
// if that's missing too, from the GUC in the cluster.settings field.
|
||||
let mut pageserver_conninfo = spec.pageserver_connection_info.clone();
|
||||
if pageserver_conninfo.is_none() {
|
||||
if let Some(pageserver_connstr_field) = &spec.pageserver_connstring {
|
||||
pageserver_conninfo = Some(PageserverConnectionInfo::from_connstr(
|
||||
pageserver_connstr_field,
|
||||
spec.shard_stripe_size,
|
||||
)?);
|
||||
}
|
||||
}
|
||||
if pageserver_conninfo.is_none() {
|
||||
if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") {
|
||||
let stripe_size = if let Some(guc) = spec.cluster.settings.find("neon.stripe_size")
|
||||
{
|
||||
Some(ShardStripeSize(u32::from_str(&guc)?))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
pageserver_conninfo =
|
||||
Some(PageserverConnectionInfo::from_connstr(&guc, stripe_size)?);
|
||||
}
|
||||
}
|
||||
let pageserver_conninfo = pageserver_conninfo.ok_or(anyhow::anyhow!(
|
||||
"pageserver connection information should be provided"
|
||||
))?;
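The hunk above replaces the old single lookup with a three-level fallback: the new pageserver_connection_info field, then the legacy pageserver_connstring field, then the neon.pageserver_connstring GUC, erroring out only if all three are missing. A hedged sketch of the same "first source that yields a value wins" shape, using placeholder types instead of the real spec structs:

```rust
use anyhow::{anyhow, Result};

// Illustrative stand-in: Source plays the role of ComputeSpec's three possible sources.
struct Source {
    new_field: Option<String>,
    legacy_field: Option<String>,
    guc: Option<String>,
}

fn resolve(spec: &Source) -> Result<String> {
    // First source that yields a value wins; error out only if all three are empty.
    spec.new_field
        .clone()
        .or_else(|| spec.legacy_field.clone())
        .or_else(|| spec.guc.clone())
        .ok_or_else(|| anyhow!("pageserver connection information should be provided"))
}

fn main() -> Result<()> {
    let spec = Source { new_field: None, legacy_field: None, guc: Some("postgresql://ps0".into()) };
    assert_eq!(resolve(&spec)?, "postgresql://ps0");
    Ok(())
}
```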
|
||||
|
||||
// Similarly for safekeeper connection strings
|
||||
let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
|
||||
if matches!(spec.mode, ComputeMode::Primary) {
|
||||
spec.cluster
|
||||
.settings
|
||||
.find("neon.safekeepers")
|
||||
.ok_or("safekeeper connstrings should be provided")?
|
||||
.ok_or(anyhow::anyhow!("safekeeper connstrings should be provided"))?
|
||||
.split(',')
|
||||
.map(|str| str.to_string())
|
||||
.collect()
|
||||
@@ -329,22 +352,22 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
|
||||
tenant_id
|
||||
} else {
|
||||
spec.cluster
|
||||
let guc = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("neon.tenant_id")
|
||||
.ok_or("tenant id should be provided")
|
||||
.map(|s| TenantId::from_str(&s))?
|
||||
.or(Err("invalid tenant id"))?
|
||||
.ok_or(anyhow::anyhow!("tenant id should be provided"))?;
|
||||
TenantId::from_str(&guc).context("invalid tenant id")?
|
||||
};
|
||||
let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id {
|
||||
timeline_id
|
||||
} else {
|
||||
spec.cluster
|
||||
let guc = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("neon.timeline_id")
|
||||
.ok_or("timeline id should be provided")
|
||||
.map(|s| TimelineId::from_str(&s))?
|
||||
.or(Err("invalid timeline id"))?
|
||||
.ok_or(anyhow::anyhow!("timeline id should be provided"))?;
|
||||
TimelineId::from_str(&guc).context(anyhow::anyhow!("invalid timeline id"))?
|
||||
};
|
||||
|
||||
let endpoint_storage_addr: Option<String> = spec
|
||||
@@ -358,7 +381,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
|
||||
let res = ParsedSpec {
|
||||
spec,
|
||||
pageserver_connstr,
|
||||
pageserver_conninfo,
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token,
|
||||
tenant_id,
|
||||
@@ -368,7 +391,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
};
|
||||
|
||||
// Now check validity of the parsed specification
|
||||
res.validate()?;
|
||||
res.validate().map_err(anyhow::Error::msg)?;
|
||||
Ok(res)
|
||||
}
|
||||
}
|
||||
@@ -413,6 +436,66 @@ struct StartVmMonitorResult {
|
||||
vm_monitor: Option<JoinHandle<Result<()>>>,
|
||||
}
|
||||
|
||||
// BEGIN_HADRON
|
||||
/// This function creates roles that are used by Databricks.
|
||||
/// These roles do not need to be bootstrapped at PG compute provisioning time.
/// The auth method for these roles is configured in databricks_pg_hba.conf in the universe repository.
|
||||
pub(crate) fn create_databricks_roles() -> Vec<String> {
|
||||
let roles = vec![
|
||||
// Role for prometheus_stats_exporter
|
||||
Role {
|
||||
name: "databricks_monitor".to_string(),
|
||||
// This uses "local" connection and auth method for that is "trust", so no password is needed.
|
||||
encrypted_password: None,
|
||||
options: Some(vec![GenericOption {
|
||||
name: "IN ROLE pg_monitor".to_string(),
|
||||
value: None,
|
||||
vartype: "string".to_string(),
|
||||
}]),
|
||||
},
|
||||
// Role for brickstore control plane
|
||||
Role {
|
||||
name: "databricks_control_plane".to_string(),
|
||||
// Certificate user does not need password.
|
||||
encrypted_password: None,
|
||||
options: Some(vec![GenericOption {
|
||||
name: "SUPERUSER".to_string(),
|
||||
value: None,
|
||||
vartype: "string".to_string(),
|
||||
}]),
|
||||
},
|
||||
// Role for brickstore httpgateway.
|
||||
Role {
|
||||
name: "databricks_gateway".to_string(),
|
||||
// Certificate user does not need password.
|
||||
encrypted_password: None,
|
||||
options: None,
|
||||
},
|
||||
];
|
||||
|
||||
roles
|
||||
.into_iter()
|
||||
.map(|role| {
|
||||
let query = format!(
|
||||
r#"
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (
|
||||
SELECT FROM pg_catalog.pg_roles WHERE rolname = '{}')
|
||||
THEN
|
||||
CREATE ROLE {} {};
|
||||
END IF;
|
||||
END
|
||||
$$;"#,
|
||||
role.name,
|
||||
role.name.pg_quote(),
|
||||
role.to_pg_options(),
|
||||
);
|
||||
query
|
||||
})
|
||||
.collect()
|
||||
}
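create_databricks_roles only builds one idempotent DO $$ ... $$ statement per role; it does not execute anything itself. A hypothetical caller (not part of this diff) would run the returned statements in order, for example with the synchronous postgres crate:

```rust
use postgres::{Client, NoTls};

// Hypothetical caller, not part of the diff: `role_queries` would be the output
// of create_databricks_roles() above.
fn apply_role_queries(connstr: &str, role_queries: &[String]) -> anyhow::Result<()> {
    let mut client = Client::connect(connstr, NoTls)?;
    for query in role_queries {
        // Each query is an idempotent DO block ("create the role only if missing"),
        // so re-running the whole list is safe.
        client.batch_execute(query)?;
    }
    Ok(())
}
```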
|
||||
|
||||
/// Databricks-specific environment variables to be passed to the `postgres` sub-process.
|
||||
pub struct DatabricksEnvVars {
|
||||
/// The Databricks "endpoint ID" of the compute instance. Used by `postgres` to check
|
||||
@@ -421,14 +504,27 @@ pub struct DatabricksEnvVars {
|
||||
/// Hostname of the Databricks workspace URL this compute instance belongs to.
|
||||
/// Used by postgres to verify Databricks PAT tokens.
|
||||
pub workspace_host: String,
|
||||
|
||||
pub lakebase_mode: bool,
|
||||
}
|
||||
|
||||
impl DatabricksEnvVars {
|
||||
pub fn new(compute_spec: &ComputeSpec, compute_id: Option<&String>) -> Self {
|
||||
// compute_id is a string format of "{endpoint_id}/{compute_idx}"
|
||||
// endpoint_id is a uuid. We only need to pass down endpoint_id to postgres.
|
||||
// Panics if compute_id is not set or not in the expected format.
|
||||
let endpoint_id = compute_id.unwrap().split('/').next().unwrap().to_string();
|
||||
pub fn new(
|
||||
compute_spec: &ComputeSpec,
|
||||
compute_id: Option<&String>,
|
||||
instance_id: Option<String>,
|
||||
lakebase_mode: bool,
|
||||
) -> Self {
|
||||
let endpoint_id = if let Some(instance_id) = instance_id {
|
||||
// Use instance_id as endpoint_id if it is set. This code path is for PuPr model.
|
||||
instance_id
|
||||
} else {
|
||||
// Use compute_id as endpoint_id if instance_id is not set. The code path is for PrPr model.
|
||||
// compute_id is a string format of "{endpoint_id}/{compute_idx}"
|
||||
// endpoint_id is a uuid. We only need to pass down endpoint_id to postgres.
|
||||
// Panics if compute_id is not set or not in the expected format.
|
||||
compute_id.unwrap().split('/').next().unwrap().to_string()
|
||||
};
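The PrPr branch above panics (via unwrap) when compute_id is missing or not in the "{endpoint_id}/{compute_idx}" form. A hedged alternative sketch (not what the diff does) that surfaces the same requirement as an error instead of a panic:

```rust
use anyhow::{bail, Context, Result};

/// Extract the endpoint id from a compute id of the form "{endpoint_id}/{compute_idx}".
fn endpoint_id_from_compute_id(compute_id: Option<&str>) -> Result<String> {
    let compute_id = compute_id.context("compute_id must be set")?;
    match compute_id.split_once('/') {
        Some((endpoint_id, _compute_idx)) if !endpoint_id.is_empty() => Ok(endpoint_id.to_string()),
        _ => bail!("compute_id {compute_id:?} is not in the expected '{{endpoint_id}}/{{compute_idx}}' form"),
    }
}
```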
|
||||
let workspace_host = compute_spec
|
||||
.databricks_settings
|
||||
.as_ref()
|
||||
@@ -437,6 +533,7 @@ impl DatabricksEnvVars {
|
||||
Self {
|
||||
endpoint_id,
|
||||
workspace_host,
|
||||
lakebase_mode,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -446,6 +543,10 @@ impl DatabricksEnvVars {
|
||||
|
||||
/// Convert DatabricksEnvVars to a list of string pairs that can be passed as env vars. Consumes `self`.
|
||||
pub fn to_env_var_list(self) -> Vec<(String, String)> {
|
||||
if !self.lakebase_mode {
|
||||
// In neon env, we don't need to pass down the env vars to postgres.
|
||||
return vec![];
|
||||
}
|
||||
vec![
|
||||
(
|
||||
Self::DATABRICKS_ENDPOINT_ID_ENVVAR.to_string(),
|
||||
@@ -495,7 +596,11 @@ impl ComputeNode {
|
||||
let mut new_state = ComputeState::new();
|
||||
if let Some(spec) = config.spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
new_state.pspec = Some(pspec);
|
||||
if params.lakebase_mode {
|
||||
ComputeNode::set_spec(¶ms, &mut new_state, pspec);
|
||||
} else {
|
||||
new_state.pspec = Some(pspec);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ComputeNode {
|
||||
@@ -1093,7 +1198,14 @@ impl ComputeNode {
|
||||
// If it is something different then create_dir() will error out anyway.
|
||||
let pgdata = &self.params.pgdata;
|
||||
let _ok = fs::remove_dir_all(pgdata);
|
||||
fs::create_dir(pgdata)?;
|
||||
if self.params.lakebase_mode {
|
||||
// Ignore creation errors if the directory already exists (e.g. mounting it ahead of time).
|
||||
// If it is something different then PG startup will error out anyway.
|
||||
let _ok = fs::create_dir(pgdata);
|
||||
} else {
|
||||
fs::create_dir(pgdata)?;
|
||||
}
|
||||
|
||||
fs::set_permissions(pgdata, fs::Permissions::from_mode(0o700))?;
|
||||
|
||||
Ok(())
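In lakebase mode the code above deliberately ignores every create_dir error because the data directory may be pre-mounted, relying on Postgres startup to surface real problems later. If one wanted to ignore only the "already exists" case, a sketch (an alternative, not what the diff does) could match on the error kind:

```rust
use std::{fs, io};

fn ensure_pgdata_dir(pgdata: &str) -> io::Result<()> {
    match fs::create_dir(pgdata) {
        Ok(()) => Ok(()),
        // The directory being there already (e.g. pre-mounted) is fine.
        Err(e) if e.kind() == io::ErrorKind::AlreadyExists => Ok(()),
        // Anything else (permissions, missing parent, ...) is a real error.
        Err(e) => Err(e),
    }
}
```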
|
||||
@@ -1105,12 +1217,10 @@ impl ComputeNode {
|
||||
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
let started = Instant::now();
|
||||
|
||||
let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? {
|
||||
PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
|
||||
let (connected, size) = match spec.pageserver_conninfo.prefer_protocol {
|
||||
PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
|
||||
PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
|
||||
};
|
||||
|
||||
self.fix_zenith_signal_neon_signal()?;
|
||||
@@ -1148,23 +1258,20 @@ impl ComputeNode {
|
||||
/// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
|
||||
/// the connection was established, and the (compressed) size of the basebackup.
|
||||
fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
|
||||
let shard0_connstr = spec
|
||||
.pageserver_connstr
|
||||
.split(',')
|
||||
.next()
|
||||
.unwrap()
|
||||
.to_string();
|
||||
let shard_index = match spec.pageserver_connstr.split(',').count() as u8 {
|
||||
0 | 1 => ShardIndex::unsharded(),
|
||||
count => ShardIndex::new(ShardNumber(0), ShardCount(count)),
|
||||
let shard0_index = ShardIndex {
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: spec.pageserver_conninfo.shard_count,
|
||||
};
|
||||
|
||||
let shard0_url = spec
|
||||
.pageserver_conninfo
|
||||
.shard_url(ShardNumber(0), PageserverProtocol::Grpc)?
|
||||
.to_owned();
|
||||
let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
|
||||
let mut client = page_api::Client::connect(
|
||||
shard0_connstr,
|
||||
shard0_url,
|
||||
spec.tenant_id,
|
||||
spec.timeline_id,
|
||||
shard_index,
|
||||
shard0_index,
|
||||
spec.storage_auth_token.clone(),
|
||||
None, // NB: base backups use payload compression
|
||||
)
|
||||
@@ -1196,7 +1303,9 @@ impl ComputeNode {
|
||||
/// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp
|
||||
/// when the connection was established, and the (compressed) size of the basebackup.
|
||||
fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
let shard0_connstr = spec
|
||||
.pageserver_conninfo
|
||||
.shard_url(ShardNumber(0), PageserverProtocol::Libpq)?;
|
||||
let mut config = postgres::Config::from_str(shard0_connstr)?;
|
||||
|
||||
// Use the storage auth token from the config file, if given.
|
||||
@@ -1283,10 +1392,7 @@ impl ComputeNode {
|
||||
return result;
|
||||
}
|
||||
Err(ref e) if attempts < max_attempts => {
|
||||
warn!(
|
||||
"Failed to get basebackup: {} (attempt {}/{})",
|
||||
e, attempts, max_attempts
|
||||
);
|
||||
warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
|
||||
std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
|
||||
retry_period_ms *= 1.5;
|
||||
}
|
||||
@@ -1499,16 +1605,8 @@ impl ComputeNode {
|
||||
}
|
||||
};
|
||||
|
||||
info!(
|
||||
"getting basebackup@{} from pageserver {}",
|
||||
lsn, &pspec.pageserver_connstr
|
||||
);
|
||||
self.get_basebackup(compute_state, lsn).with_context(|| {
|
||||
format!(
|
||||
"failed to get basebackup@{} from pageserver {}",
|
||||
lsn, &pspec.pageserver_connstr
|
||||
)
|
||||
})?;
|
||||
self.get_basebackup(compute_state, lsn)
|
||||
.with_context(|| format!("failed to get basebackup@{lsn}"))?;
|
||||
|
||||
if let Some(settings) = databricks_settings {
|
||||
copy_tls_certificates(
|
||||
@@ -1572,7 +1670,7 @@ impl ComputeNode {
|
||||
// symlink doesn't affect anything.
|
||||
//
|
||||
// See https://github.com/neondatabase/autoscaling/issues/800
|
||||
std::fs::remove_dir(pgdata_path.join("pg_dynshmem"))?;
|
||||
std::fs::remove_dir_all(pgdata_path.join("pg_dynshmem"))?;
|
||||
symlink("/dev/shm/", pgdata_path.join("pg_dynshmem"))?;
|
||||
|
||||
match spec.mode {
|
||||
@@ -1587,6 +1685,12 @@ impl ComputeNode {
|
||||
|
||||
/// Start and stop a postgres process to warm up the VM for startup.
|
||||
pub fn prewarm_postgres_vm_memory(&self) -> Result<()> {
|
||||
if self.params.lakebase_mode {
|
||||
// We are running in Hadron mode. Disabling this prewarming step for now as it could run
|
||||
// into dblet port conflicts and also doesn't add much value with our current infra.
|
||||
info!("Skipping postgres prewarming in Hadron mode");
|
||||
return Ok(());
|
||||
}
|
||||
info!("prewarming VM memory");
|
||||
|
||||
// Create pgdata
|
||||
@@ -1648,7 +1752,12 @@ impl ComputeNode {
|
||||
let databricks_env_vars = {
|
||||
let state = self.state.lock().unwrap();
|
||||
let spec = &state.pspec.as_ref().unwrap().spec;
|
||||
DatabricksEnvVars::new(spec, Some(&self.params.compute_id))
|
||||
DatabricksEnvVars::new(
|
||||
spec,
|
||||
Some(&self.params.compute_id),
|
||||
self.params.instance_id.clone(),
|
||||
self.params.lakebase_mode,
|
||||
)
|
||||
};
|
||||
|
||||
info!(
|
||||
@@ -1820,7 +1929,15 @@ impl ComputeNode {
|
||||
/// Do initial configuration of the already started Postgres.
|
||||
#[instrument(skip_all)]
|
||||
pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
let conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config"));
|
||||
let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config"));
|
||||
|
||||
if self.params.lakebase_mode {
|
||||
// Set a 2-minute statement_timeout for the session applying config. The individual SQL statements
|
||||
// used in apply_spec_sql() should not take long (they are just creating users and installing
|
||||
// extensions). If any of them are stuck for an extended period of time it usually indicates a
|
||||
// pageserver connectivity problem and we should bail out.
|
||||
conf.options("-c statement_timeout=2min");
|
||||
}
|
||||
|
||||
let conf = Arc::new(conf);
|
||||
let spec = Arc::new(
|
||||
@@ -2138,7 +2255,17 @@ impl ComputeNode {
|
||||
pub fn check_for_core_dumps(&self) -> Result<()> {
|
||||
let core_dump_dir = match std::env::consts::OS {
|
||||
"macos" => Path::new("/cores/"),
|
||||
_ => Path::new(&self.params.pgdata),
|
||||
// BEGIN HADRON
|
||||
// NB: Read core dump files from a fixed location outside of
|
||||
// the data directory since `compute_ctl` wipes the data directory
|
||||
// across container restarts.
|
||||
_ => {
|
||||
if self.params.lakebase_mode {
|
||||
Path::new("/databricks/logs/brickstore")
|
||||
} else {
|
||||
Path::new(&self.params.pgdata)
|
||||
}
|
||||
} // END HADRON
|
||||
};
|
||||
|
||||
// Collect core dump paths if any
|
||||
@@ -2451,7 +2578,7 @@ LIMIT 100",
|
||||
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
|
||||
libs_vec = libs
|
||||
.split(&[',', '\'', ' '])
|
||||
.filter(|s| *s != "neon" && !s.is_empty())
|
||||
.filter(|s| *s != "neon" && *s != "databricks_auth" && !s.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect();
|
||||
}
|
||||
@@ -2470,7 +2597,7 @@ LIMIT 100",
|
||||
if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
|
||||
preload_libs_vec = libs
|
||||
.split(&[',', '\'', ' '])
|
||||
.filter(|s| *s != "neon" && !s.is_empty())
|
||||
.filter(|s| *s != "neon" && *s != "databricks_auth" && !s.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect();
|
||||
}
|
||||
@@ -2523,22 +2650,22 @@ LIMIT 100",
|
||||
/// The operation will time out after a specified duration.
|
||||
pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
|
||||
let state = self.state.lock().unwrap();
|
||||
let old_pageserver_connstr = state
|
||||
let old_pageserver_conninfo = state
|
||||
.pspec
|
||||
.as_ref()
|
||||
.expect("spec must be set")
|
||||
.pageserver_connstr
|
||||
.pageserver_conninfo
|
||||
.clone();
|
||||
let mut unchanged = true;
|
||||
let _ = self
|
||||
.state_changed
|
||||
.wait_timeout_while(state, duration, |s| {
|
||||
let pageserver_connstr = &s
|
||||
let pageserver_conninfo = &s
|
||||
.pspec
|
||||
.as_ref()
|
||||
.expect("spec must be set")
|
||||
.pageserver_connstr;
|
||||
unchanged = pageserver_connstr == &old_pageserver_connstr;
|
||||
.pageserver_conninfo;
|
||||
unchanged = pageserver_conninfo == &old_pageserver_conninfo;
|
||||
unchanged
|
||||
})
|
||||
.unwrap();
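wait_timeout_while_pageserver_connstr_unchanged above is built on std::sync::Condvar::wait_timeout_while: it blocks while a predicate over the locked state stays true and returns when the predicate flips or the timeout expires. A minimal standalone sketch of that pattern with generic types (not the compute-node state):

```rust
use std::sync::{Arc, Condvar, Mutex};
use std::thread;
use std::time::Duration;

fn main() {
    let shared = Arc::new((Mutex::new(String::from("old-conninfo")), Condvar::new()));

    // Another thread updates the value and notifies waiters.
    let writer = Arc::clone(&shared);
    thread::spawn(move || {
        thread::sleep(Duration::from_millis(50));
        *writer.0.lock().unwrap() = String::from("new-conninfo");
        writer.1.notify_all();
    });

    let (lock, cvar) = &*shared;
    let guard = lock.lock().unwrap();
    // Block while the value is unchanged, but give up after one second.
    let (guard, timeout) = cvar
        .wait_timeout_while(guard, Duration::from_secs(1), |v| *v == "old-conninfo")
        .unwrap();
    println!("value = {}, timed out = {}", *guard, timeout.timed_out());
}
```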
|
||||
@@ -2796,7 +2923,10 @@ mod tests {
|
||||
|
||||
match ParsedSpec::try_from(spec.clone()) {
|
||||
Ok(_p) => panic!("Failed to detect duplicate entry"),
|
||||
Err(e) => assert!(e.starts_with("duplicate entry in safekeeper_connstrings:")),
|
||||
Err(e) => assert!(
|
||||
e.to_string()
|
||||
.starts_with("duplicate entry in safekeeper_connstrings:")
|
||||
),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,6 +18,8 @@ use crate::pg_helpers::{
|
||||
};
|
||||
use crate::tls::{self, SERVER_CRT, SERVER_KEY};
|
||||
|
||||
use utils::shard::{ShardIndex, ShardNumber};
|
||||
|
||||
/// Check that `line` is inside a text file and put it there if it is not.
|
||||
/// Create file if it doesn't exist.
|
||||
pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
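Only the signature of line_in_file appears in this hunk; its body is outside the diff. Based on the doc comment alone, a minimal sketch of what such a helper might look like (an assumption, not the actual implementation; it returns Ok(true) when the line had to be added):

```rust
use std::fs::{self, OpenOptions};
use std::io::Write;
use std::path::Path;

use anyhow::Result;

/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
    // Read the current contents, treating a missing file as empty.
    let contents = match fs::read_to_string(path) {
        Ok(c) => c,
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => String::new(),
        Err(e) => return Err(e.into()),
    };
    if contents.lines().any(|l| l == line) {
        return Ok(false); // already present, nothing to do
    }
    let mut file = OpenOptions::new().create(true).append(true).open(path)?;
    writeln!(file, "{line}")?;
    Ok(true)
}
```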
|
||||
@@ -69,9 +71,75 @@ pub fn write_postgres_conf(
|
||||
}
|
||||
// Add options for connecting to storage
|
||||
writeln!(file, "# Neon storage settings")?;
|
||||
if let Some(s) = &spec.pageserver_connstring {
|
||||
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
|
||||
writeln!(file)?;
|
||||
if let Some(conninfo) = &spec.pageserver_connection_info {
|
||||
let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
|
||||
let num_shards = if conninfo.shard_count.0 == 0 {
|
||||
1 // unsharded, treat it as a single shard
|
||||
} else {
|
||||
conninfo.shard_count.0
|
||||
};
|
||||
|
||||
for shard_number in 0..num_shards {
|
||||
let shard_index = ShardIndex {
|
||||
shard_number: ShardNumber(shard_number),
|
||||
shard_count: conninfo.shard_count,
|
||||
};
|
||||
let info = conninfo.shards.get(&shard_index).ok_or_else(|| {
|
||||
anyhow::anyhow!(
|
||||
"shard {shard_index} missing from pageserver_connection_info shard map"
|
||||
)
|
||||
})?;
|
||||
|
||||
let first_pageserver = info
|
||||
.pageservers
|
||||
.first()
|
||||
.expect("must have at least one pageserver");
|
||||
|
||||
// Add the libpq URL to the array, or if the URL is missing, reset the array
|
||||
// forgetting any previous entries. All servers must have a libpq URL, or none
|
||||
// at all.
|
||||
if let Some(url) = &first_pageserver.libpq_url {
|
||||
if let Some(ref mut urls) = libpq_urls {
|
||||
urls.push(url.clone());
|
||||
}
|
||||
} else {
|
||||
libpq_urls = None
|
||||
}
|
||||
}
|
||||
if let Some(libpq_urls) = libpq_urls {
|
||||
writeln!(
|
||||
file,
|
||||
"# derived from compute spec's pageserver_conninfo field"
|
||||
)?;
|
||||
writeln!(
|
||||
file,
|
||||
"neon.pageserver_connstring={}",
|
||||
escape_conf_value(&libpq_urls.join(","))
|
||||
)?;
|
||||
} else {
|
||||
writeln!(file, "# no neon.pageserver_connstring")?;
|
||||
}
|
||||
|
||||
if let Some(stripe_size) = conninfo.stripe_size {
|
||||
writeln!(
|
||||
file,
|
||||
"# from compute spec's pageserver_conninfo.stripe_size field"
|
||||
)?;
|
||||
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
||||
}
|
||||
} else {
|
||||
if let Some(s) = &spec.pageserver_connstring {
|
||||
writeln!(file, "# from compute spec's pageserver_connstring field")?;
|
||||
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
|
||||
}
|
||||
|
||||
if let Some(stripe_size) = spec.shard_stripe_size {
|
||||
writeln!(file, "# from compute spec's shard_stripe_size field")?;
|
||||
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
||||
}
|
||||
}
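The libpq_urls accumulation above implements an "all or nothing" rule: neon.pageserver_connstring is only written if every shard's first pageserver exposes a libpq URL, otherwise the whole list is discarded. The same rule can be expressed by collecting an iterator of Options, which short-circuits to None on the first missing URL (a sketch with plain strings, not the real shard types):

```rust
fn joined_connstring(shard_libpq_urls: &[Option<&str>]) -> Option<String> {
    // collect::<Option<Vec<_>>>() yields None as soon as any element is None.
    let urls: Option<Vec<&str>> = shard_libpq_urls.iter().copied().collect();
    urls.map(|u| u.join(","))
}

fn main() {
    assert_eq!(
        joined_connstring(&[Some("postgresql://ps1"), Some("postgresql://ps2")]),
        Some("postgresql://ps1,postgresql://ps2".to_string())
    );
    // One shard without a libpq URL drops the setting entirely.
    assert_eq!(joined_connstring(&[Some("postgresql://ps1"), None]), None);
}
```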
|
||||
|
||||
if !spec.safekeeper_connstrings.is_empty() {
|
||||
let mut neon_safekeepers_value = String::new();
|
||||
tracing::info!(
|
||||
|
||||
@@ -122,8 +122,11 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
|
||||
// into the type system.
|
||||
assert_eq!(state.status, ComputeStatus::RefreshConfiguration);
|
||||
|
||||
if state.pspec.as_ref().map(|ps| ps.pageserver_connstr.clone())
|
||||
== Some(pspec.pageserver_connstr.clone())
|
||||
if state
|
||||
.pspec
|
||||
.as_ref()
|
||||
.map(|ps| ps.pageserver_conninfo.clone())
|
||||
== Some(pspec.pageserver_conninfo.clone())
|
||||
{
|
||||
info!(
|
||||
"Refresh configuration: Retrieved spec is the same as the current spec. Waiting for control plane to update the spec before attempting reconfiguration."
|
||||
|
||||
@@ -4,14 +4,13 @@ use std::thread;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use anyhow::{Result, bail};
|
||||
use compute_api::spec::{ComputeMode, PageserverProtocol};
|
||||
use itertools::Itertools as _;
|
||||
use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverProtocol};
|
||||
use pageserver_page_api as page_api;
|
||||
use postgres::{NoTls, SimpleQueryMessage};
|
||||
use tracing::{info, warn};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::shard::{ShardCount, ShardNumber, TenantShardId};
|
||||
use utils::shard::TenantShardId;
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
@@ -78,17 +77,16 @@ fn acquire_lsn_lease_with_retry(
|
||||
|
||||
loop {
|
||||
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
|
||||
let (connstrings, auth) = {
|
||||
let (conninfo, auth) = {
|
||||
let state = compute.state.lock().unwrap();
|
||||
let spec = state.pspec.as_ref().expect("spec must be set");
|
||||
(
|
||||
spec.pageserver_connstr.clone(),
|
||||
spec.pageserver_conninfo.clone(),
|
||||
spec.storage_auth_token.clone(),
|
||||
)
|
||||
};
|
||||
|
||||
let result =
|
||||
try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn);
|
||||
let result = try_acquire_lsn_lease(conninfo, auth.as_deref(), tenant_id, timeline_id, lsn);
|
||||
match result {
|
||||
Ok(Some(res)) => {
|
||||
return Ok(res);
|
||||
@@ -112,35 +110,44 @@ fn acquire_lsn_lease_with_retry(
|
||||
|
||||
/// Tries to acquire LSN leases on all Pageserver shards.
|
||||
fn try_acquire_lsn_lease(
|
||||
connstrings: &str,
|
||||
conninfo: PageserverConnectionInfo,
|
||||
auth: Option<&str>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Option<SystemTime>> {
|
||||
let connstrings = connstrings.split(',').collect_vec();
|
||||
let shard_count = connstrings.len();
|
||||
let mut leases = Vec::new();
|
||||
|
||||
for (shard_number, &connstring) in connstrings.iter().enumerate() {
|
||||
let tenant_shard_id = match shard_count {
|
||||
0 | 1 => TenantShardId::unsharded(tenant_id),
|
||||
shard_count => TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(shard_number as u8),
|
||||
shard_count: ShardCount::new(shard_count as u8),
|
||||
},
|
||||
for (shard_index, shard) in conninfo.shards.into_iter() {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: shard_index.shard_number,
|
||||
shard_count: shard_index.shard_count,
|
||||
};
|
||||
|
||||
let lease = match PageserverProtocol::from_connstring(connstring)? {
|
||||
PageserverProtocol::Libpq => {
|
||||
acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)?
|
||||
}
|
||||
PageserverProtocol::Grpc => {
|
||||
acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)?
|
||||
}
|
||||
};
|
||||
leases.push(lease);
|
||||
// XXX: If there is more than one pageserver for a shard, do we need to get a
// lease on all of them? Currently, that's what we assume, but this is hypothetical
// as of this writing, as we never pass the info for more than one pageserver per
// shard.
|
||||
for pageserver in shard.pageservers {
|
||||
let lease = match conninfo.prefer_protocol {
|
||||
PageserverProtocol::Grpc => acquire_lsn_lease_grpc(
|
||||
&pageserver.grpc_url.unwrap(),
|
||||
auth,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
)?,
|
||||
PageserverProtocol::Libpq => acquire_lsn_lease_libpq(
|
||||
&pageserver.libpq_url.unwrap(),
|
||||
auth,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
)?,
|
||||
};
|
||||
leases.push(lease);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(leases.into_iter().min().flatten())
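The Ok(leases.into_iter().min().flatten()) return above relies on the derived ordering of Option: None sorts before any Some, so a single shard without a lease makes the minimum None and the whole call reports no valid lease; otherwise the earliest expiry wins. A small demonstration of just that ordering property (u64 stands in for SystemTime):

```rust
fn main() {
    // None < Some(_) under Option's derived Ord, so one missing lease poisons the result.
    let leases: Vec<Option<u64>> = vec![Some(30), None, Some(10)];
    assert_eq!(leases.into_iter().min().flatten(), None);

    // With every lease present, the earliest expiration is returned.
    let leases: Vec<Option<u64>> = vec![Some(30), Some(10)];
    assert_eq!(leases.into_iter().min().flatten(), Some(10));
}
```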
|
||||
|
||||
@@ -13,17 +13,19 @@ use tokio_postgres::Client;
|
||||
use tokio_postgres::error::SqlState;
|
||||
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
|
||||
|
||||
use crate::compute::{ComputeNode, ComputeNodeParams, ComputeState};
|
||||
use crate::compute::{ComputeNode, ComputeNodeParams, ComputeState, create_databricks_roles};
|
||||
use crate::hadron_metrics::COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS;
|
||||
use crate::pg_helpers::{
|
||||
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
|
||||
get_existing_roles_async,
|
||||
};
|
||||
use crate::spec_apply::ApplySpecPhase::{
|
||||
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreatePgauditExtension,
|
||||
AddDatabricksGrants, AlterDatabricksRoles, CreateAndAlterDatabases, CreateAndAlterRoles,
|
||||
CreateAvailabilityCheck, CreateDatabricksMisc, CreateDatabricksRoles, CreatePgauditExtension,
|
||||
CreatePgauditlogtofileExtension, CreatePrivilegedRole, CreateSchemaNeon,
|
||||
DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
|
||||
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
|
||||
RunInEachDatabase,
|
||||
HandleDatabricksAuthExtension, HandleNeonExtension, HandleOtherExtensions,
|
||||
RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
|
||||
};
|
||||
use crate::spec_apply::PerDatabasePhase::{
|
||||
ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions,
|
||||
@@ -166,6 +168,7 @@ impl ComputeNode {
|
||||
concurrency_token.clone(),
|
||||
db,
|
||||
[DropLogicalSubscriptions].to_vec(),
|
||||
self.params.lakebase_mode,
|
||||
);
|
||||
|
||||
Ok(tokio::spawn(fut))
|
||||
@@ -186,15 +189,33 @@ impl ComputeNode {
|
||||
};
|
||||
}
|
||||
|
||||
for phase in [
|
||||
CreatePrivilegedRole,
|
||||
let phases = if self.params.lakebase_mode {
|
||||
vec![
|
||||
CreatePrivilegedRole,
|
||||
// BEGIN_HADRON
|
||||
CreateDatabricksRoles,
|
||||
AlterDatabricksRoles,
|
||||
// END_HADRON
|
||||
DropInvalidDatabases,
|
||||
RenameRoles,
|
||||
CreateAndAlterRoles,
|
||||
RenameAndDeleteDatabases,
|
||||
CreateAndAlterDatabases,
|
||||
CreateSchemaNeon,
|
||||
] {
|
||||
]
|
||||
} else {
|
||||
vec![
|
||||
CreatePrivilegedRole,
|
||||
DropInvalidDatabases,
|
||||
RenameRoles,
|
||||
CreateAndAlterRoles,
|
||||
RenameAndDeleteDatabases,
|
||||
CreateAndAlterDatabases,
|
||||
CreateSchemaNeon,
|
||||
]
|
||||
};
|
||||
|
||||
for phase in phases {
|
||||
info!("Applying phase {:?}", &phase);
|
||||
apply_operations(
|
||||
params.clone(),
|
||||
@@ -203,6 +224,7 @@ impl ComputeNode {
|
||||
jwks_roles.clone(),
|
||||
phase,
|
||||
|| async { Ok(&client) },
|
||||
self.params.lakebase_mode,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -254,6 +276,7 @@ impl ComputeNode {
|
||||
concurrency_token.clone(),
|
||||
db,
|
||||
phases,
|
||||
self.params.lakebase_mode,
|
||||
);
|
||||
|
||||
Ok(tokio::spawn(fut))
|
||||
@@ -265,12 +288,28 @@ impl ComputeNode {
|
||||
handle.await??;
|
||||
}
|
||||
|
||||
let mut phases = vec![
|
||||
let mut phases = if self.params.lakebase_mode {
|
||||
vec![
|
||||
HandleOtherExtensions,
|
||||
HandleNeonExtension, // This step depends on CreateSchemaNeon
|
||||
// BEGIN_HADRON
|
||||
HandleDatabricksAuthExtension,
|
||||
// END_HADRON
|
||||
CreateAvailabilityCheck,
|
||||
DropRoles,
|
||||
// BEGIN_HADRON
|
||||
AddDatabricksGrants,
|
||||
CreateDatabricksMisc,
|
||||
// END_HADRON
|
||||
]
|
||||
} else {
|
||||
vec![
|
||||
HandleOtherExtensions,
|
||||
HandleNeonExtension, // This step depends on CreateSchemaNeon
|
||||
CreateAvailabilityCheck,
|
||||
DropRoles,
|
||||
];
|
||||
]
|
||||
};
|
||||
|
||||
// This step depends on CreateSchemaNeon
|
||||
if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
|
||||
@@ -303,6 +342,7 @@ impl ComputeNode {
|
||||
jwks_roles.clone(),
|
||||
phase,
|
||||
|| async { Ok(&client) },
|
||||
self.params.lakebase_mode,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -328,6 +368,7 @@ impl ComputeNode {
|
||||
concurrency_token: Arc<tokio::sync::Semaphore>,
|
||||
db: DB,
|
||||
subphases: Vec<PerDatabasePhase>,
|
||||
lakebase_mode: bool,
|
||||
) -> Result<()> {
|
||||
let _permit = concurrency_token.acquire().await?;
|
||||
|
||||
@@ -355,6 +396,7 @@ impl ComputeNode {
|
||||
let client = client_conn.as_ref().unwrap();
|
||||
Ok(client)
|
||||
},
|
||||
lakebase_mode,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -477,6 +519,10 @@ pub enum PerDatabasePhase {
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum ApplySpecPhase {
|
||||
CreatePrivilegedRole,
|
||||
// BEGIN_HADRON
|
||||
CreateDatabricksRoles,
|
||||
AlterDatabricksRoles,
|
||||
// END_HADRON
|
||||
DropInvalidDatabases,
|
||||
RenameRoles,
|
||||
CreateAndAlterRoles,
|
||||
@@ -489,7 +535,14 @@ pub enum ApplySpecPhase {
|
||||
DisablePostgresDBPgAudit,
|
||||
HandleOtherExtensions,
|
||||
HandleNeonExtension,
|
||||
// BEGIN_HADRON
|
||||
HandleDatabricksAuthExtension,
|
||||
// END_HADRON
|
||||
CreateAvailabilityCheck,
|
||||
// BEGIN_HADRON
|
||||
AddDatabricksGrants,
|
||||
CreateDatabricksMisc,
|
||||
// END_HADRON
|
||||
DropRoles,
|
||||
FinalizeDropLogicalSubscriptions,
|
||||
}
|
||||
@@ -525,6 +578,7 @@ pub async fn apply_operations<'a, Fut, F>(
|
||||
jwks_roles: Arc<HashSet<String>>,
|
||||
apply_spec_phase: ApplySpecPhase,
|
||||
client: F,
|
||||
lakebase_mode: bool,
|
||||
) -> Result<()>
|
||||
where
|
||||
F: FnOnce() -> Fut,
|
||||
@@ -571,6 +625,23 @@ where
|
||||
},
|
||||
query
|
||||
);
|
||||
if !lakebase_mode {
|
||||
return res;
|
||||
}
|
||||
// BEGIN HADRON
|
||||
if let Err(e) = res.as_ref() {
|
||||
if let Some(sql_state) = e.code() {
|
||||
if sql_state.code() == "57014" {
|
||||
// SQL State 57014 (ERRCODE_QUERY_CANCELED) is used for statement timeouts.
|
||||
// Increment the counter whenever a statement timeout occurs. Timeouts on
|
||||
// this configuration path can only occur due to PS connectivity problems that
|
||||
// Postgres failed to recover from.
|
||||
COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
// END HADRON
|
||||
|
||||
res
|
||||
}
|
||||
.instrument(inspan)
|
||||
@@ -608,10 +679,44 @@ async fn get_operations<'a>(
|
||||
ApplySpecPhase::CreatePrivilegedRole => Ok(Box::new(once(Operation {
|
||||
query: format!(
|
||||
include_str!("sql/create_privileged_role.sql"),
|
||||
privileged_role_name = params.privileged_role_name
|
||||
privileged_role_name = params.privileged_role_name,
|
||||
privileges = if params.lakebase_mode {
|
||||
"CREATEDB CREATEROLE NOLOGIN BYPASSRLS"
|
||||
} else {
|
||||
"CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS"
|
||||
}
|
||||
),
|
||||
comment: None,
|
||||
}))),
|
||||
// BEGIN_HADRON
|
||||
// New Hadron phase
|
||||
ApplySpecPhase::CreateDatabricksRoles => {
|
||||
let queries = create_databricks_roles();
|
||||
let operations = queries.into_iter().map(|query| Operation {
|
||||
query,
|
||||
comment: None,
|
||||
});
|
||||
Ok(Box::new(operations))
|
||||
}
|
||||
|
||||
// Backfill existing databricks_reader_* roles with statement timeout from GUC
|
||||
ApplySpecPhase::AlterDatabricksRoles => {
|
||||
let query = String::from(include_str!(
|
||||
"sql/alter_databricks_reader_roles_timeout.sql"
|
||||
));
|
||||
|
||||
let operations = once(Operation {
|
||||
query,
|
||||
comment: Some(
|
||||
"Backfill existing databricks_reader_* roles with statement timeout"
|
||||
.to_string(),
|
||||
),
|
||||
});
|
||||
|
||||
Ok(Box::new(operations))
|
||||
}
|
||||
// End of new Hadron Phase
|
||||
// END_HADRON
|
||||
ApplySpecPhase::DropInvalidDatabases => {
|
||||
let mut ctx = ctx.write().await;
|
||||
let databases = &mut ctx.dbs;
|
||||
@@ -981,7 +1086,10 @@ async fn get_operations<'a>(
|
||||
// N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
||||
role_name = escaped_role,
|
||||
outer_tag = outer_tag,
|
||||
),
|
||||
)
|
||||
// HADRON change:
|
||||
.replace("neon_superuser", ¶ms.privileged_role_name),
|
||||
// HADRON change end ,
|
||||
comment: None,
|
||||
},
|
||||
// This now will only drop privileges of the role
|
||||
@@ -1017,7 +1125,8 @@ async fn get_operations<'a>(
|
||||
comment: None,
|
||||
},
|
||||
Operation {
|
||||
query: String::from(include_str!("sql/default_grants.sql")),
|
||||
query: String::from(include_str!("sql/default_grants.sql"))
|
||||
.replace("neon_superuser", ¶ms.privileged_role_name),
|
||||
comment: None,
|
||||
},
|
||||
]
|
||||
@@ -1086,6 +1195,28 @@ async fn get_operations<'a>(
|
||||
|
||||
Ok(Box::new(operations))
|
||||
}
|
||||
// BEGIN_HADRON
|
||||
// Note: we may want to version the extension someday, but for now we just drop it and recreate it.
|
||||
ApplySpecPhase::HandleDatabricksAuthExtension => {
|
||||
let operations = vec![
|
||||
Operation {
|
||||
query: String::from("DROP EXTENSION IF EXISTS databricks_auth"),
|
||||
comment: Some(String::from("dropping existing databricks_auth extension")),
|
||||
},
|
||||
Operation {
|
||||
query: String::from("CREATE EXTENSION databricks_auth"),
|
||||
comment: Some(String::from("creating databricks_auth extension")),
|
||||
},
|
||||
Operation {
|
||||
query: String::from("GRANT SELECT ON databricks_auth_metrics TO pg_monitor"),
|
||||
comment: Some(String::from("grant select on databricks auth counters")),
|
||||
},
|
||||
]
|
||||
.into_iter();
|
||||
|
||||
Ok(Box::new(operations))
|
||||
}
|
||||
// END_HADRON
|
||||
ApplySpecPhase::CreateAvailabilityCheck => Ok(Box::new(once(Operation {
|
||||
query: String::from(include_str!("sql/add_availabilitycheck_tables.sql")),
|
||||
comment: None,
|
||||
@@ -1103,6 +1234,63 @@ async fn get_operations<'a>(
|
||||
|
||||
Ok(Box::new(operations))
|
||||
}
|
||||
|
||||
// BEGIN_HADRON
|
||||
// New Hadron phases
|
||||
//
|
||||
// Grants permissions to roles that are used by Databricks.
|
||||
ApplySpecPhase::AddDatabricksGrants => {
|
||||
let operations = vec![
|
||||
Operation {
|
||||
query: String::from("GRANT USAGE ON SCHEMA neon TO databricks_monitor"),
|
||||
comment: Some(String::from(
|
||||
"Permissions needed to execute neon.* functions (in the postgres database)",
|
||||
)),
|
||||
},
|
||||
Operation {
|
||||
query: String::from(
|
||||
"GRANT SELECT, INSERT, UPDATE ON health_check TO databricks_monitor",
|
||||
),
|
||||
comment: Some(String::from("Permissions needed for read and write probes")),
|
||||
},
|
||||
Operation {
|
||||
query: String::from(
|
||||
"GRANT EXECUTE ON FUNCTION pg_ls_dir(text) TO databricks_monitor",
|
||||
),
|
||||
comment: Some(String::from(
|
||||
"Permissions needed to monitor .snap file counts",
|
||||
)),
|
||||
},
|
||||
Operation {
|
||||
query: String::from(
|
||||
"GRANT SELECT ON neon.neon_perf_counters TO databricks_monitor",
|
||||
),
|
||||
comment: Some(String::from(
|
||||
"Permissions needed to access neon performance counters view",
|
||||
)),
|
||||
},
|
||||
Operation {
|
||||
query: String::from(
|
||||
"GRANT EXECUTE ON FUNCTION neon.get_perf_counters() TO databricks_monitor",
|
||||
),
|
||||
comment: Some(String::from(
|
||||
"Permissions needed to execute the underlying performance counters function",
|
||||
)),
|
||||
},
|
||||
]
|
||||
.into_iter();
|
||||
|
||||
Ok(Box::new(operations))
|
||||
}
|
||||
// Creates minor objects that are used by Databricks.
|
||||
ApplySpecPhase::CreateDatabricksMisc => Ok(Box::new(once(Operation {
|
||||
query: String::from(include_str!("sql/create_databricks_misc.sql")),
|
||||
comment: Some(String::from(
|
||||
"The function databricks_monitor uses to convert exception to 0 or 1",
|
||||
)),
|
||||
}))),
|
||||
// End of new Hadron phases
|
||||
// END_HADRON
|
||||
ApplySpecPhase::FinalizeDropLogicalSubscriptions => Ok(Box::new(once(Operation {
|
||||
query: String::from(include_str!("sql/finalize_drop_subscriptions.sql")),
|
||||
comment: None,
|
||||
|
||||
@@ -0,0 +1,25 @@
DO $$
DECLARE
reader_role RECORD;
timeout_value TEXT;
BEGIN
-- Get the current GUC setting for reader statement timeout
SELECT current_setting('databricks.reader_statement_timeout', true) INTO timeout_value;

-- Only proceed if timeout_value is not null/empty and not '0' (disabled)
IF timeout_value IS NOT NULL AND timeout_value != '' AND timeout_value != '0' THEN
-- Find all databricks_reader_* roles and update their statement_timeout
FOR reader_role IN
SELECT r.rolname
FROM pg_roles r
WHERE r.rolname ~ '^databricks_reader_\d+$'
LOOP
-- Apply the timeout setting to the role (will overwrite existing setting)
EXECUTE format('ALTER ROLE %I SET statement_timeout = %L',
reader_role.rolname, timeout_value);

RAISE LOG 'Updated statement_timeout = % for role %', timeout_value, reader_role.rolname;
END LOOP;
END IF;
END
$$;
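A small illustration of which role names the pattern above selects; this Rust snippet using the regex crate is an aside for clarity only, since the SQL ~ operator applies an equivalent anchored pattern.

    // Illustration only: the '^databricks_reader_\d+$' pattern used in the DO block above.
    use regex::Regex;

    fn main() {
        let re = Regex::new(r"^databricks_reader_\d+$").unwrap();
        assert!(re.is_match("databricks_reader_42"));      // numbered reader roles match
        assert!(!re.is_match("databricks_reader_admin"));  // non-numeric suffix: no match
        assert!(!re.is_match("databricks_reader_42_tmp")); // anchored: trailing text rejected
    }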
|
||||
15
compute_tools/src/sql/create_databricks_misc.sql
Normal file
@@ -0,0 +1,15 @@
ALTER ROLE databricks_monitor SET statement_timeout = '60s';

CREATE OR REPLACE FUNCTION health_check_write_succeeds()
RETURNS INTEGER AS $$
BEGIN
INSERT INTO health_check VALUES (1, now())
ON CONFLICT (id) DO UPDATE
SET updated_at = now();

RETURN 1;
EXCEPTION WHEN OTHERS THEN
RAISE EXCEPTION '[DATABRICKS_SMGR] health_check failed: [%] %', SQLSTATE, SQLERRM;
RETURN 0;
END;
$$ LANGUAGE plpgsql;
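As a hedged sketch of how a monitor might call the probe defined above: it returns 1 on success, and a failed write surfaces as a SQL exception. The client library (tokio-postgres) and connection handling here are assumptions, not part of this change.

    // Sketch only: invoke health_check_write_succeeds() from Rust.
    use tokio_postgres::NoTls;

    async fn write_probe(conn_str: &str) -> anyhow::Result<i32> {
        let (client, connection) = tokio_postgres::connect(conn_str, NoTls).await?;
        tokio::spawn(connection); // drive the connection in the background
        let row = client
            .query_one("SELECT health_check_write_succeeds()", &[])
            .await?;
        Ok(row.get(0)) // 1 on success; failures propagate as Err from query_one
    }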
|
||||
@@ -2,7 +2,7 @@ DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}')
|
||||
THEN
|
||||
CREATE ROLE {privileged_role_name} CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
CREATE ROLE {privileged_role_name} {privileges} IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
|
||||
@@ -19,6 +19,9 @@ use compute_api::requests::ComputeClaimsScope;
|
||||
use compute_api::spec::{ComputeMode, PageserverProtocol};
|
||||
use control_plane::broker::StorageBroker;
|
||||
use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode};
|
||||
use control_plane::endpoint::{
|
||||
local_pageserver_conf_to_conn_info, tenant_locate_response_to_conn_info,
|
||||
};
|
||||
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
|
||||
use control_plane::local_env;
|
||||
use control_plane::local_env::{
|
||||
@@ -44,7 +47,6 @@ use pageserver_api::models::{
|
||||
};
|
||||
use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::parse_host_port;
|
||||
use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId};
|
||||
use safekeeper_api::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
|
||||
@@ -52,7 +54,6 @@ use safekeeper_api::{
|
||||
};
|
||||
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
|
||||
use tokio::task::JoinSet;
|
||||
use url::Host;
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -1546,62 +1547,41 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
)?;
|
||||
}
|
||||
|
||||
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
|
||||
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
|
||||
// Use gRPC if requested.
|
||||
let pageserver = if endpoint.grpc {
|
||||
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
|
||||
let (host, port) = parse_host_port(grpc_addr)?;
|
||||
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
|
||||
(PageserverProtocol::Grpc, host, port)
|
||||
} else {
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
|
||||
let port = port.unwrap_or(5432);
|
||||
(PageserverProtocol::Libpq, host, port)
|
||||
};
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// fully managed by storage controller, therefore not sharded.
|
||||
(vec![pageserver], DEFAULT_STRIPE_SIZE)
|
||||
let prefer_protocol = if endpoint.grpc {
|
||||
PageserverProtocol::Grpc
|
||||
} else {
|
||||
PageserverProtocol::Libpq
|
||||
};
|
||||
|
||||
let mut pageserver_conninfo = if let Some(ps_id) = pageserver_id {
|
||||
let conf = env.get_pageserver_conf(ps_id).unwrap();
|
||||
local_pageserver_conf_to_conn_info(conf)?
|
||||
} else {
|
||||
// Look up the currently attached location of the tenant, and its striping metadata,
|
||||
// to pass these on to postgres.
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
|
||||
let pageservers = futures::future::try_join_all(
|
||||
locate_result.shards.into_iter().map(|shard| async move {
|
||||
if let ComputeMode::Static(lsn) = endpoint.mode {
|
||||
// Initialize LSN leases for static computes.
|
||||
assert!(!locate_result.shards.is_empty());
|
||||
|
||||
// Initialize LSN leases for static computes.
|
||||
if let ComputeMode::Static(lsn) = endpoint.mode {
|
||||
futures::future::try_join_all(locate_result.shards.iter().map(
|
||||
|shard| async move {
|
||||
let conf = env.get_pageserver_conf(shard.node_id).unwrap();
|
||||
let pageserver = PageServerNode::from_env(env, conf);
|
||||
|
||||
pageserver
|
||||
.http_client
|
||||
.timeline_init_lsn_lease(shard.shard_id, endpoint.timeline_id, lsn)
|
||||
.await?;
|
||||
}
|
||||
.await
|
||||
},
|
||||
))
|
||||
.await?;
|
||||
}
|
||||
|
||||
let pageserver = if endpoint.grpc {
|
||||
(
|
||||
PageserverProtocol::Grpc,
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))?,
|
||||
shard.listen_grpc_port.expect("no gRPC port"),
|
||||
)
|
||||
} else {
|
||||
(
|
||||
PageserverProtocol::Libpq,
|
||||
Host::parse(&shard.listen_pg_addr)?,
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
};
|
||||
anyhow::Ok(pageserver)
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
let stripe_size = locate_result.shard_params.stripe_size;
|
||||
|
||||
(pageservers, stripe_size)
|
||||
tenant_locate_response_to_conn_info(&locate_result)?
|
||||
};
|
||||
assert!(!pageservers.is_empty());
|
||||
pageserver_conninfo.prefer_protocol = prefer_protocol;
|
||||
|
||||
let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
|
||||
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
|
||||
@@ -1631,9 +1611,8 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
endpoint_storage_addr,
|
||||
safekeepers_generation,
|
||||
safekeepers,
|
||||
pageservers,
|
||||
pageserver_conninfo,
|
||||
remote_ext_base_url: remote_ext_base_url.clone(),
|
||||
shard_stripe_size: stripe_size.0 as usize,
|
||||
create_test_user: args.create_test_user,
|
||||
start_timeout: args.start_timeout,
|
||||
autoprewarm: args.autoprewarm,
|
||||
@@ -1650,37 +1629,29 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.endpoints
|
||||
.get(endpoint_id.as_str())
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
let pageservers = match args.pageserver_id {
|
||||
let prefer_protocol = if endpoint.grpc {
|
||||
PageserverProtocol::Grpc
|
||||
} else {
|
||||
PageserverProtocol::Libpq
|
||||
};
|
||||
let mut pageserver_conninfo = match args.pageserver_id {
|
||||
Some(pageserver_id) => {
|
||||
let pageserver =
|
||||
PageServerNode::from_env(env, env.get_pageserver_conf(pageserver_id)?);
|
||||
|
||||
vec![(
|
||||
PageserverProtocol::Libpq,
|
||||
pageserver.pg_connection_config.host().clone(),
|
||||
pageserver.pg_connection_config.port(),
|
||||
)]
|
||||
let conf = env.get_pageserver_conf(pageserver_id)?;
|
||||
local_pageserver_conf_to_conn_info(conf)?
|
||||
}
|
||||
None => {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
.tenant_locate(endpoint.tenant_id)
|
||||
.await?
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
PageserverProtocol::Libpq,
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Storage controller reported malformed host"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
let locate_result =
|
||||
storage_controller.tenant_locate(endpoint.tenant_id).await?;
|
||||
|
||||
tenant_locate_response_to_conn_info(&locate_result)?
|
||||
}
|
||||
};
|
||||
pageserver_conninfo.prefer_protocol = prefer_protocol;
|
||||
|
||||
endpoint.update_pageservers_in_config(pageservers).await?;
|
||||
endpoint
|
||||
.update_pageservers_in_config(&pageserver_conninfo)
|
||||
.await?;
|
||||
}
|
||||
EndpointCmd::Reconfigure(args) => {
|
||||
let endpoint_id = &args.endpoint_id;
|
||||
@@ -1688,51 +1659,30 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.endpoints
|
||||
.get(endpoint_id.as_str())
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
|
||||
let conf = env.get_pageserver_conf(ps_id)?;
|
||||
// Use gRPC if requested.
|
||||
let pageserver = if endpoint.grpc {
|
||||
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
|
||||
let (host, port) = parse_host_port(grpc_addr)?;
|
||||
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
|
||||
(PageserverProtocol::Grpc, host, port)
|
||||
} else {
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
|
||||
let port = port.unwrap_or(5432);
|
||||
(PageserverProtocol::Libpq, host, port)
|
||||
};
|
||||
vec![pageserver]
|
||||
|
||||
let prefer_protocol = if endpoint.grpc {
|
||||
PageserverProtocol::Grpc
|
||||
} else {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
.tenant_locate(endpoint.tenant_id)
|
||||
.await?
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
// Use gRPC if requested.
|
||||
if endpoint.grpc {
|
||||
(
|
||||
PageserverProtocol::Grpc,
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))
|
||||
.expect("bad hostname"),
|
||||
shard.listen_grpc_port.expect("no gRPC port"),
|
||||
)
|
||||
} else {
|
||||
(
|
||||
PageserverProtocol::Libpq,
|
||||
Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
PageserverProtocol::Libpq
|
||||
};
|
||||
let mut pageserver_conninfo = if let Some(ps_id) = args.endpoint_pageserver_id {
|
||||
let conf = env.get_pageserver_conf(ps_id)?;
|
||||
local_pageserver_conf_to_conn_info(conf)?
|
||||
} else {
|
||||
// Look up the currently attached location of the tenant, and its striping metadata,
|
||||
// to pass these on to postgres.
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
|
||||
|
||||
tenant_locate_response_to_conn_info(&locate_result)?
|
||||
};
|
||||
pageserver_conninfo.prefer_protocol = prefer_protocol;
|
||||
|
||||
// If --safekeepers argument is given, use only the listed
|
||||
// safekeeper nodes; otherwise all from the env.
|
||||
let safekeepers = parse_safekeepers(&args.safekeepers)?;
|
||||
endpoint
|
||||
.reconfigure(Some(pageservers), None, safekeepers, None)
|
||||
.reconfigure(Some(&pageserver_conninfo), safekeepers, None)
|
||||
.await?;
|
||||
}
|
||||
EndpointCmd::RefreshConfiguration(args) => {
|
||||
|
||||
@@ -37,7 +37,7 @@
|
||||
//! <other PostgreSQL files>
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::fmt::Display;
|
||||
use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream};
|
||||
use std::path::PathBuf;
|
||||
@@ -58,8 +58,12 @@ use compute_api::responses::{
|
||||
};
|
||||
use compute_api::spec::{
|
||||
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
|
||||
PgIdent, RemoteExtSpec, Role,
|
||||
PageserverShardInfo, PgIdent, RemoteExtSpec, Role,
|
||||
};
|
||||
|
||||
// re-export these, because they're used in the reconfigure() function
|
||||
pub use compute_api::spec::{PageserverConnectionInfo, PageserverShardConnectionInfo};
|
||||
|
||||
use jsonwebtoken::jwk::{
|
||||
AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
|
||||
OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
|
||||
@@ -74,9 +78,11 @@ use sha2::{Digest, Sha256};
|
||||
use spki::der::Decode;
|
||||
use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
|
||||
use tracing::debug;
|
||||
use url::Host;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::shard::ShardStripeSize;
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
|
||||
use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT as DEFAULT_PAGESERVER_GRPC_PORT;
|
||||
use postgres_connection::parse_host_port;
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
@@ -387,9 +393,8 @@ pub struct EndpointStartArgs {
|
||||
pub endpoint_storage_addr: String,
|
||||
pub safekeepers_generation: Option<SafekeeperGeneration>,
|
||||
pub safekeepers: Vec<NodeId>,
|
||||
pub pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
pub pageserver_conninfo: PageserverConnectionInfo,
|
||||
pub remote_ext_base_url: Option<String>,
|
||||
pub shard_stripe_size: usize,
|
||||
pub create_test_user: bool,
|
||||
pub start_timeout: Duration,
|
||||
pub autoprewarm: bool,
|
||||
@@ -662,14 +667,6 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
|
||||
pageservers
|
||||
.iter()
|
||||
.map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}"))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
}
|
||||
|
||||
/// Map safekeepers ids to the actual connection strings.
|
||||
fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
|
||||
let mut safekeeper_connstrings = Vec::new();
|
||||
@@ -715,9 +712,6 @@ impl Endpoint {
|
||||
std::fs::remove_dir_all(self.pgdata())?;
|
||||
}
|
||||
|
||||
let pageserver_connstring = Self::build_pageserver_connstr(&args.pageservers);
|
||||
assert!(!pageserver_connstring.is_empty());
|
||||
|
||||
let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?;
|
||||
|
||||
// check for file remote_extensions_spec.json
|
||||
@@ -732,6 +726,44 @@ impl Endpoint {
|
||||
remote_extensions = None;
|
||||
};
|
||||
|
||||
// For the sake of backwards-compatibility, also fill in 'pageserver_connstring'
|
||||
//
|
||||
// XXX: I believe this is not really needed, except to make
|
||||
// test_forward_compatibility happy.
|
||||
//
|
||||
// Use a closure so that we can conveniently return None in the middle of the
|
||||
// loop.
|
||||
let pageserver_connstring: Option<String> = (|| {
|
||||
let num_shards = args.pageserver_conninfo.shard_count.count();
|
||||
let mut connstrings = Vec::new();
|
||||
for shard_no in 0..num_shards {
|
||||
let shard_index = ShardIndex {
|
||||
shard_count: args.pageserver_conninfo.shard_count,
|
||||
shard_number: ShardNumber(shard_no),
|
||||
};
|
||||
let shard = args
|
||||
.pageserver_conninfo
|
||||
.shards
|
||||
.get(&shard_index)
|
||||
.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"shard {} not found in pageserver_connection_info",
|
||||
shard_index
|
||||
)
|
||||
})?;
|
||||
let pageserver = shard
|
||||
.pageservers
|
||||
.first()
|
||||
.ok_or(anyhow!("must have at least one pageserver"))?;
|
||||
if let Some(libpq_url) = &pageserver.libpq_url {
|
||||
connstrings.push(libpq_url.clone());
|
||||
} else {
|
||||
return Ok::<_, anyhow::Error>(None);
|
||||
}
|
||||
}
|
||||
Ok(Some(connstrings.join(",")))
|
||||
})()?;
|
||||
|
||||
// Create config file
|
||||
let config = {
|
||||
let mut spec = ComputeSpec {
|
||||
@@ -776,13 +808,14 @@ impl Endpoint {
|
||||
branch_id: None,
|
||||
endpoint_id: Some(self.endpoint_id.clone()),
|
||||
mode: self.mode,
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
pageserver_connection_info: Some(args.pageserver_conninfo.clone()),
|
||||
pageserver_connstring,
|
||||
safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: args.auth_token.clone(),
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(args.shard_stripe_size),
|
||||
shard_stripe_size: args.pageserver_conninfo.stripe_size, // redundant with pageserver_connection_info.stripe_size
|
||||
local_proxy_config: None,
|
||||
reconfigure_concurrency: self.reconfigure_concurrency,
|
||||
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
|
||||
@@ -966,7 +999,7 @@ impl Endpoint {
|
||||
// Update the pageservers in the spec file of the endpoint. This is useful to test the spec refresh scenario.
|
||||
pub async fn update_pageservers_in_config(
|
||||
&self,
|
||||
pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
pageserver_conninfo: &PageserverConnectionInfo,
|
||||
) -> Result<()> {
|
||||
let config_path = self.endpoint_path().join("config.json");
|
||||
let mut config: ComputeConfig = {
|
||||
@@ -974,10 +1007,8 @@ impl Endpoint {
|
||||
serde_json::from_reader(file)?
|
||||
};
|
||||
|
||||
let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
|
||||
assert!(!pageserver_connstring.is_empty());
|
||||
let mut spec = config.spec.unwrap();
|
||||
spec.pageserver_connstring = Some(pageserver_connstring);
|
||||
spec.pageserver_connection_info = Some(pageserver_conninfo.clone());
|
||||
config.spec = Some(spec);
|
||||
|
||||
let file = std::fs::File::create(&config_path)?;
|
||||
@@ -1020,8 +1051,7 @@ impl Endpoint {
|
||||
|
||||
pub async fn reconfigure(
|
||||
&self,
|
||||
pageservers: Option<Vec<(PageserverProtocol, Host, u16)>>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
pageserver_conninfo: Option<&PageserverConnectionInfo>,
|
||||
safekeepers: Option<Vec<NodeId>>,
|
||||
safekeeper_generation: Option<SafekeeperGeneration>,
|
||||
) -> Result<()> {
|
||||
@@ -1036,15 +1066,15 @@ impl Endpoint {
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
|
||||
// If pageservers are not specified, don't change them.
|
||||
if let Some(pageservers) = pageservers {
|
||||
anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
|
||||
|
||||
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
|
||||
spec.pageserver_connstring = Some(pageserver_connstr);
|
||||
if stripe_size.is_some() {
|
||||
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
|
||||
}
|
||||
if let Some(pageserver_conninfo) = pageserver_conninfo {
|
||||
// If pageservers are provided, we need to ensure that they are not empty.
|
||||
// This is a requirement for the compute_ctl configuration.
|
||||
anyhow::ensure!(
|
||||
!pageserver_conninfo.shards.is_empty(),
|
||||
"no pageservers provided"
|
||||
);
|
||||
spec.pageserver_connection_info = Some(pageserver_conninfo.clone());
|
||||
spec.shard_stripe_size = pageserver_conninfo.stripe_size;
|
||||
}
|
||||
|
||||
// If safekeepers are not specified, don't change them.
|
||||
@@ -1093,11 +1123,9 @@ impl Endpoint {
|
||||
|
||||
pub async fn reconfigure_pageservers(
|
||||
&self,
|
||||
pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
pageservers: &PageserverConnectionInfo,
|
||||
) -> Result<()> {
|
||||
self.reconfigure(Some(pageservers), stripe_size, None, None)
|
||||
.await
|
||||
self.reconfigure(Some(pageservers), None, None).await
|
||||
}
|
||||
|
||||
pub async fn reconfigure_safekeepers(
|
||||
@@ -1105,7 +1133,7 @@ impl Endpoint {
|
||||
safekeepers: Vec<NodeId>,
|
||||
generation: SafekeeperGeneration,
|
||||
) -> Result<()> {
|
||||
self.reconfigure(None, None, Some(safekeepers), Some(generation))
|
||||
self.reconfigure(None, Some(safekeepers), Some(generation))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -1188,3 +1216,84 @@ impl Endpoint {
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// If the caller tells us which pageserver to use, this is not a tenant that is
/// fully managed by the storage controller, and is therefore not sharded.
|
||||
pub fn local_pageserver_conf_to_conn_info(
|
||||
conf: &crate::local_env::PageServerConf,
|
||||
) -> Result<PageserverConnectionInfo> {
|
||||
let libpq_url = {
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
|
||||
let port = port.unwrap_or(5432);
|
||||
Some(format!("postgres://no_user@{host}:{port}"))
|
||||
};
|
||||
let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
|
||||
let (host, port) = parse_host_port(grpc_addr)?;
|
||||
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
|
||||
Some(format!("grpc://no_user@{host}:{port}"))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let ps_conninfo = PageserverShardConnectionInfo {
|
||||
id: Some(conf.id),
|
||||
libpq_url,
|
||||
grpc_url,
|
||||
};
|
||||
|
||||
let shard_info = PageserverShardInfo {
|
||||
pageservers: vec![ps_conninfo],
|
||||
};
|
||||
|
||||
let shards: HashMap<_, _> = vec![(ShardIndex::unsharded(), shard_info)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
Ok(PageserverConnectionInfo {
|
||||
shard_count: ShardCount::unsharded(),
|
||||
stripe_size: None,
|
||||
shards,
|
||||
prefer_protocol: PageserverProtocol::default(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn tenant_locate_response_to_conn_info(
|
||||
response: &pageserver_api::controller_api::TenantLocateResponse,
|
||||
) -> Result<PageserverConnectionInfo> {
|
||||
let mut shards = HashMap::new();
|
||||
for shard in response.shards.iter() {
|
||||
tracing::info!("parsing {}", shard.listen_pg_addr);
|
||||
let libpq_url = {
|
||||
let host = &shard.listen_pg_addr;
|
||||
let port = shard.listen_pg_port;
|
||||
Some(format!("postgres://no_user@{host}:{port}"))
|
||||
};
|
||||
let grpc_url = if let Some(grpc_addr) = &shard.listen_grpc_addr {
|
||||
let host = grpc_addr;
|
||||
let port = shard.listen_grpc_port.expect("no gRPC port");
|
||||
Some(format!("grpc://no_user@{host}:{port}"))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let shard_info = PageserverShardInfo {
|
||||
pageservers: vec![PageserverShardConnectionInfo {
|
||||
id: Some(shard.node_id),
|
||||
libpq_url,
|
||||
grpc_url,
|
||||
}],
|
||||
};
|
||||
|
||||
shards.insert(shard.shard_id.to_index(), shard_info);
|
||||
}
|
||||
|
||||
let stripe_size = if response.shard_params.count.is_unsharded() {
|
||||
None
|
||||
} else {
|
||||
Some(response.shard_params.stripe_size)
|
||||
};
|
||||
Ok(PageserverConnectionInfo {
|
||||
shard_count: response.shard_params.count,
|
||||
stripe_size,
|
||||
shards,
|
||||
prefer_protocol: PageserverProtocol::default(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -12,8 +12,9 @@ use regex::Regex;
|
||||
use remote_storage::RemotePath;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use url::Url;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize};
|
||||
|
||||
use crate::responses::TlsConfig;
|
||||
|
||||
@@ -105,8 +106,27 @@ pub struct ComputeSpec {
|
||||
// updated to fill these fields, we can make these non optional.
|
||||
pub tenant_id: Option<TenantId>,
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
|
||||
/// Pageserver information can be passed in three different ways:
|
||||
/// 1. Here in `pageserver_connection_info`
|
||||
/// 2. In the `pageserver_connstring` field.
|
||||
/// 3. in `cluster.settings`.
|
||||
///
|
||||
/// The goal is to use method 1. everywhere. But for backwards-compatibility with old
|
||||
/// versions of the control plane, `compute_ctl` will check 2. and 3. if the
|
||||
/// `pageserver_connection_info` field is missing.
|
||||
///
|
||||
/// If both `pageserver_connection_info` and `pageserver_connstring`+`shard_stripe_size` are
|
||||
/// given, they must contain the same information.
|
||||
pub pageserver_connection_info: Option<PageserverConnectionInfo>,
|
||||
|
||||
pub pageserver_connstring: Option<String>,
|
||||
|
||||
/// Stripe size for pageserver sharding, in pages. This is set together with the legacy
|
||||
/// `pageserver_connstring` field. When the modern `pageserver_connection_info` field is used,
|
||||
/// the stripe size is stored in `pageserver_connection_info.stripe_size` instead.
|
||||
pub shard_stripe_size: Option<ShardStripeSize>,
|
||||
|
||||
// More neon ids that we expose to the compute_ctl
|
||||
// and to postgres as neon extension GUCs.
|
||||
pub project_id: Option<String>,
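A hedged sketch of the fallback order described above, using the from_connstr helper added later in this change; the function name resolve_pageserver_conn_info is illustrative, and method 3 (cluster.settings) is omitted for brevity.

    // Sketch only: prefer the modern field, then fall back to the legacy connstring.
    fn resolve_pageserver_conn_info(spec: &ComputeSpec) -> anyhow::Result<PageserverConnectionInfo> {
        if let Some(info) = &spec.pageserver_connection_info {
            return Ok(info.clone()); // method 1: the modern field
        }
        if let Some(connstr) = &spec.pageserver_connstring {
            // method 2: comma-separated libpq connstrings plus the legacy stripe size
            return PageserverConnectionInfo::from_connstr(connstr, spec.shard_stripe_size);
        }
        anyhow::bail!("spec carries no pageserver connection information")
    }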
|
||||
@@ -139,10 +159,6 @@ pub struct ComputeSpec {
|
||||
|
||||
pub pgbouncer_settings: Option<IndexMap<String, String>>,
|
||||
|
||||
// Stripe size for pageserver sharding, in pages
|
||||
#[serde(default)]
|
||||
pub shard_stripe_size: Option<usize>,
|
||||
|
||||
/// Local Proxy configuration used for JWT authentication
|
||||
#[serde(default)]
|
||||
pub local_proxy_config: Option<LocalProxySpec>,
|
||||
@@ -217,6 +233,140 @@ pub enum ComputeFeature {
|
||||
UnknownFeature,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub struct PageserverConnectionInfo {
/// NB: 0 for unsharded tenants, 1 for sharded tenants with 1 shard, following storage
pub shard_count: ShardCount,

/// INVARIANT: null if shard_count is 0, otherwise non-null and immutable
pub stripe_size: Option<ShardStripeSize>,

pub shards: HashMap<ShardIndex, PageserverShardInfo>,

/// If the compute supports both protocols, this indicates which one it should use. The compute
/// may use other available protocols too, if it doesn't support the preferred one. The URLs
/// for the protocol specified here must be present for all shards, i.e. do not mark a protocol
/// as preferred if it cannot actually be used with all the pageservers.
#[serde(default)]
pub prefer_protocol: PageserverProtocol,
}
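A minimal construction sketch for the unsharded case, consistent with the invariants documented above; the localhost URL is a placeholder, and the build mirrors the control_plane helper added in this change.

    // Sketch only: unsharded means shard_count 0, no stripe size, and one entry keyed by
    // ShardIndex::unsharded().
    let shard_info = PageserverShardInfo {
        pageservers: vec![PageserverShardConnectionInfo {
            id: None,
            libpq_url: Some("postgres://no_user@localhost:64000".to_string()), // placeholder
            grpc_url: None,
        }],
    };
    let shards: HashMap<_, _> = vec![(ShardIndex::unsharded(), shard_info)]
        .into_iter()
        .collect();
    let conn_info = PageserverConnectionInfo {
        shard_count: ShardCount::unsharded(),
        stripe_size: None,
        shards,
        prefer_protocol: PageserverProtocol::Libpq,
    };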
|
||||
|
||||
/// Extract PageserverConnectionInfo from a comma-separated list of libpq connection strings.
|
||||
///
|
||||
/// This is used for backwards-compatibility, to parse the legacy
|
||||
/// [ComputeSpec::pageserver_connstring] field, or the 'neon.pageserver_connstring' GUC. Nowadays,
|
||||
/// the 'pageserver_connection_info' field should be used instead.
|
||||
impl PageserverConnectionInfo {
|
||||
pub fn from_connstr(
|
||||
connstr: &str,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
) -> Result<PageserverConnectionInfo, anyhow::Error> {
|
||||
let shard_infos: Vec<_> = connstr
|
||||
.split(',')
|
||||
.map(|connstr| PageserverShardInfo {
|
||||
pageservers: vec![PageserverShardConnectionInfo {
|
||||
id: None,
|
||||
libpq_url: Some(connstr.to_string()),
|
||||
grpc_url: None,
|
||||
}],
|
||||
})
|
||||
.collect();
|
||||
|
||||
match shard_infos.len() {
|
||||
0 => anyhow::bail!("empty connection string"),
|
||||
1 => {
|
||||
// We assume that if there's only one connection string, it means "unsharded",
|
||||
// rather than a sharded system with just a single shard. The latter is
|
||||
// possible in principle, but we never do it.
|
||||
let shard_count = ShardCount::unsharded();
|
||||
let only_shard = shard_infos.first().unwrap().clone();
|
||||
let shards = vec![(ShardIndex::unsharded(), only_shard)];
|
||||
Ok(PageserverConnectionInfo {
|
||||
shard_count,
|
||||
stripe_size: None,
|
||||
shards: shards.into_iter().collect(),
|
||||
prefer_protocol: PageserverProtocol::Libpq,
|
||||
})
|
||||
}
|
||||
n => {
|
||||
if stripe_size.is_none() {
|
||||
anyhow::bail!("{n} shards but no stripe_size");
|
||||
}
|
||||
let shard_count = ShardCount(n.try_into()?);
|
||||
let shards = shard_infos
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(idx, shard_info)| {
|
||||
(
|
||||
ShardIndex {
|
||||
shard_count,
|
||||
shard_number: ShardNumber(
|
||||
idx.try_into().expect("shard number fits in u8"),
|
||||
),
|
||||
},
|
||||
shard_info,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
Ok(PageserverConnectionInfo {
|
||||
shard_count,
|
||||
stripe_size,
|
||||
shards,
|
||||
prefer_protocol: PageserverProtocol::Libpq,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Convenience routine to get the connection string for a shard.
|
||||
pub fn shard_url(
|
||||
&self,
|
||||
shard_number: ShardNumber,
|
||||
protocol: PageserverProtocol,
|
||||
) -> anyhow::Result<&str> {
|
||||
let shard_index = ShardIndex {
|
||||
shard_number,
|
||||
shard_count: self.shard_count,
|
||||
};
|
||||
let shard = self.shards.get(&shard_index).ok_or_else(|| {
|
||||
anyhow::anyhow!("shard connection info missing for shard {}", shard_index)
|
||||
})?;
|
||||
|
||||
// Just use the first pageserver in the list. That's good enough for this
|
||||
// convenience routine; if you need more control, like round robin policy or
|
||||
// failover support, roll your own. (As of this writing, we never have more than
|
||||
// one pageserver per shard anyway, but that will change in the future.)
|
||||
let pageserver = shard
|
||||
.pageservers
|
||||
.first()
|
||||
.ok_or(anyhow::anyhow!("must have at least one pageserver"))?;
|
||||
|
||||
let result = match protocol {
|
||||
PageserverProtocol::Grpc => pageserver
|
||||
.grpc_url
|
||||
.as_ref()
|
||||
.ok_or(anyhow::anyhow!("no grpc_url for shard {shard_index}"))?,
|
||||
PageserverProtocol::Libpq => pageserver
|
||||
.libpq_url
|
||||
.as_ref()
|
||||
.ok_or(anyhow::anyhow!("no libpq_url for shard {shard_index}"))?,
|
||||
};
|
||||
Ok(result)
|
||||
}
|
||||
}
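A short usage sketch for the convenience routine above; the conn_info variable and the choice of shard zero with the preferred protocol are illustrative.

    // Sketch: look up the URL the compute should use for shard zero.
    let url = conn_info.shard_url(ShardNumber(0), conn_info.prefer_protocol)?;
    println!("connecting to pageserver at {url}");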
|
||||
|
||||
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
|
||||
pub struct PageserverShardInfo {
|
||||
pub pageservers: Vec<PageserverShardConnectionInfo>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
|
||||
pub struct PageserverShardConnectionInfo {
|
||||
pub id: Option<NodeId>,
|
||||
pub libpq_url: Option<String>,
|
||||
pub grpc_url: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
|
||||
pub struct RemoteExtSpec {
|
||||
pub public_extensions: Option<Vec<String>>,
|
||||
@@ -334,6 +484,12 @@ impl ComputeMode {
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for ComputeMode {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.to_type_str())
|
||||
}
|
||||
}
|
||||
|
||||
/// Log level for audit logging
|
||||
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
|
||||
pub enum ComputeAudit {
|
||||
@@ -470,13 +626,15 @@ pub struct JwksSettings {
|
||||
pub jwt_audience: Option<String>,
|
||||
}
|
||||
|
||||
/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
|
||||
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
|
||||
/// Protocol used to connect to a Pageserver.
|
||||
#[derive(Clone, Copy, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
|
||||
pub enum PageserverProtocol {
|
||||
/// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
|
||||
#[default]
|
||||
#[serde(rename = "libpq")]
|
||||
Libpq,
|
||||
/// A newer, gRPC-based protocol. Uses grpc:// scheme.
|
||||
#[serde(rename = "grpc")]
|
||||
Grpc,
|
||||
}
|
||||
|
||||
|
||||
@@ -9,10 +9,7 @@ regex.workspace = true
|
||||
bytes.workspace = true
|
||||
anyhow.workspace = true
|
||||
crc32c.workspace = true
|
||||
criterion.workspace = true
|
||||
once_cell.workspace = true
|
||||
log.workspace = true
|
||||
memoffset.workspace = true
|
||||
pprof.workspace = true
|
||||
thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
@@ -22,6 +19,7 @@ tracing.workspace = true
|
||||
postgres_versioninfo.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
env_logger.workspace = true
|
||||
postgres.workspace = true
|
||||
|
||||
|
||||
@@ -34,9 +34,8 @@ const SIZEOF_CONTROLDATA: usize = size_of::<ControlFileData>();
|
||||
impl ControlFileData {
|
||||
/// Compute the offset of the `crc` field within the `ControlFileData` struct.
|
||||
/// Equivalent to offsetof(ControlFileData, crc) in C.
|
||||
// Someday this can be const when the right compiler features land.
|
||||
fn pg_control_crc_offset() -> usize {
|
||||
memoffset::offset_of!(ControlFileData, crc)
|
||||
const fn pg_control_crc_offset() -> usize {
|
||||
std::mem::offset_of!(ControlFileData, crc)
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
@@ -4,12 +4,11 @@
|
||||
use crate::pg_constants;
|
||||
use crate::transaction_id_precedes;
|
||||
use bytes::BytesMut;
|
||||
use log::*;
|
||||
|
||||
use super::bindings::MultiXactId;
|
||||
|
||||
pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
|
||||
trace!(
|
||||
tracing::trace!(
|
||||
"handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)",
|
||||
status
|
||||
);
|
||||
|
||||
@@ -14,7 +14,6 @@ use super::xlog_utils::*;
|
||||
use crate::WAL_SEGMENT_SIZE;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use crc32c::*;
|
||||
use log::*;
|
||||
use std::cmp::min;
|
||||
use std::num::NonZeroU32;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -236,7 +235,7 @@ impl WalStreamDecoderHandler for WalStreamDecoder {
|
||||
// XLOG_SWITCH records are special. If we see one, we need to skip
|
||||
// to the next WAL segment.
|
||||
let next_lsn = if xlogrec.is_xlog_switch_record() {
|
||||
trace!("saw xlog switch record at {}", self.lsn);
|
||||
tracing::trace!("saw xlog switch record at {}", self.lsn);
|
||||
self.lsn + self.lsn.calc_padding(WAL_SEGMENT_SIZE as u64)
|
||||
} else {
|
||||
// Pad to an 8-byte boundary
|
||||
|
||||
@@ -23,8 +23,6 @@ use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||
use bytes::BytesMut;
|
||||
use bytes::{Buf, Bytes};
|
||||
|
||||
use log::*;
|
||||
|
||||
use serde::Serialize;
|
||||
use std::ffi::{CString, OsStr};
|
||||
use std::fs::File;
|
||||
@@ -235,7 +233,7 @@ pub fn find_end_of_wal(
|
||||
let mut curr_lsn = start_lsn;
|
||||
let mut buf = [0u8; XLOG_BLCKSZ];
|
||||
let pg_version = MY_PGVERSION;
|
||||
debug!("find_end_of_wal PG_VERSION: {}", pg_version);
|
||||
tracing::debug!("find_end_of_wal PG_VERSION: {}", pg_version);
|
||||
|
||||
let mut decoder = WalStreamDecoder::new(start_lsn, pg_version);
|
||||
|
||||
@@ -247,7 +245,7 @@ pub fn find_end_of_wal(
|
||||
match open_wal_segment(&seg_file_path)? {
|
||||
None => {
|
||||
// no more segments
|
||||
debug!(
|
||||
tracing::debug!(
|
||||
"find_end_of_wal reached end at {:?}, segment {:?} doesn't exist",
|
||||
result, seg_file_path
|
||||
);
|
||||
@@ -260,7 +258,7 @@ pub fn find_end_of_wal(
|
||||
while curr_lsn.segment_number(wal_seg_size) == segno {
|
||||
let bytes_read = segment.read(&mut buf)?;
|
||||
if bytes_read == 0 {
|
||||
debug!(
|
||||
tracing::debug!(
|
||||
"find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}",
|
||||
result,
|
||||
seg_file_path,
|
||||
@@ -276,7 +274,7 @@ pub fn find_end_of_wal(
|
||||
match decoder.poll_decode() {
|
||||
Ok(Some(record)) => result = record.0,
|
||||
Err(e) => {
|
||||
debug!(
|
||||
tracing::debug!(
|
||||
"find_end_of_wal reached end at {:?}, decode error: {:?}",
|
||||
result, e
|
||||
);
|
||||
|
||||
@@ -9,7 +9,7 @@ use postgres_protocol2::message::backend::{ErrorFields, ErrorResponseBody};
|
||||
pub use self::sqlstate::*;
|
||||
|
||||
#[allow(clippy::unreadable_literal)]
|
||||
mod sqlstate;
|
||||
pub mod sqlstate;
|
||||
|
||||
/// The severity of a Postgres error or notice.
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
||||
|
||||
@@ -34,13 +34,16 @@ macro_rules! critical {
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! critical_timeline {
|
||||
($tenant_shard_id:expr, $timeline_id:expr, $($arg:tt)*) => {{
|
||||
($tenant_shard_id:expr, $timeline_id:expr, $corruption_detected:expr, $($arg:tt)*) => {{
|
||||
if cfg!(debug_assertions) {
|
||||
panic!($($arg)*);
|
||||
}
|
||||
// Increment both metrics
|
||||
$crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
|
||||
$crate::logging::HADRON_CRITICAL_STORAGE_EVENT_COUNT_METRIC.inc(&$tenant_shard_id.to_string(), &$timeline_id.to_string());
|
||||
if let Some(c) = $corruption_detected.as_ref() {
|
||||
c.store(true, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
let backtrace = std::backtrace::Backtrace::capture();
|
||||
tracing::error!("CRITICAL: [tenant_shard_id: {}, timeline_id: {}] {}\n{backtrace}",
|
||||
$tenant_shard_id, $timeline_id, format!($($arg)*));
|
||||
|
||||
@@ -32,6 +32,9 @@ pub struct PageserverFeedback {
|
||||
pub replytime: SystemTime,
|
||||
/// Used to track feedbacks from different shards. Always zero for unsharded tenants.
|
||||
pub shard_number: u32,
|
||||
/// If true, the pageserver has detected corruption and the safekeeper and postgres
|
||||
/// should stop sending WAL.
|
||||
pub corruption_detected: bool,
|
||||
}
|
||||
|
||||
impl PageserverFeedback {
|
||||
@@ -43,6 +46,7 @@ impl PageserverFeedback {
|
||||
disk_consistent_lsn: Lsn::INVALID,
|
||||
replytime: *PG_EPOCH,
|
||||
shard_number: 0,
|
||||
corruption_detected: false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,6 +105,13 @@ impl PageserverFeedback {
|
||||
buf.put_u32(self.shard_number);
|
||||
}
|
||||
|
||||
if self.corruption_detected {
|
||||
nkeys += 1;
|
||||
buf.put_slice(b"corruption_detected\0");
|
||||
buf.put_i32(1);
|
||||
buf.put_u8(1);
|
||||
}
|
||||
|
||||
buf[buf_ptr] = nkeys;
|
||||
}
|
||||
|
||||
@@ -147,6 +158,11 @@ impl PageserverFeedback {
|
||||
assert_eq!(len, 4);
|
||||
rf.shard_number = buf.get_u32();
|
||||
}
|
||||
b"corruption_detected" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 1);
|
||||
rf.corruption_detected = buf.get_u8() != 0;
|
||||
}
|
||||
_ => {
|
||||
let len = buf.get_i32();
|
||||
warn!(
|
||||
@@ -206,6 +222,26 @@ mod tests {
|
||||
assert_eq!(rf, rf_parsed);
|
||||
}
|
||||
|
||||
// Test that databricks-specific fields added to the PageserverFeedback message are serialized
|
||||
// and deserialized correctly, in addition to the existing fields from upstream.
|
||||
#[test]
|
||||
fn test_replication_feedback_databricks_fields() {
|
||||
let mut rf = PageserverFeedback::empty();
|
||||
rf.current_timeline_size = 12345678;
|
||||
rf.last_received_lsn = Lsn(23456789);
|
||||
rf.disk_consistent_lsn = Lsn(34567890);
|
||||
rf.remote_consistent_lsn = Lsn(45678901);
|
||||
rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
rf.shard_number = 1;
|
||||
rf.corruption_detected = true;
|
||||
|
||||
let mut data = BytesMut::new();
|
||||
rf.serialize(&mut data);
|
||||
|
||||
let rf_parsed = PageserverFeedback::parse(data.freeze());
|
||||
assert_eq!(rf, rf_parsed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_replication_feedback_unknown_key() {
|
||||
let mut rf = PageserverFeedback::empty();
|
||||
|
||||
@@ -59,6 +59,10 @@ impl ShardCount {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
pub const MIN: Self = Self(0);
|
||||
|
||||
pub fn unsharded() -> Self {
|
||||
ShardCount(0)
|
||||
}
|
||||
|
||||
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
|
||||
/// legacy format for TenantShardId that excludes the shard suffix", also known
|
||||
/// as [`TenantShardId::unsharded`].
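A tiny illustration of the zero-means-unsharded convention documented above; the count() behavior shown is an assumption based on that doc comment.

    // Sketch: ShardCount(0) is the legacy "unsharded" encoding, still denoting one shard.
    let unsharded = ShardCount::unsharded();
    assert!(unsharded.is_unsharded());
    assert_eq!(unsharded.count(), 1); // assumed: count() treats 0 as a single shard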
|
||||
|
||||
@@ -426,6 +426,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
|
||||
remote_consistent_lsn: 0,
|
||||
replytime: 0,
|
||||
shard_number: 0,
|
||||
corruption_detected: false,
|
||||
};
|
||||
|
||||
let empty_wal_rate_limiter = crate::bindings::WalRateLimiter {
|
||||
|
||||
@@ -14,9 +14,9 @@ use utils::logging::warn_slow;
|
||||
|
||||
use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
|
||||
use crate::retry::Retry;
|
||||
use crate::split::GetPageSplitter;
|
||||
use compute_api::spec::PageserverProtocol;
|
||||
use pageserver_page_api as page_api;
|
||||
use pageserver_page_api::GetPageSplitter;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize};
|
||||
|
||||
@@ -230,16 +230,14 @@ impl PageserverClient {
|
||||
) -> tonic::Result<page_api::GetPageResponse> {
|
||||
// Fast path: request is for a single shard.
|
||||
if let Some(shard_id) =
|
||||
GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size)
|
||||
.map_err(|err| tonic::Status::internal(err.to_string()))?
|
||||
GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size)?
|
||||
{
|
||||
return Self::get_page_with_shard(req, shards.get(shard_id)?).await;
|
||||
}
|
||||
|
||||
// Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and
|
||||
// reassemble the responses.
|
||||
let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size)
|
||||
.map_err(|err| tonic::Status::internal(err.to_string()))?;
|
||||
let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size)?;
|
||||
|
||||
let mut shard_requests = FuturesUnordered::new();
|
||||
for (shard_id, shard_req) in splitter.drain_requests() {
|
||||
@@ -249,14 +247,10 @@ impl PageserverClient {
|
||||
}
|
||||
|
||||
while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? {
|
||||
splitter
|
||||
.add_response(shard_id, shard_response)
|
||||
.map_err(|err| tonic::Status::internal(err.to_string()))?;
|
||||
splitter.add_response(shard_id, shard_response)?;
|
||||
}
|
||||
|
||||
splitter
|
||||
.get_response()
|
||||
.map_err(|err| tonic::Status::internal(err.to_string()))
|
||||
Ok(splitter.collect_response()?)
|
||||
}
|
||||
|
||||
/// Fetches pages on the given shard. Does not retry internally.
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
mod client;
|
||||
mod pool;
|
||||
mod retry;
|
||||
mod split;
|
||||
|
||||
pub use client::{PageserverClient, ShardSpec};
|
||||
|
||||
@@ -19,7 +19,9 @@ pub mod proto {
|
||||
}
|
||||
|
||||
mod client;
|
||||
pub use client::Client;
|
||||
mod model;
|
||||
mod split;
|
||||
|
||||
pub use client::Client;
|
||||
pub use model::*;
|
||||
pub use split::{GetPageSplitter, SplitError};
|
||||
|
||||
@@ -1,20 +1,19 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use anyhow::anyhow;
|
||||
use bytes::Bytes;
|
||||
|
||||
use crate::model::*;
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::shard::key_to_shard_number;
|
||||
use pageserver_page_api as page_api;
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardStripeSize};
|
||||
|
||||
/// Splits GetPageRequests that straddle shard boundaries and assembles the responses.
|
||||
/// TODO: add tests for this.
|
||||
pub struct GetPageSplitter {
|
||||
/// Split requests by shard index.
|
||||
requests: HashMap<ShardIndex, page_api::GetPageRequest>,
|
||||
requests: HashMap<ShardIndex, GetPageRequest>,
|
||||
/// The response being assembled. Preallocated with empty pages, to be filled in.
|
||||
response: page_api::GetPageResponse,
|
||||
response: GetPageResponse,
|
||||
/// Maps the offset in `request.block_numbers` and `response.pages` to the owning shard. Used
|
||||
/// to assemble the response pages in the same order as the original request.
|
||||
block_shards: Vec<ShardIndex>,
|
||||
@@ -24,22 +23,22 @@ impl GetPageSplitter {
|
||||
/// Checks if the given request only touches a single shard, and returns the shard ID. This is
|
||||
/// the common case, so we check first in order to avoid unnecessary allocations and overhead.
|
||||
pub fn for_single_shard(
|
||||
req: &page_api::GetPageRequest,
|
||||
req: &GetPageRequest,
|
||||
count: ShardCount,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
) -> anyhow::Result<Option<ShardIndex>> {
|
||||
) -> Result<Option<ShardIndex>, SplitError> {
|
||||
// Fast path: unsharded tenant.
|
||||
if count.is_unsharded() {
|
||||
return Ok(Some(ShardIndex::unsharded()));
|
||||
}
|
||||
|
||||
let Some(stripe_size) = stripe_size else {
|
||||
return Err(anyhow!("stripe size must be given for sharded tenants"));
|
||||
return Err("stripe size must be given for sharded tenants".into());
|
||||
};
|
||||
|
||||
// Find the first page's shard, for comparison.
|
||||
let Some(&first_page) = req.block_numbers.first() else {
|
||||
return Err(anyhow!("no block numbers in request"));
|
||||
return Err("no block numbers in request".into());
|
||||
};
|
||||
let key = rel_block_to_key(req.rel, first_page);
|
||||
let shard_number = key_to_shard_number(count, stripe_size, &key);
|
||||
@@ -57,10 +56,10 @@ impl GetPageSplitter {
|
||||
|
||||
/// Splits the given request.
|
||||
pub fn split(
|
||||
req: page_api::GetPageRequest,
|
||||
req: GetPageRequest,
|
||||
count: ShardCount,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
) -> anyhow::Result<Self> {
|
||||
) -> Result<Self, SplitError> {
|
||||
// The caller should make sure we don't split requests unnecessarily.
|
||||
debug_assert!(
|
||||
Self::for_single_shard(&req, count, stripe_size)?.is_none(),
|
||||
@@ -68,10 +67,10 @@ impl GetPageSplitter {
|
||||
);
|
||||
|
||||
if count.is_unsharded() {
|
||||
return Err(anyhow!("unsharded tenant, no point in splitting request"));
|
||||
return Err("unsharded tenant, no point in splitting request".into());
|
||||
}
|
||||
let Some(stripe_size) = stripe_size else {
|
||||
return Err(anyhow!("stripe size must be given for sharded tenants"));
|
||||
return Err("stripe size must be given for sharded tenants".into());
|
||||
};
|
||||
|
||||
// Split the requests by shard index.
|
||||
@@ -84,7 +83,7 @@ impl GetPageSplitter {
|
||||
|
||||
requests
|
||||
.entry(shard_id)
|
||||
.or_insert_with(|| page_api::GetPageRequest {
|
||||
.or_insert_with(|| GetPageRequest {
|
||||
request_id: req.request_id,
|
||||
request_class: req.request_class,
|
||||
rel: req.rel,
|
||||
@@ -98,16 +97,16 @@ impl GetPageSplitter {
|
||||
|
||||
// Construct a response to be populated by shard responses. Preallocate empty page slots
|
||||
// with the expected block numbers.
|
||||
let response = page_api::GetPageResponse {
|
||||
let response = GetPageResponse {
|
||||
request_id: req.request_id,
|
||||
status_code: page_api::GetPageStatusCode::Ok,
|
||||
status_code: GetPageStatusCode::Ok,
|
||||
reason: None,
|
||||
rel: req.rel,
|
||||
pages: req
|
||||
.block_numbers
|
||||
.into_iter()
|
||||
.map(|block_number| {
|
||||
page_api::Page {
|
||||
Page {
|
||||
block_number,
|
||||
image: Bytes::new(), // empty page slot to be filled in
|
||||
}
|
||||
@@ -123,43 +122,38 @@ impl GetPageSplitter {
|
||||
}
|
||||
|
||||
/// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations.
|
||||
pub fn drain_requests(
|
||||
&mut self,
|
||||
) -> impl Iterator<Item = (ShardIndex, page_api::GetPageRequest)> {
|
||||
pub fn drain_requests(&mut self) -> impl Iterator<Item = (ShardIndex, GetPageRequest)> {
|
||||
self.requests.drain()
|
||||
}
|
||||
|
||||
/// Adds a response from the given shard. The response must match the request ID and have an OK
|
||||
/// status code. A response must not already exist for the given shard ID.
|
||||
#[allow(clippy::result_large_err)]
|
||||
pub fn add_response(
|
||||
&mut self,
|
||||
shard_id: ShardIndex,
|
||||
response: page_api::GetPageResponse,
|
||||
) -> anyhow::Result<()> {
|
||||
response: GetPageResponse,
|
||||
) -> Result<(), SplitError> {
|
||||
// The caller should already have converted status codes into tonic::Status.
|
||||
if response.status_code != page_api::GetPageStatusCode::Ok {
|
||||
return Err(anyhow!(
|
||||
if response.status_code != GetPageStatusCode::Ok {
|
||||
return Err(SplitError(format!(
|
||||
"unexpected non-OK response for shard {shard_id}: {} {}",
|
||||
response.status_code,
|
||||
response.reason.unwrap_or_default()
|
||||
));
|
||||
)));
|
||||
}
|
||||
|
||||
if response.request_id != self.response.request_id {
|
||||
return Err(anyhow!(
|
||||
return Err(SplitError(format!(
|
||||
"response ID mismatch for shard {shard_id}: expected {}, got {}",
|
||||
self.response.request_id,
|
||||
response.request_id
|
||||
));
|
||||
self.response.request_id, response.request_id
|
||||
)));
|
||||
}
|
||||
|
||||
if response.request_id != self.response.request_id {
|
||||
return Err(anyhow!(
|
||||
return Err(SplitError(format!(
|
||||
"response ID mismatch for shard {shard_id}: expected {}, got {}",
|
||||
self.response.request_id,
|
||||
response.request_id
|
||||
));
|
||||
self.response.request_id, response.request_id
|
||||
)));
|
||||
}
|
||||
|
||||
// Place the shard response pages into the assembled response, in request order.
|
||||
@@ -171,26 +165,27 @@ impl GetPageSplitter {
|
||||
}
|
||||
|
||||
let Some(slot) = self.response.pages.get_mut(i) else {
|
||||
return Err(anyhow!("no block_shards slot {i} for shard {shard_id}"));
|
||||
return Err(SplitError(format!(
|
||||
"no block_shards slot {i} for shard {shard_id}"
|
||||
)));
|
||||
};
|
||||
let Some(page) = pages.next() else {
|
||||
return Err(anyhow!(
|
||||
return Err(SplitError(format!(
|
||||
"missing page {} in shard {shard_id} response",
|
||||
slot.block_number
|
||||
));
|
||||
)));
|
||||
};
|
||||
if page.block_number != slot.block_number {
|
||||
return Err(anyhow!(
|
||||
return Err(SplitError(format!(
|
||||
"shard {shard_id} returned wrong page at index {i}, expected {} got {}",
|
||||
slot.block_number,
|
||||
page.block_number
|
||||
));
|
||||
slot.block_number, page.block_number
|
||||
)));
|
||||
}
|
||||
if !slot.image.is_empty() {
|
||||
return Err(anyhow!(
|
||||
return Err(SplitError(format!(
|
||||
"shard {shard_id} returned duplicate page {} at index {i}",
|
||||
slot.block_number
|
||||
));
|
||||
)));
|
||||
}
|
||||
|
||||
*slot = page;
|
||||
@@ -198,32 +193,54 @@ impl GetPageSplitter {
|
||||
|
||||
// Make sure we've consumed all pages from the shard response.
|
||||
if let Some(extra_page) = pages.next() {
|
||||
return Err(anyhow!(
|
||||
return Err(SplitError(format!(
|
||||
"shard {shard_id} returned extra page: {}",
|
||||
extra_page.block_number
|
||||
));
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Fetches the final, assembled response.
|
||||
#[allow(clippy::result_large_err)]
|
||||
pub fn get_response(self) -> anyhow::Result<page_api::GetPageResponse> {
|
||||
/// Collects the final, assembled response.
|
||||
pub fn collect_response(self) -> Result<GetPageResponse, SplitError> {
|
||||
// Check that the response is complete.
|
||||
for (i, page) in self.response.pages.iter().enumerate() {
|
||||
if page.image.is_empty() {
|
||||
return Err(anyhow!(
|
||||
return Err(SplitError(format!(
|
||||
"missing page {} for shard {}",
|
||||
page.block_number,
|
||||
self.block_shards
|
||||
.get(i)
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| "?".to_string())
|
||||
));
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(self.response)
|
||||
}
|
||||
}
|
||||
|
||||
/// A GetPageSplitter error.
#[derive(Debug, thiserror::Error)]
#[error("{0}")]
pub struct SplitError(String);

impl From<&str> for SplitError {
fn from(err: &str) -> Self {
SplitError(err.to_string())
}
}

impl From<String> for SplitError {
fn from(err: String) -> Self {
SplitError(err)
}
}

impl From<SplitError> for tonic::Status {
fn from(err: SplitError) -> Self {
tonic::Status::internal(err.0)
}
}
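Because SplitError converts into tonic::Status above, call sites can propagate it with ? instead of mapping errors by hand, as the client changes earlier in this diff do; a minimal sketch (the function name is illustrative):

    // Sketch only: `?` applies From<SplitError> for tonic::Status.
    fn into_tonic(result: Result<GetPageResponse, SplitError>) -> tonic::Result<GetPageResponse> {
        Ok(result?)
    }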
|
||||
@@ -16,7 +16,8 @@ use anyhow::{Context as _, bail};
|
||||
use bytes::{Buf as _, BufMut as _, BytesMut};
|
||||
use chrono::Utc;
|
||||
use futures::future::BoxFuture;
|
||||
use futures::{FutureExt, Stream};
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::{FutureExt, Stream, StreamExt as _};
|
||||
use itertools::Itertools;
|
||||
use jsonwebtoken::TokenData;
|
||||
use once_cell::sync::OnceCell;
|
||||
@@ -35,8 +36,8 @@ use pageserver_api::pagestream_api::{
|
||||
};
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_page_api as page_api;
|
||||
use pageserver_page_api::proto;
|
||||
use pageserver_page_api::{self as page_api, GetPageSplitter};
|
||||
use postgres_backend::{
|
||||
AuthType, PostgresBackend, PostgresBackendReader, QueryError, is_expected_io_error,
|
||||
};
|
||||
@@ -466,13 +467,6 @@ impl TimelineHandles {
|
||||
self.handles
|
||||
.get(timeline_id, shard_selector, &self.wrapper)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
timeline::handle::GetError::TenantManager(e) => e,
|
||||
timeline::handle::GetError::PerTimelineStateShutDown => {
|
||||
trace!("per-timeline state shut down");
|
||||
GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn tenant_id(&self) -> Option<TenantId> {
|
||||
@@ -488,11 +482,9 @@ pub(crate) struct TenantManagerWrapper {
|
||||
tenant_id: once_cell::sync::OnceCell<TenantId>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct TenantManagerTypes;
|
||||
|
||||
impl timeline::handle::Types for TenantManagerTypes {
|
||||
type TenantManagerError = GetActiveTimelineError;
|
||||
type TenantManager = TenantManagerWrapper;
|
||||
type Timeline = TenantManagerCacheItem;
|
||||
}
|
||||
@@ -3432,18 +3424,6 @@ impl GrpcPageServiceHandler {
|
||||
Ok(CancellableTask { task, cancel })
|
||||
}
|
||||
|
||||
/// Errors if the request is executed on a non-zero shard. Only shard 0 has a complete view of
|
||||
/// relations and their sizes, as well as SLRU segments and similar data.
|
||||
#[allow(clippy::result_large_err)]
|
||||
fn ensure_shard_zero(timeline: &Handle<TenantManagerTypes>) -> Result<(), tonic::Status> {
|
||||
match timeline.get_shard_index().shard_number.0 {
|
||||
0 => Ok(()),
|
||||
shard => Err(tonic::Status::invalid_argument(format!(
|
||||
"request must execute on shard zero (is shard {shard})",
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Generates a PagestreamRequest header from a ReadLsn and request ID.
|
||||
fn make_hdr(
|
||||
read_lsn: page_api::ReadLsn,
|
||||
@@ -3458,30 +3438,72 @@ impl GrpcPageServiceHandler {
|
||||
}
|
||||
}
|
||||
|
||||
/// Acquires a timeline handle for the given request.
|
||||
/// Acquires a timeline handle for the given request. The shard index must match a local shard.
|
||||
///
|
||||
/// TODO: during shard splits, the compute may still be sending requests to the parent shard
|
||||
/// until the entire split is committed and the compute is notified. Consider installing a
|
||||
/// temporary shard router from the parent to the children while the split is in progress.
|
||||
///
|
||||
/// TODO: consider moving this to a middleware layer; all requests need it. Needs to manage
|
||||
/// the TimelineHandles lifecycle.
|
||||
///
|
||||
/// TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to avoid
|
||||
/// the unnecessary overhead.
|
||||
/// NB: this will fail during shard splits, see comment on [`Self::maybe_split_get_page`].
|
||||
async fn get_request_timeline(
|
||||
&self,
|
||||
req: &tonic::Request<impl Any>,
|
||||
) -> Result<Handle<TenantManagerTypes>, GetActiveTimelineError> {
|
||||
let ttid = *extract::<TenantTimelineId>(req);
|
||||
let TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = *extract::<TenantTimelineId>(req);
|
||||
let shard_index = *extract::<ShardIndex>(req);
|
||||
let shard_selector = ShardSelector::Known(shard_index);
|
||||
|
||||
// TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to
|
||||
// avoid the unnecessary overhead.
|
||||
TimelineHandles::new(self.tenant_manager.clone())
|
||||
.get(ttid.tenant_id, ttid.timeline_id, shard_selector)
|
||||
.get(tenant_id, timeline_id, ShardSelector::Known(shard_index))
|
||||
.await
|
||||
}
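The `extract::<T>(&req)` calls above, and throughout these handlers, read per-request routing metadata such as `TenantTimelineId` and `ShardIndex` that is presumably attached to the request by an earlier interceptor or middleware layer. A minimal sketch of that pattern over tonic request extensions; the helper names and the attachment step are assumptions for illustration, not the actual neon implementation:

```rust
use std::any::Any;

use tonic::Request;
use utils::id::TenantTimelineId;
use utils::shard::ShardIndex;

// Hypothetical: attach routing metadata before the handler runs. In neon this
// presumably happens in a middleware layer rather than in the handler itself.
fn attach_routing<T>(
    mut req: Request<T>,
    ttid: TenantTimelineId,
    shard_index: ShardIndex,
) -> Request<T> {
    // Extensions are a type-keyed map: at most one value per type per request.
    req.extensions_mut().insert(ttid);
    req.extensions_mut().insert(shard_index);
    req
}

// Hypothetical counterpart of the `extract` helper used above: it panics if the
// layer that should have attached the value did not run.
fn extract<T: Send + Sync + 'static>(req: &Request<impl Any>) -> &T {
    req.extensions()
        .get::<T>()
        .expect("routing metadata attached by an earlier layer")
}
```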
|
||||
|
||||
/// Acquires a timeline handle for the given request, which must be for shard zero. Most
|
||||
/// metadata requests are only valid on shard zero.
|
||||
///
|
||||
/// NB: during an ongoing shard split, the compute will keep talking to the parent shard until
|
||||
/// the split is committed, but the parent shard may have been removed in the meanwhile. In that
|
||||
/// case, we reroute the request to the new child shard. See [`Self::maybe_split_get_page`].
|
||||
///
|
||||
/// TODO: revamp the split protocol to avoid this child routing.
|
||||
async fn get_request_timeline_shard_zero(
|
||||
&self,
|
||||
req: &tonic::Request<impl Any>,
|
||||
) -> Result<Handle<TenantManagerTypes>, tonic::Status> {
|
||||
let TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = *extract::<TenantTimelineId>(req);
|
||||
let shard_index = *extract::<ShardIndex>(req);
|
||||
|
||||
if shard_index.shard_number.0 != 0 {
|
||||
return Err(tonic::Status::invalid_argument(format!(
|
||||
"request only valid on shard zero (requested shard {shard_index})",
|
||||
)));
|
||||
}
|
||||
|
||||
// TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to
|
||||
// avoid the unnecessary overhead.
|
||||
let mut handles = TimelineHandles::new(self.tenant_manager.clone());
|
||||
match handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Known(shard_index))
|
||||
.await
|
||||
{
|
||||
Ok(timeline) => Ok(timeline),
|
||||
Err(err) => {
|
||||
// We may be in the middle of a shard split. Try to find a child shard 0.
|
||||
if let Ok(timeline) = handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.await
|
||||
&& timeline.get_shard_index().shard_count > shard_index.shard_count
|
||||
{
|
||||
return Ok(timeline);
|
||||
}
|
||||
Err(err.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Starts a SmgrOpTimer at received_at, throttles the request, and records execution start.
|
||||
/// Only errors if the timeline is shutting down.
|
||||
///
|
||||
@@ -3511,28 +3533,22 @@ impl GrpcPageServiceHandler {
|
||||
/// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
|
||||
/// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
|
||||
/// split them up in the client or server.
|
||||
#[instrument(skip_all, fields(req_id, rel, blkno, blks, req_lsn, mod_lsn))]
|
||||
#[instrument(skip_all, fields(
|
||||
req_id = %req.request_id,
|
||||
rel = %req.rel,
|
||||
blkno = %req.block_numbers[0],
|
||||
blks = %req.block_numbers.len(),
|
||||
lsn = %req.read_lsn,
|
||||
))]
|
||||
async fn get_page(
|
||||
ctx: &RequestContext,
|
||||
timeline: &WeakHandle<TenantManagerTypes>,
|
||||
req: proto::GetPageRequest,
|
||||
timeline: Handle<TenantManagerTypes>,
|
||||
req: page_api::GetPageRequest,
|
||||
io_concurrency: IoConcurrency,
|
||||
) -> Result<proto::GetPageResponse, tonic::Status> {
|
||||
let received_at = Instant::now();
|
||||
let timeline = timeline.upgrade()?;
|
||||
received_at: Instant,
|
||||
) -> Result<page_api::GetPageResponse, tonic::Status> {
|
||||
let ctx = ctx.with_scope_page_service_pagestream(&timeline);
|
||||
|
||||
// Validate the request, decorate the span, and convert it to a Pagestream request.
|
||||
let req = page_api::GetPageRequest::try_from(req)?;
|
||||
|
||||
span_record!(
|
||||
req_id = %req.request_id,
|
||||
rel = %req.rel,
|
||||
blkno = %req.block_numbers[0],
|
||||
blks = %req.block_numbers.len(),
|
||||
lsn = %req.read_lsn,
|
||||
);
|
||||
|
||||
for &blkno in &req.block_numbers {
|
||||
let shard = timeline.get_shard_identity();
|
||||
let key = rel_block_to_key(req.rel, blkno);
|
||||
@@ -3620,7 +3636,89 @@ impl GrpcPageServiceHandler {
|
||||
};
|
||||
}
|
||||
|
||||
Ok(resp.into())
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
/// Processes a GetPage request when there is a potential shard split in progress. We have to
|
||||
/// reroute the request to any local child shards, and split batch requests that straddle
|
||||
/// multiple child shards.
|
||||
///
|
||||
/// Parent shards are split and removed incrementally (there may be many parent shards when
|
||||
/// splitting an already-sharded tenant), but the compute is only notified once the overall
|
||||
/// split commits, which can take several minutes. In the meanwhile, the compute will be sending
|
||||
/// requests to the parent shards.
|
||||
///
|
||||
/// TODO: add test infrastructure to provoke this situation frequently and for long periods of
|
||||
/// time, to properly exercise it.
|
||||
///
|
||||
/// TODO: revamp the split protocol to avoid this, e.g.:
|
||||
/// * Keep the parent shard until the split commits and the compute is notified.
|
||||
/// * Notify the compute about each subsplit.
|
||||
/// * Return an error that updates the compute's shard map.
|
||||
#[instrument(skip_all)]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn maybe_split_get_page(
|
||||
ctx: &RequestContext,
|
||||
handles: &mut TimelineHandles,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
parent: ShardIndex,
|
||||
req: page_api::GetPageRequest,
|
||||
io_concurrency: IoConcurrency,
|
||||
received_at: Instant,
|
||||
) -> Result<page_api::GetPageResponse, tonic::Status> {
|
||||
// Check the first page to see if we have any child shards at all. Otherwise, the compute is
|
||||
// just talking to the wrong Pageserver. If the parent has been split, the shard now owning
|
||||
// the page must have a higher shard count.
|
||||
let timeline = handles
|
||||
.get(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
ShardSelector::Page(rel_block_to_key(req.rel, req.block_numbers[0])),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let shard_id = timeline.get_shard_identity();
|
||||
if shard_id.count <= parent.shard_count {
|
||||
return Err(HandleUpgradeError::ShutDown.into()); // emulate original error
|
||||
}
|
||||
|
||||
// Fast path: the request fits in a single shard.
|
||||
if let Some(shard_index) =
|
||||
GetPageSplitter::for_single_shard(&req, shard_id.count, Some(shard_id.stripe_size))?
|
||||
{
|
||||
// We got the shard ID from the first page, so these must be equal.
|
||||
assert_eq!(shard_index.shard_number, shard_id.number);
|
||||
assert_eq!(shard_index.shard_count, shard_id.count);
|
||||
return Self::get_page(ctx, timeline, req, io_concurrency, received_at).await;
|
||||
}
|
||||
|
||||
// The request spans multiple shards; split it and dispatch parallel requests. All pages
|
||||
// were originally in the parent shard, and during a split all children are local, so we
|
||||
// expect to find local shards for all pages.
|
||||
let mut splitter = GetPageSplitter::split(req, shard_id.count, Some(shard_id.stripe_size))?;
|
||||
|
||||
let mut shard_requests = FuturesUnordered::new();
|
||||
for (shard_index, shard_req) in splitter.drain_requests() {
|
||||
let timeline = handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Known(shard_index))
|
||||
.await?;
|
||||
let future = Self::get_page(
|
||||
ctx,
|
||||
timeline,
|
||||
shard_req,
|
||||
io_concurrency.clone(),
|
||||
received_at,
|
||||
)
|
||||
.map(move |result| result.map(|resp| (shard_index, resp)));
|
||||
shard_requests.push(future);
|
||||
}
|
||||
|
||||
while let Some((shard_index, shard_response)) = shard_requests.next().await.transpose()? {
|
||||
splitter.add_response(shard_index, shard_response)?;
|
||||
}
|
||||
|
||||
Ok(splitter.collect_response()?)
|
||||
}
|
||||
}
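For context on the handler above, here is a self-contained sketch of why a batch that fit on one parent shard can straddle several children after a split. The placement rule is deliberately simplified (the real mapping also hashes the relation before striping), so the concrete numbers are illustrative only:

```rust
// Simplified placement: a block's stripe is blkno / stripe_size, and the stripe's
// shard is stripe % shard_count. The real code hashes the relation key as well.
fn shard_for_block(blkno: u32, stripe_size: u32, shard_count: u32) -> u32 {
    (blkno / stripe_size) % shard_count
}

fn main() {
    let stripe_size = 8;
    // Three blocks that all map to shard 1 of 2 under this simplified rule.
    let blocks = [8u32, 24, 40]; // stripes 1, 3, 5: all odd
    assert!(blocks.iter().all(|&b| shard_for_block(b, stripe_size, 2) == 1));

    // After a 2 -> 4 split the same batch straddles children 1 and 3, which is
    // exactly the case maybe_split_get_page() handles by fanning out per-shard
    // sub-requests and reassembling the responses with GetPageSplitter.
    let children: Vec<u32> = blocks
        .iter()
        .map(|&b| shard_for_block(b, stripe_size, 4))
        .collect();
    assert_eq!(children, vec![1, 3, 1]);
}
```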
|
||||
|
||||
@@ -3649,11 +3747,10 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
// to be the sweet spot where throughput is saturated.
|
||||
const CHUNK_SIZE: usize = 256 * 1024;
|
||||
|
||||
let timeline = self.get_request_timeline(&req).await?;
|
||||
let timeline = self.get_request_timeline_shard_zero(&req).await?;
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
|
||||
// Validate the request and decorate the span.
|
||||
Self::ensure_shard_zero(&timeline)?;
|
||||
if timeline.is_archived() == Some(true) {
|
||||
return Err(tonic::Status::failed_precondition("timeline is archived"));
|
||||
}
|
||||
@@ -3769,11 +3866,10 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
req: tonic::Request<proto::GetDbSizeRequest>,
|
||||
) -> Result<tonic::Response<proto::GetDbSizeResponse>, tonic::Status> {
|
||||
let received_at = extract::<ReceivedAt>(&req).0;
|
||||
let timeline = self.get_request_timeline(&req).await?;
|
||||
let timeline = self.get_request_timeline_shard_zero(&req).await?;
|
||||
let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);
|
||||
|
||||
// Validate the request, decorate the span, and convert it to a Pagestream request.
|
||||
Self::ensure_shard_zero(&timeline)?;
|
||||
let req: page_api::GetDbSizeRequest = req.into_inner().try_into()?;
|
||||
|
||||
span_record!(db_oid=%req.db_oid, lsn=%req.read_lsn);
|
||||
@@ -3802,14 +3898,29 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
req: tonic::Request<tonic::Streaming<proto::GetPageRequest>>,
|
||||
) -> Result<tonic::Response<Self::GetPagesStream>, tonic::Status> {
|
||||
// Extract the timeline from the request and check that it exists.
|
||||
let ttid = *extract::<TenantTimelineId>(&req);
|
||||
//
|
||||
// NB: during shard splits, the compute may still send requests to the parent shard. We'll
|
||||
// reroute requests to the child shards below, but we also detect the common cases here
|
||||
// where either the shard exists or no shards exist at all. If we have a child shard, we
|
||||
// can't acquire a weak handle because we don't know which child shard to use yet.
|
||||
let TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = *extract::<TenantTimelineId>(&req);
|
||||
let shard_index = *extract::<ShardIndex>(&req);
|
||||
let shard_selector = ShardSelector::Known(shard_index);
|
||||
|
||||
let mut handles = TimelineHandles::new(self.tenant_manager.clone());
|
||||
handles
|
||||
.get(ttid.tenant_id, ttid.timeline_id, shard_selector)
|
||||
.await?;
|
||||
let timeline = match handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Known(shard_index))
|
||||
.await
|
||||
{
|
||||
// The timeline shard exists. Keep a weak handle to reuse for each request.
|
||||
Ok(timeline) => Some(timeline.downgrade()),
|
||||
// The shard doesn't exist, but a child shard does. We'll reroute requests later.
|
||||
Err(_) if self.tenant_manager.has_child_shard(tenant_id, shard_index) => None,
|
||||
// Failed to fetch the timeline, and no child shard exists. Error out.
|
||||
Err(err) => return Err(err.into()),
|
||||
};
|
||||
|
||||
// Spawn an IoConcurrency sidecar, if enabled.
|
||||
let gate_guard = self
|
||||
@@ -3826,11 +3937,9 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
let mut reqs = req.into_inner();
|
||||
|
||||
let resps = async_stream::try_stream! {
|
||||
let timeline = handles
|
||||
.get(ttid.tenant_id, ttid.timeline_id, shard_selector)
|
||||
.await?
|
||||
.downgrade();
|
||||
loop {
|
||||
// Wait for the next client request.
|
||||
//
|
||||
// NB: Tonic considers the entire stream to be an in-flight request and will wait
|
||||
// for it to complete before shutting down. React to cancellation between requests.
|
||||
let req = tokio::select! {
|
||||
@@ -3843,16 +3952,44 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
Err(err) => Err(err),
|
||||
},
|
||||
}?;
|
||||
|
||||
let received_at = Instant::now();
|
||||
let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default();
|
||||
let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
|
||||
|
||||
// Process the request, using a closure to capture errors.
|
||||
let process_request = async || {
|
||||
let req = page_api::GetPageRequest::try_from(req)?;
|
||||
|
||||
// Fast path: use the pre-acquired timeline handle.
|
||||
if let Some(Ok(timeline)) = timeline.as_ref().map(|t| t.upgrade()) {
|
||||
return Self::get_page(&ctx, timeline, req, io_concurrency.clone(), received_at)
|
||||
.instrument(span.clone()) // propagate request span
|
||||
.await
|
||||
}
|
||||
|
||||
// The timeline handle is stale. During shard splits, the compute may still be
|
||||
// sending requests to the parent shard. Try to re-route requests to the child
|
||||
// shards, and split any batch requests that straddle multiple child shards.
|
||||
Self::maybe_split_get_page(
|
||||
&ctx,
|
||||
&mut handles,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_index,
|
||||
req,
|
||||
io_concurrency.clone(),
|
||||
received_at,
|
||||
)
|
||||
.instrument(span.clone()) // propagate request span
|
||||
.await;
|
||||
yield match result {
|
||||
Ok(resp) => resp,
|
||||
// Convert per-request errors to GetPageResponses as appropriate, or terminate
|
||||
// the stream with a tonic::Status. Log the error regardless, since
|
||||
// ObservabilityLayer can't automatically log stream errors.
|
||||
.await
|
||||
};
|
||||
|
||||
// Return the response. Convert per-request errors to GetPageResponses if
|
||||
// appropriate, or terminate the stream with a tonic::Status.
|
||||
yield match process_request().await {
|
||||
Ok(resp) => resp.into(),
|
||||
Err(status) => {
|
||||
// Log the error, since ObservabilityLayer won't see stream errors.
|
||||
// TODO: it would be nice if we could propagate the get_page() fields here.
|
||||
span.in_scope(|| {
|
||||
warn!("request failed with {:?}: {}", status.code(), status.message());
|
||||
@@ -3872,11 +4009,10 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
req: tonic::Request<proto::GetRelSizeRequest>,
|
||||
) -> Result<tonic::Response<proto::GetRelSizeResponse>, tonic::Status> {
|
||||
let received_at = extract::<ReceivedAt>(&req).0;
|
||||
let timeline = self.get_request_timeline(&req).await?;
|
||||
let timeline = self.get_request_timeline_shard_zero(&req).await?;
|
||||
let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);
|
||||
|
||||
// Validate the request, decorate the span, and convert it to a Pagestream request.
|
||||
Self::ensure_shard_zero(&timeline)?;
|
||||
let req: page_api::GetRelSizeRequest = req.into_inner().try_into()?;
|
||||
let allow_missing = req.allow_missing;
|
||||
|
||||
@@ -3909,11 +4045,10 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
req: tonic::Request<proto::GetSlruSegmentRequest>,
|
||||
) -> Result<tonic::Response<proto::GetSlruSegmentResponse>, tonic::Status> {
|
||||
let received_at = extract::<ReceivedAt>(&req).0;
|
||||
let timeline = self.get_request_timeline(&req).await?;
|
||||
let timeline = self.get_request_timeline_shard_zero(&req).await?;
|
||||
let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);
|
||||
|
||||
// Validate the request, decorate the span, and convert it to a Pagestream request.
|
||||
Self::ensure_shard_zero(&timeline)?;
|
||||
let req: page_api::GetSlruSegmentRequest = req.into_inner().try_into()?;
|
||||
|
||||
span_record!(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn);
|
||||
@@ -3943,6 +4078,10 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
&self,
|
||||
req: tonic::Request<proto::LeaseLsnRequest>,
|
||||
) -> Result<tonic::Response<proto::LeaseLsnResponse>, tonic::Status> {
|
||||
// TODO: this won't work during shard splits, as the request is directed at a specific shard
|
||||
// but the parent shard is removed before the split commits and the compute is notified
|
||||
// (which can take several minutes for large tenants). That's also the case for the libpq
|
||||
// implementation, so we keep the behavior for now.
|
||||
let timeline = self.get_request_timeline(&req).await?;
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
|
||||
|
||||
@@ -826,6 +826,18 @@ impl TenantManager {
|
||||
peek_slot.is_some()
|
||||
}
|
||||
|
||||
/// Returns whether a local shard exists that's a child of the given tenant shard. Note that
|
||||
/// this just checks for any shard with a larger shard count, and it may not be a direct child
|
||||
/// of the given shard (their keyspace may not overlap).
|
||||
pub(crate) fn has_child_shard(&self, tenant_id: TenantId, shard_index: ShardIndex) -> bool {
|
||||
match &*self.tenants.read().unwrap() {
|
||||
TenantsMap::Initializing => false,
|
||||
TenantsMap::Open(slots) | TenantsMap::ShuttingDown(slots) => slots
|
||||
.range(TenantShardId::tenant_range(tenant_id))
|
||||
.any(|(tsid, _)| tsid.shard_count > shard_index.shard_count),
|
||||
}
|
||||
}
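The caveat in the doc comment above can be made concrete with a small example. The helper mirrors only the count comparison, with plain integers standing in for the real shard types:

```rust
// The predicate inside has_child_shard(), reduced to integers: any local shard
// with a larger shard count passes, whether or not it actually descends from the
// shard the request was addressed to.
fn looks_like_child(local_count: u8, requested_count: u8) -> bool {
    local_count > requested_count
}

fn main() {
    // A request addressed to shard 0 of 2. Under hash-modulo placement, the
    // children of 0/2 are 0/4 and 2/4, while 3/4 descends from 1/2. A lone local
    // 3/4 (count 4) still satisfies the check against the requested count 2:
    assert!(looks_like_child(4, 2));
    // Callers therefore treat a true result as "a split probably happened here",
    // not as proof that this particular request can be served locally.
}
```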
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
|
||||
pub(crate) async fn upsert_location(
|
||||
&self,
|
||||
@@ -1522,6 +1534,13 @@ impl TenantManager {
|
||||
self.resources.deletion_queue_client.flush_advisory();
|
||||
|
||||
// Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
|
||||
//
|
||||
// TODO: keeping the parent as InProgress while spawning the children causes read
|
||||
// unavailability, as we can't acquire a new timeline handle for it (existing handles appear
|
||||
// to still work though, even downgraded ones). The parent should be available for reads
|
||||
// until the children are ready -- potentially until *all* subsplits across all parent
|
||||
// shards are complete and the compute has been notified. See:
|
||||
// <https://databricks.atlassian.net/browse/LKB-672>.
|
||||
drop(tenant);
|
||||
let mut parent_slot_guard =
|
||||
self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
|
||||
|
||||
@@ -397,6 +397,11 @@ pub struct Timeline {
|
||||
/// If true, the last compaction failed.
|
||||
compaction_failed: AtomicBool,
|
||||
|
||||
/// Begin Hadron: If true, the pageserver has likely detected data corruption in the timeline.
|
||||
/// We need to feed this information back to the Safekeeper and postgres for them to take the
|
||||
/// appropriate action.
|
||||
corruption_detected: AtomicBool,
|
||||
|
||||
/// Notifies the tenant compaction loop that there is pending L0 compaction work.
|
||||
l0_compaction_trigger: Arc<Notify>,
|
||||
|
||||
@@ -3310,6 +3315,7 @@ impl Timeline {
|
||||
|
||||
compaction_lock: tokio::sync::Mutex::default(),
|
||||
compaction_failed: AtomicBool::default(),
|
||||
corruption_detected: AtomicBool::default(),
|
||||
l0_compaction_trigger: resources.l0_compaction_trigger,
|
||||
gc_lock: tokio::sync::Mutex::default(),
|
||||
|
||||
@@ -6004,6 +6010,17 @@ impl Timeline {
|
||||
)))
|
||||
});
|
||||
|
||||
// Begin Hadron
|
||||
//
|
||||
fail_point!("create-image-layer-fail-simulated-corruption", |_| {
|
||||
self.corruption_detected
|
||||
.store(true, std::sync::atomic::Ordering::Relaxed);
|
||||
Err(CreateImageLayersError::Other(anyhow::anyhow!(
|
||||
"failpoint create-image-layer-fail-simulated-corruption"
|
||||
)))
|
||||
});
|
||||
// End Hadron
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self.conf.get_vectored_concurrent_io,
|
||||
self.gate
|
||||
@@ -7149,6 +7166,7 @@ impl Timeline {
|
||||
critical_timeline!(
|
||||
self.tenant_shard_id,
|
||||
self.timeline_id,
|
||||
Some(&self.corruption_detected),
|
||||
"walredo failure during page reconstruction: {err:?}"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1397,6 +1397,7 @@ impl Timeline {
|
||||
critical_timeline!(
|
||||
self.tenant_shard_id,
|
||||
self.timeline_id,
|
||||
Some(&self.corruption_detected),
|
||||
"missing key during compaction: {err:?}"
|
||||
);
|
||||
}
|
||||
@@ -1441,6 +1442,7 @@ impl Timeline {
|
||||
critical_timeline!(
|
||||
self.tenant_shard_id,
|
||||
self.timeline_id,
|
||||
Some(&self.corruption_detected),
|
||||
"could not compact, repartitioning keyspace failed: {e:?}"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -224,11 +224,11 @@ use tracing::{instrument, trace};
|
||||
use utils::id::TimelineId;
|
||||
use utils::shard::{ShardIndex, ShardNumber};
|
||||
|
||||
use crate::tenant::mgr::ShardSelector;
|
||||
use crate::page_service::GetActiveTimelineError;
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::mgr::{GetActiveTenantError, ShardSelector};
|
||||
|
||||
/// The requirement for Debug is so that #[derive(Debug)] works in some places.
|
||||
pub(crate) trait Types: Sized + std::fmt::Debug {
|
||||
type TenantManagerError: Sized + std::fmt::Debug;
|
||||
pub(crate) trait Types: Sized {
|
||||
type TenantManager: TenantManager<Self> + Sized;
|
||||
type Timeline: Timeline<Self> + Sized;
|
||||
}
|
||||
@@ -307,12 +307,11 @@ impl<T: Types> Default for PerTimelineState<T> {
|
||||
/// Abstract view of [`crate::tenant::mgr`], for testability.
|
||||
pub(crate) trait TenantManager<T: Types> {
|
||||
/// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
|
||||
/// Errors are returned as [`GetError::TenantManager`].
|
||||
async fn resolve(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
) -> Result<T::Timeline, T::TenantManagerError>;
|
||||
) -> Result<T::Timeline, GetActiveTimelineError>;
|
||||
}
|
||||
|
||||
/// Abstract view of an [`Arc<Timeline>`], for testability.
|
||||
@@ -322,13 +321,6 @@ pub(crate) trait Timeline<T: Types> {
|
||||
fn per_timeline_state(&self) -> &PerTimelineState<T>;
|
||||
}
|
||||
|
||||
/// Errors returned by [`Cache::get`].
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum GetError<T: Types> {
|
||||
TenantManager(T::TenantManagerError),
|
||||
PerTimelineStateShutDown,
|
||||
}
|
||||
|
||||
/// Internal type used in [`Cache::get`].
|
||||
enum RoutingResult<T: Types> {
|
||||
FastPath(Handle<T>),
|
||||
@@ -345,7 +337,7 @@ impl<T: Types> Cache<T> {
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
) -> Result<Handle<T>, GetActiveTimelineError> {
|
||||
const GET_MAX_RETRIES: usize = 10;
|
||||
const RETRY_BACKOFF: Duration = Duration::from_millis(100);
|
||||
let mut attempt = 0;
|
||||
@@ -356,7 +348,11 @@ impl<T: Types> Cache<T> {
|
||||
.await
|
||||
{
|
||||
Ok(handle) => return Ok(handle),
|
||||
Err(e) => {
|
||||
Err(
|
||||
e @ GetActiveTimelineError::Tenant(GetActiveTenantError::WaitForActiveTimeout {
|
||||
..
|
||||
}),
|
||||
) => {
|
||||
                    // Retry on the tenant activation timeout to handle tenant splits more gracefully
|
||||
if attempt < GET_MAX_RETRIES {
|
||||
tokio::time::sleep(RETRY_BACKOFF).await;
|
||||
@@ -370,6 +366,7 @@ impl<T: Types> Cache<T> {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
Err(err) => return Err(err),
|
||||
}
|
||||
}
|
||||
}
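The loop above is a bounded retry with a fixed backoff, applied only to the tenant activation timeout that can occur transiently while a shard split is in flight. A generic sketch of that shape, with illustrative names that are not part of the diff:

```rust
use std::future::Future;
use std::time::Duration;

// Retry `op` while `is_transient` says the error is worth retrying, sleeping a
// fixed backoff between attempts and giving up after `max_retries` retries.
async fn retry_transient<T, E, F, Fut>(
    mut op: F,
    is_transient: impl Fn(&E) -> bool,
    max_retries: usize,
    backoff: Duration,
) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, E>>,
{
    let mut attempt = 0;
    loop {
        match op().await {
            Ok(v) => return Ok(v),
            Err(e) if is_transient(&e) && attempt < max_retries => {
                attempt += 1;
                tokio::time::sleep(backoff).await;
            }
            Err(e) => return Err(e),
        }
    }
}
```

In the code above, the transient case is `GetActiveTimelineError::Tenant(GetActiveTenantError::WaitForActiveTimeout { .. })`, with up to 10 retries and a 100 ms backoff.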
|
||||
@@ -388,7 +385,7 @@ impl<T: Types> Cache<T> {
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
) -> Result<Handle<T>, GetActiveTimelineError> {
|
||||
        // terminates because every iteration removes an element from the map
|
||||
let miss: ShardSelector = loop {
|
||||
let routing_state = self.shard_routing(timeline_id, shard_selector);
|
||||
@@ -468,60 +465,50 @@ impl<T: Types> Cache<T> {
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
match tenant_manager.resolve(timeline_id, shard_selector).await {
|
||||
Ok(timeline) => {
|
||||
let key = timeline.shard_timeline_id();
|
||||
match &shard_selector {
|
||||
ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
|
||||
ShardSelector::Page(_) => (), // gotta trust tenant_manager
|
||||
ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
|
||||
}
|
||||
|
||||
trace!("creating new HandleInner");
|
||||
let timeline = Arc::new(timeline);
|
||||
let handle_inner_arc =
|
||||
Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline))));
|
||||
let handle_weak = WeakHandle {
|
||||
inner: Arc::downgrade(&handle_inner_arc),
|
||||
};
|
||||
let handle = handle_weak
|
||||
.upgrade()
|
||||
.ok()
|
||||
.expect("we just created it and it's not linked anywhere yet");
|
||||
{
|
||||
let mut lock_guard = timeline
|
||||
.per_timeline_state()
|
||||
.handles
|
||||
.lock()
|
||||
.expect("mutex poisoned");
|
||||
match &mut *lock_guard {
|
||||
Some(per_timeline_state) => {
|
||||
let replaced =
|
||||
per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc));
|
||||
assert!(replaced.is_none(), "some earlier code left a stale handle");
|
||||
match self.map.entry(key) {
|
||||
hash_map::Entry::Occupied(_o) => {
|
||||
                                    // This cannot happen because
|
||||
// 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
|
||||
// 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
|
||||
// while we were waiting for the tenant manager.
|
||||
unreachable!()
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(handle_weak);
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
return Err(GetError::PerTimelineStateShutDown);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(handle)
|
||||
}
|
||||
Err(e) => Err(GetError::TenantManager(e)),
|
||||
) -> Result<Handle<T>, GetActiveTimelineError> {
|
||||
let timeline = tenant_manager.resolve(timeline_id, shard_selector).await?;
|
||||
let key = timeline.shard_timeline_id();
|
||||
match &shard_selector {
|
||||
ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
|
||||
ShardSelector::Page(_) => (), // gotta trust tenant_manager
|
||||
ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
|
||||
}
|
||||
|
||||
trace!("creating new HandleInner");
|
||||
let timeline = Arc::new(timeline);
|
||||
let handle_inner_arc = Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline))));
|
||||
let handle_weak = WeakHandle {
|
||||
inner: Arc::downgrade(&handle_inner_arc),
|
||||
};
|
||||
let handle = handle_weak
|
||||
.upgrade()
|
||||
.ok()
|
||||
.expect("we just created it and it's not linked anywhere yet");
|
||||
let mut lock_guard = timeline
|
||||
.per_timeline_state()
|
||||
.handles
|
||||
.lock()
|
||||
.expect("mutex poisoned");
|
||||
let Some(per_timeline_state) = &mut *lock_guard else {
|
||||
return Err(GetActiveTimelineError::Timeline(
|
||||
GetTimelineError::ShuttingDown,
|
||||
));
|
||||
};
|
||||
let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc));
|
||||
assert!(replaced.is_none(), "some earlier code left a stale handle");
|
||||
match self.map.entry(key) {
|
||||
hash_map::Entry::Occupied(_o) => {
|
||||
                // This cannot happen because
|
||||
// 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
|
||||
// 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
|
||||
// while we were waiting for the tenant manager.
|
||||
unreachable!()
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(handle_weak);
|
||||
}
|
||||
}
|
||||
Ok(handle)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -655,7 +642,8 @@ mod tests {
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::reltag::RelTag;
|
||||
use pageserver_api::shard::DEFAULT_STRIPE_SIZE;
|
||||
use utils::shard::ShardCount;
|
||||
use utils::id::TenantId;
|
||||
use utils::shard::{ShardCount, TenantShardId};
|
||||
use utils::sync::gate::GateGuard;
|
||||
|
||||
use super::*;
|
||||
@@ -665,7 +653,6 @@ mod tests {
|
||||
#[derive(Debug)]
|
||||
struct TestTypes;
|
||||
impl Types for TestTypes {
|
||||
type TenantManagerError = anyhow::Error;
|
||||
type TenantManager = StubManager;
|
||||
type Timeline = Entered;
|
||||
}
|
||||
@@ -716,40 +703,48 @@ mod tests {
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
) -> anyhow::Result<Entered> {
|
||||
) -> Result<Entered, GetActiveTimelineError> {
|
||||
fn enter_gate(
|
||||
timeline: &StubTimeline,
|
||||
) -> Result<Arc<GateGuard>, GetActiveTimelineError> {
|
||||
Ok(Arc::new(timeline.gate.enter().map_err(|_| {
|
||||
GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
|
||||
})?))
|
||||
}
|
||||
|
||||
for timeline in &self.shards {
|
||||
if timeline.id == timeline_id {
|
||||
let enter_gate = || {
|
||||
let gate_guard = timeline.gate.enter()?;
|
||||
let gate_guard = Arc::new(gate_guard);
|
||||
anyhow::Ok(gate_guard)
|
||||
};
|
||||
match &shard_selector {
|
||||
ShardSelector::Zero if timeline.shard.is_shard_zero() => {
|
||||
return Ok(Entered {
|
||||
timeline: Arc::clone(timeline),
|
||||
gate_guard: enter_gate()?,
|
||||
gate_guard: enter_gate(timeline)?,
|
||||
});
|
||||
}
|
||||
ShardSelector::Zero => continue,
|
||||
ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
|
||||
return Ok(Entered {
|
||||
timeline: Arc::clone(timeline),
|
||||
gate_guard: enter_gate()?,
|
||||
gate_guard: enter_gate(timeline)?,
|
||||
});
|
||||
}
|
||||
ShardSelector::Page(_) => continue,
|
||||
ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
|
||||
return Ok(Entered {
|
||||
timeline: Arc::clone(timeline),
|
||||
gate_guard: enter_gate()?,
|
||||
gate_guard: enter_gate(timeline)?,
|
||||
});
|
||||
}
|
||||
ShardSelector::Known(_) => continue,
|
||||
}
|
||||
}
|
||||
}
|
||||
anyhow::bail!("not found")
|
||||
Err(GetActiveTimelineError::Timeline(
|
||||
GetTimelineError::NotFound {
|
||||
tenant_id: TenantShardId::unsharded(TenantId::from([0; 16])),
|
||||
timeline_id,
|
||||
},
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -365,6 +365,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
critical_timeline!(
|
||||
timeline.tenant_shard_id,
|
||||
timeline.timeline_id,
|
||||
Some(&timeline.corruption_detected),
|
||||
"{msg}"
|
||||
);
|
||||
return Err(WalReceiverError::Other(anyhow!(msg)));
|
||||
@@ -382,6 +383,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
critical_timeline!(
|
||||
timeline.tenant_shard_id,
|
||||
timeline.timeline_id,
|
||||
Some(&timeline.corruption_detected),
|
||||
"{msg}"
|
||||
);
|
||||
return Err(WalReceiverError::Other(anyhow!(msg)));
|
||||
@@ -455,6 +457,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
critical_timeline!(
|
||||
timeline.tenant_shard_id,
|
||||
timeline.timeline_id,
|
||||
Some(&timeline.corruption_detected),
|
||||
"{err:?}"
|
||||
);
|
||||
}
|
||||
@@ -586,6 +589,9 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
remote_consistent_lsn,
|
||||
replytime: ts,
|
||||
shard_number: timeline.tenant_shard_id.shard_number.0 as u32,
|
||||
corruption_detected: timeline
|
||||
.corruption_detected
|
||||
.load(std::sync::atomic::Ordering::Relaxed),
|
||||
};
|
||||
|
||||
debug!("neon_status_update {status_update:?}");
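The `corruption_detected` field threaded through the hunks above behaves as a sticky advisory flag: call sites that hit a likely-corruption error are handed the flag (via `critical_timeline!` or an explicit store), and the walreceiver copies it into every status update sent back to the safekeeper. A reduced sketch of that flow with stand-in types, not the actual structs:

```rust
use std::sync::atomic::{AtomicBool, Ordering};

// Stand-in for the per-timeline state added in this change.
struct TimelineFlags {
    corruption_detected: AtomicBool,
}

// Stand-in for the feedback message built in handle_walreceiver_connection().
#[derive(Debug)]
struct StatusUpdate {
    corruption_detected: bool,
}

impl TimelineFlags {
    // What passing Some(&self.corruption_detected) to critical_timeline! amounts to.
    fn report_corruption(&self) {
        // Relaxed suffices: the flag is advisory and only ever goes false -> true.
        self.corruption_detected.store(true, Ordering::Relaxed);
    }

    fn status_update(&self) -> StatusUpdate {
        StatusUpdate {
            corruption_detected: self.corruption_detected.load(Ordering::Relaxed),
        }
    }
}

fn main() {
    let flags = TimelineFlags { corruption_detected: AtomicBool::new(false) };
    assert!(!flags.status_update().corruption_detected);
    flags.report_corruption();
    assert!(flags.status_update().corruption_detected);
}
```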
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
|
||||
use std::backtrace::Backtrace;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
@@ -422,6 +423,8 @@ impl WalIngest {
|
||||
critical_timeline!(
|
||||
modification.tline.tenant_shard_id,
|
||||
modification.tline.timeline_id,
|
||||
// Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it.
|
||||
None::<&AtomicBool>,
|
||||
"clear_vm_bits for unknown VM relation {vm_rel}"
|
||||
);
|
||||
return Ok(());
|
||||
@@ -431,6 +434,8 @@ impl WalIngest {
|
||||
critical_timeline!(
|
||||
modification.tline.tenant_shard_id,
|
||||
modification.tline.timeline_id,
|
||||
// Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it.
|
||||
None::<&AtomicBool>,
|
||||
"new_vm_blk {blknum} not in {vm_rel} of size {vm_size}"
|
||||
);
|
||||
new_vm_blk = None;
|
||||
@@ -441,6 +446,8 @@ impl WalIngest {
|
||||
critical_timeline!(
|
||||
modification.tline.tenant_shard_id,
|
||||
modification.tline.timeline_id,
|
||||
// Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it.
|
||||
None::<&AtomicBool>,
|
||||
"old_vm_blk {blknum} not in {vm_rel} of size {vm_size}"
|
||||
);
|
||||
old_vm_blk = None;
|
||||
|
||||
@@ -49,6 +49,7 @@
|
||||
#include "neon.h"
|
||||
#include "neon_lwlsncache.h"
|
||||
#include "neon_perf_counters.h"
|
||||
#include "neon_utils.h"
|
||||
#include "pagestore_client.h"
|
||||
#include "communicator.h"
|
||||
|
||||
@@ -673,8 +674,19 @@ lfc_get_state(size_t max_entries)
|
||||
{
|
||||
if (GET_STATE(entry, j) != UNAVAILABLE)
|
||||
{
|
||||
BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j);
|
||||
n_pages += 1;
|
||||
/* Validate the buffer tag before including it */
|
||||
BufferTag test_tag = entry->key;
|
||||
test_tag.blockNum += j;
|
||||
|
||||
if (BufferTagIsValid(&test_tag))
|
||||
{
|
||||
BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j);
|
||||
n_pages += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
                        elog(WARNING, "LFC: Skipping invalid buffer tag during cache state capture: blockNum=%u", test_tag.blockNum);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (++i == n_entries)
|
||||
@@ -683,7 +695,7 @@ lfc_get_state(size_t max_entries)
|
||||
Assert(i == n_entries);
|
||||
fcs->n_pages = n_pages;
|
||||
Assert(pg_popcount((char*)bitmap, ((n_entries << lfc_chunk_size_log) + 7)/8) == n_pages);
|
||||
elog(LOG, "LFC: save state of %d chunks %d pages", (int)n_entries, (int)n_pages);
|
||||
elog(LOG, "LFC: save state of %d chunks %d pages (validated)", (int)n_entries, (int)n_pages);
|
||||
}
|
||||
|
||||
LWLockRelease(lfc_lock);
|
||||
@@ -702,6 +714,7 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
|
||||
size_t n_entries;
|
||||
size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size);
|
||||
size_t fcs_size;
|
||||
uint32_t max_prefetch_pages;
|
||||
dsm_segment *seg;
|
||||
BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS];
|
||||
|
||||
@@ -746,6 +759,11 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
|
||||
n_entries = Min(fcs->n_chunks, lfc_prewarm_limit);
|
||||
Assert(n_entries != 0);
|
||||
|
||||
max_prefetch_pages = n_entries << fcs_chunk_size_log;
|
||||
if (fcs->n_pages > max_prefetch_pages) {
|
||||
elog(ERROR, "LFC: Number of pages in file cache state (%d) is more than the limit (%d)", fcs->n_pages, max_prefetch_pages);
|
||||
}
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
|
||||
/* Do not prewarm more entries than LFC limit */
|
||||
@@ -898,6 +916,11 @@ lfc_prewarm_main(Datum main_arg)
|
||||
{
|
||||
tag = fcs->chunks[snd_idx >> fcs_chunk_size_log];
|
||||
tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1);
|
||||
|
||||
if (!BufferTagIsValid(&tag)) {
|
||||
elog(ERROR, "LFC: Invalid buffer tag: %u", tag.blockNum);
|
||||
}
|
||||
|
||||
if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum))
|
||||
{
|
||||
(void)communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
|
||||
@@ -1832,125 +1855,46 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
LWLockRelease(lfc_lock);
|
||||
}
|
||||
|
||||
typedef struct
|
||||
/*
|
||||
* Return metrics about the LFC.
|
||||
*
|
||||
* The return format is a palloc'd array of LfcStatsEntrys. The size
|
||||
* of the returned array is returned in *num_entries.
|
||||
*/
|
||||
LfcStatsEntry *
|
||||
lfc_get_stats(size_t *num_entries)
|
||||
{
|
||||
TupleDesc tupdesc;
|
||||
} NeonGetStatsCtx;
|
||||
LfcStatsEntry *entries;
|
||||
size_t n = 0;
|
||||
|
||||
#define NUM_NEON_GET_STATS_COLS 2
|
||||
#define MAX_ENTRIES 10
|
||||
entries = palloc(sizeof(LfcStatsEntry) * MAX_ENTRIES);
|
||||
|
||||
PG_FUNCTION_INFO_V1(neon_get_lfc_stats);
|
||||
Datum
|
||||
neon_get_lfc_stats(PG_FUNCTION_ARGS)
|
||||
{
|
||||
FuncCallContext *funcctx;
|
||||
NeonGetStatsCtx *fctx;
|
||||
MemoryContext oldcontext;
|
||||
TupleDesc tupledesc;
|
||||
Datum result;
|
||||
HeapTuple tuple;
|
||||
char const *key;
|
||||
uint64 value = 0;
|
||||
Datum values[NUM_NEON_GET_STATS_COLS];
|
||||
bool nulls[NUM_NEON_GET_STATS_COLS];
|
||||
entries[n++] = (LfcStatsEntry) {"file_cache_chunk_size_pages", lfc_ctl == NULL,
|
||||
lfc_ctl ? lfc_blocks_per_chunk : 0 };
|
||||
entries[n++] = (LfcStatsEntry) {"file_cache_misses", lfc_ctl == NULL,
|
||||
lfc_ctl ? lfc_ctl->misses : 0};
|
||||
entries[n++] = (LfcStatsEntry) {"file_cache_hits", lfc_ctl == NULL,
|
||||
lfc_ctl ? lfc_ctl->hits : 0 };
|
||||
entries[n++] = (LfcStatsEntry) {"file_cache_used", lfc_ctl == NULL,
|
||||
lfc_ctl ? lfc_ctl->used : 0 };
|
||||
entries[n++] = (LfcStatsEntry) {"file_cache_writes", lfc_ctl == NULL,
|
||||
lfc_ctl ? lfc_ctl->writes : 0 };
|
||||
entries[n++] = (LfcStatsEntry) {"file_cache_size", lfc_ctl == NULL,
|
||||
lfc_ctl ? lfc_ctl->size : 0 };
|
||||
entries[n++] = (LfcStatsEntry) {"file_cache_used_pages", lfc_ctl == NULL,
|
||||
lfc_ctl ? lfc_ctl->used_pages : 0 };
|
||||
entries[n++] = (LfcStatsEntry) {"file_cache_evicted_pages", lfc_ctl == NULL,
|
||||
lfc_ctl ? lfc_ctl->evicted_pages : 0 };
|
||||
entries[n++] = (LfcStatsEntry) {"file_cache_limit", lfc_ctl == NULL,
|
||||
lfc_ctl ? lfc_ctl->limit : 0 };
|
||||
entries[n++] = (LfcStatsEntry) {"file_cache_chunks_pinned", lfc_ctl == NULL,
|
||||
lfc_ctl ? lfc_ctl->pinned : 0 };
|
||||
Assert(n <= MAX_ENTRIES);
|
||||
#undef MAX_ENTRIES
|
||||
|
||||
if (SRF_IS_FIRSTCALL())
|
||||
{
|
||||
funcctx = SRF_FIRSTCALL_INIT();
|
||||
|
||||
/* Switch context when allocating stuff to be used in later calls */
|
||||
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
|
||||
|
||||
/* Create a user function context for cross-call persistence */
|
||||
fctx = (NeonGetStatsCtx *) palloc(sizeof(NeonGetStatsCtx));
|
||||
|
||||
/* Construct a tuple descriptor for the result rows. */
|
||||
tupledesc = CreateTemplateTupleDesc(NUM_NEON_GET_STATS_COLS);
|
||||
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 1, "lfc_key",
|
||||
TEXTOID, -1, 0);
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 2, "lfc_value",
|
||||
INT8OID, -1, 0);
|
||||
|
||||
fctx->tupdesc = BlessTupleDesc(tupledesc);
|
||||
funcctx->user_fctx = fctx;
|
||||
|
||||
/* Return to original context when allocating transient memory */
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
}
|
||||
|
||||
funcctx = SRF_PERCALL_SETUP();
|
||||
|
||||
/* Get the saved state */
|
||||
fctx = (NeonGetStatsCtx *) funcctx->user_fctx;
|
||||
|
||||
switch (funcctx->call_cntr)
|
||||
{
|
||||
case 0:
|
||||
key = "file_cache_misses";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->misses;
|
||||
break;
|
||||
case 1:
|
||||
key = "file_cache_hits";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->hits;
|
||||
break;
|
||||
case 2:
|
||||
key = "file_cache_used";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->used;
|
||||
break;
|
||||
case 3:
|
||||
key = "file_cache_writes";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->writes;
|
||||
break;
|
||||
case 4:
|
||||
key = "file_cache_size";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->size;
|
||||
break;
|
||||
case 5:
|
||||
key = "file_cache_used_pages";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->used_pages;
|
||||
break;
|
||||
case 6:
|
||||
key = "file_cache_evicted_pages";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->evicted_pages;
|
||||
break;
|
||||
case 7:
|
||||
key = "file_cache_limit";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->limit;
|
||||
break;
|
||||
case 8:
|
||||
key = "file_cache_chunk_size_pages";
|
||||
value = lfc_blocks_per_chunk;
|
||||
break;
|
||||
case 9:
|
||||
key = "file_cache_chunks_pinned";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->pinned;
|
||||
break;
|
||||
default:
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
values[0] = PointerGetDatum(cstring_to_text(key));
|
||||
nulls[0] = false;
|
||||
if (lfc_ctl)
|
||||
{
|
||||
nulls[1] = false;
|
||||
values[1] = Int64GetDatum(value);
|
||||
}
|
||||
else
|
||||
nulls[1] = true;
|
||||
|
||||
tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
|
||||
result = HeapTupleGetDatum(tuple);
|
||||
SRF_RETURN_NEXT(funcctx, result);
|
||||
*num_entries = n;
|
||||
return entries;
|
||||
}
|
||||
|
||||
|
||||
@@ -1958,193 +1902,86 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
|
||||
* Function returning data from the local file cache
|
||||
* relation node/tablespace/database/blocknum and access_counter
|
||||
*/
|
||||
PG_FUNCTION_INFO_V1(local_cache_pages);
|
||||
|
||||
/*
|
||||
* Record structure holding the to be exposed cache data.
|
||||
*/
|
||||
typedef struct
|
||||
LocalCachePagesRec *
|
||||
lfc_local_cache_pages(size_t *num_entries)
|
||||
{
|
||||
uint32 pageoffs;
|
||||
Oid relfilenode;
|
||||
Oid reltablespace;
|
||||
Oid reldatabase;
|
||||
ForkNumber forknum;
|
||||
BlockNumber blocknum;
|
||||
uint16 accesscount;
|
||||
} LocalCachePagesRec;
|
||||
HASH_SEQ_STATUS status;
|
||||
FileCacheEntry *entry;
|
||||
size_t n_pages;
|
||||
size_t n;
|
||||
LocalCachePagesRec *result;
|
||||
|
||||
/*
|
||||
* Function context for data persisting over repeated calls.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
TupleDesc tupdesc;
|
||||
LocalCachePagesRec *record;
|
||||
} LocalCachePagesContext;
|
||||
|
||||
|
||||
#define NUM_LOCALCACHE_PAGES_ELEM 7
|
||||
|
||||
Datum
|
||||
local_cache_pages(PG_FUNCTION_ARGS)
|
||||
{
|
||||
FuncCallContext *funcctx;
|
||||
Datum result;
|
||||
MemoryContext oldcontext;
|
||||
LocalCachePagesContext *fctx; /* User function context. */
|
||||
TupleDesc tupledesc;
|
||||
TupleDesc expected_tupledesc;
|
||||
HeapTuple tuple;
|
||||
|
||||
if (SRF_IS_FIRSTCALL())
|
||||
if (!lfc_ctl)
|
||||
{
|
||||
HASH_SEQ_STATUS status;
|
||||
FileCacheEntry *entry;
|
||||
uint32 n_pages = 0;
|
||||
*num_entries = 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
funcctx = SRF_FIRSTCALL_INIT();
|
||||
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||
if (!LFC_ENABLED())
|
||||
{
|
||||
LWLockRelease(lfc_lock);
|
||||
*num_entries = 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Switch context when allocating stuff to be used in later calls */
|
||||
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
|
||||
|
||||
/* Create a user function context for cross-call persistence */
|
||||
fctx = (LocalCachePagesContext *) palloc(sizeof(LocalCachePagesContext));
|
||||
|
||||
/*
|
||||
* To smoothly support upgrades from version 1.0 of this extension
|
||||
* transparently handle the (non-)existence of the pinning_backends
|
||||
* column. We unfortunately have to get the result type for that... -
|
||||
* we can't use the result type determined by the function definition
|
||||
* without potentially crashing when somebody uses the old (or even
|
||||
* wrong) function definition though.
|
||||
*/
|
||||
if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
|
||||
neon_log(ERROR, "return type must be a row type");
|
||||
|
||||
if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM)
|
||||
neon_log(ERROR, "incorrect number of output arguments");
|
||||
|
||||
/* Construct a tuple descriptor for the result rows. */
|
||||
tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 1, "pageoffs",
|
||||
INT8OID, -1, 0);
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
|
||||
OIDOID, -1, 0);
|
||||
#else
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenumber",
|
||||
OIDOID, -1, 0);
|
||||
#endif
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
|
||||
OIDOID, -1, 0);
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
|
||||
OIDOID, -1, 0);
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
|
||||
INT2OID, -1, 0);
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
|
||||
INT8OID, -1, 0);
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 7, "accesscount",
|
||||
INT4OID, -1, 0);
|
||||
|
||||
fctx->tupdesc = BlessTupleDesc(tupledesc);
|
||||
|
||||
if (lfc_ctl)
|
||||
/* Count the pages first */
|
||||
n_pages = 0;
|
||||
hash_seq_init(&status, lfc_hash);
|
||||
while ((entry = hash_seq_search(&status)) != NULL)
|
||||
{
|
||||
/* Skip hole tags */
|
||||
if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
|
||||
{
|
||||
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||
for (int i = 0; i < lfc_blocks_per_chunk; i++)
|
||||
n_pages += GET_STATE(entry, i) == AVAILABLE;
|
||||
}
|
||||
}
|
||||
|
||||
if (LFC_ENABLED())
|
||||
if (n_pages == 0)
|
||||
{
|
||||
LWLockRelease(lfc_lock);
|
||||
*num_entries = 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
result = (LocalCachePagesRec *)
|
||||
MemoryContextAllocHuge(CurrentMemoryContext,
|
||||
sizeof(LocalCachePagesRec) * n_pages);
|
||||
|
||||
/*
|
||||
* Scan through all the cache entries, saving the relevant fields
|
||||
* in the result structure.
|
||||
*/
|
||||
n = 0;
|
||||
hash_seq_init(&status, lfc_hash);
|
||||
while ((entry = hash_seq_search(&status)) != NULL)
|
||||
{
|
||||
for (int i = 0; i < lfc_blocks_per_chunk; i++)
|
||||
{
|
||||
if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
|
||||
{
|
||||
hash_seq_init(&status, lfc_hash);
|
||||
while ((entry = hash_seq_search(&status)) != NULL)
|
||||
if (GET_STATE(entry, i) == AVAILABLE)
|
||||
{
|
||||
/* Skip hole tags */
|
||||
if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
|
||||
{
|
||||
for (int i = 0; i < lfc_blocks_per_chunk; i++)
|
||||
n_pages += GET_STATE(entry, i) == AVAILABLE;
|
||||
}
|
||||
result[n].pageoffs = entry->offset * lfc_blocks_per_chunk + i;
|
||||
result[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
|
||||
result[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
|
||||
result[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
|
||||
result[n].forknum = entry->key.forkNum;
|
||||
result[n].blocknum = entry->key.blockNum + i;
|
||||
result[n].accesscount = entry->access_count;
|
||||
n += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
fctx->record = (LocalCachePagesRec *)
|
||||
MemoryContextAllocHuge(CurrentMemoryContext,
|
||||
sizeof(LocalCachePagesRec) * n_pages);
|
||||
|
||||
/* Set max calls and remember the user function context. */
|
||||
funcctx->max_calls = n_pages;
|
||||
funcctx->user_fctx = fctx;
|
||||
|
||||
/* Return to original context when allocating transient memory */
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
|
||||
if (n_pages != 0)
|
||||
{
|
||||
/*
|
||||
* Scan through all the cache entries, saving the relevant fields
|
||||
* in the fctx->record structure.
|
||||
*/
|
||||
uint32 n = 0;
|
||||
|
||||
hash_seq_init(&status, lfc_hash);
|
||||
while ((entry = hash_seq_search(&status)) != NULL)
|
||||
{
|
||||
for (int i = 0; i < lfc_blocks_per_chunk; i++)
|
||||
{
|
||||
if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
|
||||
{
|
||||
if (GET_STATE(entry, i) == AVAILABLE)
|
||||
{
|
||||
fctx->record[n].pageoffs = entry->offset * lfc_blocks_per_chunk + i;
|
||||
fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
|
||||
fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
|
||||
fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
|
||||
fctx->record[n].forknum = entry->key.forkNum;
|
||||
fctx->record[n].blocknum = entry->key.blockNum + i;
|
||||
fctx->record[n].accesscount = entry->access_count;
|
||||
n += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Assert(n_pages == n);
|
||||
}
|
||||
if (lfc_ctl)
|
||||
LWLockRelease(lfc_lock);
|
||||
}
|
||||
Assert(n_pages == n);
|
||||
LWLockRelease(lfc_lock);
|
||||
|
||||
funcctx = SRF_PERCALL_SETUP();
|
||||
|
||||
/* Get the saved state */
|
||||
fctx = funcctx->user_fctx;
|
||||
|
||||
if (funcctx->call_cntr < funcctx->max_calls)
|
||||
{
|
||||
uint32 i = funcctx->call_cntr;
|
||||
Datum values[NUM_LOCALCACHE_PAGES_ELEM];
|
||||
bool nulls[NUM_LOCALCACHE_PAGES_ELEM] = {
|
||||
false, false, false, false, false, false, false
|
||||
};
|
||||
|
||||
values[0] = Int64GetDatum((int64) fctx->record[i].pageoffs);
|
||||
values[1] = ObjectIdGetDatum(fctx->record[i].relfilenode);
|
||||
values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
|
||||
values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
|
||||
values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
|
||||
values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
|
||||
values[6] = Int32GetDatum(fctx->record[i].accesscount);
|
||||
|
||||
/* Build and return the tuple. */
|
||||
tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
|
||||
result = HeapTupleGetDatum(tuple);
|
||||
|
||||
SRF_RETURN_NEXT(funcctx, result);
|
||||
}
|
||||
else
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
*num_entries = n_pages;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Internal implementation of the approximate_working_set_size_seconds()
|
||||
* function.
|
||||
|
||||
@@ -47,6 +47,26 @@ extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blk
|
||||
extern FileCacheState* lfc_get_state(size_t max_entries);
|
||||
extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers);
|
||||
|
||||
typedef struct LfcStatsEntry
|
||||
{
|
||||
const char *metric_name;
|
||||
bool isnull;
|
||||
uint64 value;
|
||||
} LfcStatsEntry;
|
||||
extern LfcStatsEntry *lfc_get_stats(size_t *num_entries);
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32 pageoffs;
|
||||
Oid relfilenode;
|
||||
Oid reltablespace;
|
||||
Oid reldatabase;
|
||||
ForkNumber forknum;
|
||||
BlockNumber blocknum;
|
||||
uint16 accesscount;
|
||||
} LocalCachePagesRec;
|
||||
extern LocalCachePagesRec *lfc_local_cache_pages(size_t *num_entries);
|
||||
|
||||
extern int32 lfc_approximate_working_set_size_seconds(time_t duration, bool reset);
|
||||
|
||||
|
||||
|
||||
@@ -71,7 +71,7 @@ char *neon_project_id;
|
||||
char *neon_branch_id;
|
||||
char *neon_endpoint_id;
|
||||
int32 max_cluster_size;
|
||||
char *page_server_connstring;
|
||||
char *pageserver_connstring;
|
||||
char *neon_auth_token;
|
||||
|
||||
int readahead_buffer_size = 128;
|
||||
@@ -1440,7 +1440,6 @@ check_neon_id(char **newval, void **extra, GucSource source)
|
||||
return **newval == '\0' || HexDecodeString(id, *newval, 16);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
PagestoreShmemInit(void)
|
||||
{
|
||||
@@ -1454,7 +1453,7 @@ PagestoreShmemInit(void)
|
||||
pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0);
|
||||
pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
|
||||
memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
|
||||
AssignPageserverConnstring(page_server_connstring, NULL);
|
||||
AssignPageserverConnstring(pageserver_connstring, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1473,7 +1472,7 @@ pg_init_libpagestore(void)
|
||||
DefineCustomStringVariable("neon.pageserver_connstring",
|
||||
"connection string to the page server",
|
||||
NULL,
|
||||
&page_server_connstring,
|
||||
&pageserver_connstring,
|
||||
"",
|
||||
PGC_SIGHUP,
|
||||
0, /* no flags required */
|
||||
@@ -1644,7 +1643,7 @@ pg_init_libpagestore(void)
|
||||
if (neon_auth_token)
|
||||
neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");
|
||||
|
||||
if (page_server_connstring && page_server_connstring[0])
|
||||
if (pageserver_connstring[0])
|
||||
{
|
||||
neon_log(PageStoreTrace, "set neon_smgr hook");
|
||||
smgr_hook = smgr_neon;
|
||||
|
||||
pgxn/neon/neon.c
@@ -51,6 +51,7 @@ void _PG_init(void);
|
||||
bool lakebase_mode = false;
|
||||
|
||||
static int running_xacts_overflow_policy;
|
||||
static emit_log_hook_type prev_emit_log_hook;
|
||||
static bool monitor_query_exec_time = false;
|
||||
|
||||
static ExecutorStart_hook_type prev_ExecutorStart = NULL;
|
||||
@@ -81,6 +82,8 @@ uint32 WAIT_EVENT_NEON_PS_READ;
|
||||
uint32 WAIT_EVENT_NEON_WAL_DL;
|
||||
#endif
|
||||
|
||||
int databricks_test_hook = 0;
|
||||
|
||||
enum RunningXactsOverflowPolicies {
|
||||
OP_IGNORE,
|
||||
OP_SKIP,
|
||||
@@ -445,6 +448,20 @@ ReportSearchPath(void)
|
||||
static int neon_pgstat_file_size_limit;
|
||||
#endif
|
||||
|
||||
static void DatabricksSqlErrorHookImpl(ErrorData *edata) {
|
||||
if (prev_emit_log_hook != NULL) {
|
||||
prev_emit_log_hook(edata);
|
||||
}
|
||||
|
||||
if (edata->sqlerrcode == ERRCODE_DATA_CORRUPTED) {
|
||||
pg_atomic_fetch_add_u32(&databricks_metrics_shared->data_corruption_count, 1);
|
||||
} else if (edata->sqlerrcode == ERRCODE_INDEX_CORRUPTED) {
|
||||
pg_atomic_fetch_add_u32(&databricks_metrics_shared->index_corruption_count, 1);
|
||||
} else if (edata->sqlerrcode == ERRCODE_INTERNAL_ERROR) {
|
||||
pg_atomic_fetch_add_u32(&databricks_metrics_shared->internal_error_count, 1);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
_PG_init(void)
|
||||
{
|
||||
@@ -456,6 +473,11 @@ _PG_init(void)
|
||||
load_file("$libdir/neon_rmgr", false);
|
||||
#endif
|
||||
|
||||
if (lakebase_mode) {
|
||||
prev_emit_log_hook = emit_log_hook;
|
||||
emit_log_hook = DatabricksSqlErrorHookImpl;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initializing a pre-loaded Postgres extension happens in three stages:
|
||||
*
|
||||
@@ -594,6 +616,19 @@ _PG_init(void)
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
// A test hook used in sql regress to trigger specific behaviors
|
||||
// to test features easily.
|
||||
DefineCustomIntVariable(
|
||||
"databricks.test_hook",
|
||||
"The test hook used in sql regress tests only",
|
||||
NULL,
|
||||
&databricks_test_hook,
|
||||
0,
|
||||
0, INT32_MAX,
|
||||
PGC_SUSET,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
/*
|
||||
* Important: This must happen after other parts of the extension are
|
||||
* loaded, otherwise any settings to GUCs that were set before the
|
||||
@@ -625,11 +660,15 @@ _PG_init(void)
|
||||
ExecutorEnd_hook = neon_ExecutorEnd;
|
||||
}
|
||||
|
||||
/* Various functions exposed at SQL level */
|
||||
|
||||
PG_FUNCTION_INFO_V1(pg_cluster_size);
|
||||
PG_FUNCTION_INFO_V1(backpressure_lsns);
|
||||
PG_FUNCTION_INFO_V1(backpressure_throttling_time);
|
||||
PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
|
||||
PG_FUNCTION_INFO_V1(approximate_working_set_size);
|
||||
PG_FUNCTION_INFO_V1(neon_get_lfc_stats);
|
||||
PG_FUNCTION_INFO_V1(local_cache_pages);
|
||||
|
||||
Datum
|
||||
pg_cluster_size(PG_FUNCTION_ARGS)
|
||||
@@ -704,6 +743,76 @@ approximate_working_set_size(PG_FUNCTION_ARGS)
|
||||
PG_RETURN_INT32(dc);
|
||||
}
|
||||
|
||||
Datum
|
||||
neon_get_lfc_stats(PG_FUNCTION_ARGS)
|
||||
{
|
||||
#define NUM_NEON_GET_STATS_COLS 2
|
||||
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
|
||||
LfcStatsEntry *entries;
|
||||
size_t num_entries;
|
||||
|
||||
InitMaterializedSRF(fcinfo, 0);
|
||||
|
||||
/* lfc_get_stats() does all the heavy lifting */
|
||||
entries = lfc_get_stats(&num_entries);
|
||||
|
||||
/* Convert the LfcStatsEntrys to a result set */
|
||||
for (size_t i = 0; i < num_entries; i++)
|
||||
{
|
||||
LfcStatsEntry *entry = &entries[i];
|
||||
Datum values[NUM_NEON_GET_STATS_COLS];
|
||||
bool nulls[NUM_NEON_GET_STATS_COLS];
|
||||
|
||||
values[0] = CStringGetTextDatum(entry->metric_name);
|
||||
nulls[0] = false;
|
||||
values[1] = Int64GetDatum(entry->isnull ? 0 : entry->value);
|
||||
nulls[1] = entry->isnull;
|
||||
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
|
||||
}
|
||||
PG_RETURN_VOID();
|
||||
|
||||
#undef NUM_NEON_GET_STATS_COLS
|
||||
}
|
||||
|
||||
Datum
|
||||
local_cache_pages(PG_FUNCTION_ARGS)
|
||||
{
|
||||
#define NUM_LOCALCACHE_PAGES_COLS 7
|
||||
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
|
||||
LocalCachePagesRec *entries;
|
||||
size_t num_entries;
|
||||
|
||||
InitMaterializedSRF(fcinfo, 0);
|
||||
|
||||
/* lfc_local_cache_pages() does all the heavy lifting */
|
||||
entries = lfc_local_cache_pages(&num_entries);
|
||||
|
||||
/* Convert the LocalCachePagesRec structs to a result set */
|
||||
for (size_t i = 0; i < num_entries; i++)
|
||||
{
|
||||
LocalCachePagesRec *entry = &entries[i];
|
||||
Datum values[NUM_LOCALCACHE_PAGES_COLS];
|
||||
bool nulls[NUM_LOCALCACHE_PAGES_COLS] = {
|
||||
false, false, false, false, false, false, false
|
||||
};
|
||||
|
||||
values[0] = Int64GetDatum((int64) entry->pageoffs);
|
||||
values[1] = ObjectIdGetDatum(entry->relfilenode);
|
||||
values[2] = ObjectIdGetDatum(entry->reltablespace);
|
||||
values[3] = ObjectIdGetDatum(entry->reldatabase);
|
||||
values[4] = ObjectIdGetDatum(entry->forknum);
|
||||
values[5] = Int64GetDatum((int64) entry->blocknum);
|
||||
values[6] = Int32GetDatum(entry->accesscount);
|
||||
|
||||
/* Build and return the tuple. */
|
||||
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
|
||||
}
|
||||
|
||||
PG_RETURN_VOID();
|
||||
|
||||
#undef NUM_LOCALCACHE_PAGES_COLS
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialization stage 2: make requests for the amount of shared memory we
|
||||
* will need.
|
||||
@@ -742,6 +851,9 @@ neon_shmem_startup_hook(void)
|
||||
|
||||
LfcShmemInit();
|
||||
NeonPerfCountersShmemInit();
|
||||
if (lakebase_mode) {
|
||||
DatabricksMetricsShmemInit();
|
||||
}
|
||||
PagestoreShmemInit();
|
||||
RelsizeCacheShmemInit();
|
||||
WalproposerShmemInit();
|
||||
|
||||
@@ -19,7 +19,36 @@
|
||||
|
||||
#include "neon.h"
|
||||
#include "neon_perf_counters.h"
|
||||
#include "neon_pgversioncompat.h"
|
||||
#include "walproposer.h"
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
databricks_metrics *databricks_metrics_shared;
|
||||
|
||||
Size
|
||||
DatabricksMetricsShmemSize(void)
|
||||
{
|
||||
return sizeof(databricks_metrics);
|
||||
}
|
||||
|
||||
void
|
||||
DatabricksMetricsShmemInit(void)
|
||||
{
|
||||
bool found;
|
||||
|
||||
databricks_metrics_shared =
|
||||
ShmemInitStruct("Databricks counters",
|
||||
DatabricksMetricsShmemSize(),
|
||||
&found);
|
||||
Assert(found == IsUnderPostmaster);
|
||||
if (!found)
|
||||
{
|
||||
pg_atomic_init_u32(&databricks_metrics_shared->index_corruption_count, 0);
|
||||
pg_atomic_init_u32(&databricks_metrics_shared->data_corruption_count, 0);
|
||||
pg_atomic_init_u32(&databricks_metrics_shared->internal_error_count, 0);
|
||||
pg_atomic_init_u32(&databricks_metrics_shared->ps_corruption_detected, 0);
|
||||
}
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
neon_per_backend_counters *neon_per_backend_counters_shared;
|
||||
|
||||
@@ -38,11 +67,12 @@ NeonPerfCountersShmemRequest(void)
|
||||
#else
|
||||
size = mul_size(NUM_NEON_PERF_COUNTER_SLOTS, sizeof(neon_per_backend_counters));
|
||||
#endif
|
||||
if (lakebase_mode) {
|
||||
size = add_size(size, DatabricksMetricsShmemSize());
|
||||
}
|
||||
RequestAddinShmemSpace(size);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void
|
||||
NeonPerfCountersShmemInit(void)
|
||||
{
|
||||
@@ -395,6 +425,34 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
|
||||
metric_to_datums(&metrics[i], &values[0], &nulls[0]);
|
||||
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
|
||||
}
|
||||
|
||||
if (lakebase_mode) {
|
||||
|
||||
if (databricks_test_hook == TestHookCorruption) {
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("test corruption")));
|
||||
}
|
||||
|
||||
// Not ideal but piggyback our databricks counters into the neon perf counters view
|
||||
// so that we don't need to introduce neon--1.x+1.sql to add a new view.
|
||||
{
|
||||
metric_t databricks_metrics[] = {
|
||||
{"sql_index_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->index_corruption_count)},
|
||||
{"sql_data_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->data_corruption_count)},
|
||||
{"sql_internal_error_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->internal_error_count)},
|
||||
{"ps_corruption_detected", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->ps_corruption_detected)},
|
||||
{NULL, false, 0, 0},
|
||||
};
|
||||
for (int i = 0; databricks_metrics[i].name != NULL; i++)
|
||||
{
|
||||
metric_to_datums(&databricks_metrics[i], &values[0], &nulls[0]);
|
||||
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
|
||||
}
|
||||
}
|
||||
/* END_HADRON */
|
||||
}
|
||||
|
||||
pfree(metrics);
|
||||
|
||||
return (Datum) 0;
|
||||
|
||||
@@ -177,5 +177,24 @@ extern void inc_query_time(uint64 elapsed);
|
||||
extern Size NeonPerfCountersShmemSize(void);
|
||||
extern void NeonPerfCountersShmemInit(void);
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
typedef struct
|
||||
{
|
||||
pg_atomic_uint32 index_corruption_count;
|
||||
pg_atomic_uint32 data_corruption_count;
|
||||
pg_atomic_uint32 internal_error_count;
|
||||
pg_atomic_uint32 ps_corruption_detected;
|
||||
} databricks_metrics;
|
||||
|
||||
extern databricks_metrics *databricks_metrics_shared;
|
||||
|
||||
extern Size DatabricksMetricsShmemSize(void);
|
||||
extern void DatabricksMetricsShmemInit(void);
|
||||
|
||||
extern int databricks_test_hook;
|
||||
|
||||
static const int TestHookCorruption = 1;
|
||||
/* END_HADRON */
|
||||
|
||||
|
||||
#endif /* NEON_PERF_COUNTERS_H */
|
||||
|
||||
@@ -183,3 +183,22 @@ alloc_curl_handle(void)
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Check if a BufferTag is valid by verifying all its fields are not invalid.
|
||||
*/
|
||||
bool
|
||||
BufferTagIsValid(const BufferTag *tag)
|
||||
{
|
||||
#if PG_MAJORVERSION_NUM >= 16
|
||||
return (tag->spcOid != InvalidOid) &&
|
||||
(tag->relNumber != InvalidRelFileNumber) &&
|
||||
(tag->forkNum != InvalidForkNumber) &&
|
||||
(tag->blockNum != InvalidBlockNumber);
|
||||
#else
|
||||
return (tag->rnode.spcNode != InvalidOid) &&
|
||||
(tag->rnode.relNode != InvalidOid) &&
|
||||
(tag->forkNum != InvalidForkNumber) &&
|
||||
(tag->blockNum != InvalidBlockNumber);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#define __NEON_UTILS_H__
|
||||
|
||||
#include "lib/stringinfo.h"
|
||||
#include "storage/buf_internals.h"
|
||||
|
||||
#ifndef WALPROPOSER_LIB
|
||||
#include <curl/curl.h>
|
||||
@@ -16,6 +17,9 @@ void pq_sendint32_le(StringInfo buf, uint32 i);
|
||||
void pq_sendint64_le(StringInfo buf, uint64 i);
|
||||
void disable_core_dump(void);
|
||||
|
||||
/* Buffer tag validation function */
|
||||
bool BufferTagIsValid(const BufferTag *tag);
|
||||
|
||||
#ifndef WALPROPOSER_LIB
|
||||
|
||||
CURL * alloc_curl_handle(void);
|
||||
|
||||
@@ -236,7 +236,7 @@ extern void prefetch_on_ps_disconnect(void);
|
||||
|
||||
extern page_server_api *page_server;
|
||||
|
||||
extern char *page_server_connstring;
|
||||
extern char *pageserver_connstring;
|
||||
extern int flush_every_n_requests;
|
||||
extern int readahead_buffer_size;
|
||||
extern char *neon_timeline;
|
||||
|
||||
@@ -1887,6 +1887,12 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
|
||||
ps_feedback->shard_number = pq_getmsgint(reply_message, sizeof(uint32));
|
||||
psfeedback_log("%u", key, ps_feedback->shard_number);
|
||||
}
|
||||
else if (strcmp(key, "corruption_detected") == 0)
|
||||
{
|
||||
Assert(value_len == 1);
|
||||
ps_feedback->corruption_detected = pq_getmsgbyte(reply_message) != 0;
|
||||
psfeedback_log("%s", key, ps_feedback->corruption_detected ? "true" : "false");
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
|
||||
@@ -374,6 +374,8 @@ typedef struct PageserverFeedback
|
||||
XLogRecPtr remote_consistent_lsn;
|
||||
TimestampTz replytime;
|
||||
uint32 shard_number;
|
||||
/* true if the pageserver has detected data corruption in the timeline */
|
||||
bool corruption_detected;
|
||||
} PageserverFeedback;
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
|
||||
@@ -49,6 +49,7 @@
|
||||
|
||||
#include "libpqwalproposer.h"
|
||||
#include "neon.h"
|
||||
#include "neon_perf_counters.h"
|
||||
#include "neon_walreader.h"
|
||||
#include "walproposer.h"
|
||||
|
||||
@@ -741,6 +742,11 @@ record_pageserver_feedback(PageserverFeedback *ps_feedback, shardno_t num_shards
|
||||
Assert(ps_feedback->shard_number < MAX_SHARDS);
|
||||
Assert(ps_feedback->shard_number < num_shards);
|
||||
|
||||
// Begin Hadron: Record any corruption signal from the pageserver first.
|
||||
if (ps_feedback->corruption_detected) {
|
||||
pg_atomic_write_u32(&databricks_metrics_shared->ps_corruption_detected, 1);
|
||||
}
|
||||
|
||||
SpinLockAcquire(&walprop_shared->mutex);
|
||||
|
||||
// Hadron: Update the num_shards from the source-of-truth (shard map) lazily when we receive
|
||||
|
||||
@@ -107,6 +107,7 @@ uuid.workspace = true
|
||||
x509-cert.workspace = true
|
||||
redis.workspace = true
|
||||
zerocopy.workspace = true
|
||||
zeroize.workspace = true
|
||||
# uncomment this to use the real subzero-core crate
|
||||
# subzero-core = { git = "https://github.com/neondatabase/subzero", rev = "396264617e78e8be428682f87469bb25429af88a", features = ["postgresql"], optional = true }
|
||||
# this is a stub for the subzero-core crate
|
||||
|
||||
@@ -6,7 +6,7 @@ use crate::auth::{self, AuthFlow};
|
||||
use crate::config::AuthenticationConfig;
|
||||
use crate::context::RequestContext;
|
||||
use crate::control_plane::AuthSecret;
|
||||
use crate::intern::EndpointIdInt;
|
||||
use crate::intern::{EndpointIdInt, RoleNameInt};
|
||||
use crate::sasl;
|
||||
use crate::stream::{self, Stream};
|
||||
|
||||
@@ -25,13 +25,15 @@ pub(crate) async fn authenticate_cleartext(
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
|
||||
let ep = EndpointIdInt::from(&info.endpoint);
|
||||
let role = RoleNameInt::from(&info.user);
|
||||
|
||||
let auth_flow = AuthFlow::new(
|
||||
client,
|
||||
auth::CleartextPassword {
|
||||
secret,
|
||||
endpoint: ep,
|
||||
pool: config.thread_pool.clone(),
|
||||
role,
|
||||
pool: config.scram_thread_pool.clone(),
|
||||
},
|
||||
);
|
||||
let auth_outcome = {
|
||||
|
||||
@@ -25,7 +25,7 @@ use crate::control_plane::messages::EndpointRateLimitConfig;
|
||||
use crate::control_plane::{
|
||||
self, AccessBlockerFlags, AuthSecret, ControlPlaneApi, EndpointAccessControl, RoleAccessControl,
|
||||
};
|
||||
use crate::intern::EndpointIdInt;
|
||||
use crate::intern::{EndpointIdInt, RoleNameInt};
|
||||
use crate::pqproto::BeMessage;
|
||||
use crate::proxy::NeonOptions;
|
||||
use crate::proxy::wake_compute::WakeComputeBackend;
|
||||
@@ -273,9 +273,11 @@ async fn authenticate_with_secret(
|
||||
) -> auth::Result<ComputeCredentials> {
|
||||
if let Some(password) = unauthenticated_password {
|
||||
let ep = EndpointIdInt::from(&info.endpoint);
|
||||
let role = RoleNameInt::from(&info.user);
|
||||
|
||||
let auth_outcome =
|
||||
validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?;
|
||||
validate_password_and_exchange(&config.scram_thread_pool, ep, role, &password, secret)
|
||||
.await?;
|
||||
let keys = match auth_outcome {
|
||||
crate::sasl::Outcome::Success(key) => key,
|
||||
crate::sasl::Outcome::Failure(reason) => {
|
||||
@@ -499,7 +501,7 @@ mod tests {
|
||||
|
||||
static CONFIG: Lazy<AuthenticationConfig> = Lazy::new(|| AuthenticationConfig {
|
||||
jwks_cache: JwkCache::default(),
|
||||
thread_pool: ThreadPool::new(1),
|
||||
scram_thread_pool: ThreadPool::new(1),
|
||||
scram_protocol_timeout: std::time::Duration::from_secs(5),
|
||||
ip_allowlist_check_enabled: true,
|
||||
is_vpc_acccess_proxy: false,
|
||||
|
||||
@@ -10,7 +10,7 @@ use super::backend::ComputeCredentialKeys;
|
||||
use super::{AuthError, PasswordHackPayload};
|
||||
use crate::context::RequestContext;
|
||||
use crate::control_plane::AuthSecret;
|
||||
use crate::intern::EndpointIdInt;
|
||||
use crate::intern::{EndpointIdInt, RoleNameInt};
|
||||
use crate::pqproto::{BeAuthenticationSaslMessage, BeMessage};
|
||||
use crate::sasl;
|
||||
use crate::scram::threadpool::ThreadPool;
|
||||
@@ -46,6 +46,7 @@ pub(crate) struct PasswordHack;
|
||||
pub(crate) struct CleartextPassword {
|
||||
pub(crate) pool: Arc<ThreadPool>,
|
||||
pub(crate) endpoint: EndpointIdInt,
|
||||
pub(crate) role: RoleNameInt,
|
||||
pub(crate) secret: AuthSecret,
|
||||
}
|
||||
|
||||
@@ -111,6 +112,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
|
||||
let outcome = validate_password_and_exchange(
|
||||
&self.state.pool,
|
||||
self.state.endpoint,
|
||||
self.state.role,
|
||||
password,
|
||||
self.state.secret,
|
||||
)
|
||||
@@ -165,13 +167,15 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
pub(crate) async fn validate_password_and_exchange(
|
||||
pool: &ThreadPool,
|
||||
endpoint: EndpointIdInt,
|
||||
role: RoleNameInt,
|
||||
password: &[u8],
|
||||
secret: AuthSecret,
|
||||
) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
|
||||
match secret {
|
||||
// perform scram authentication as both client and server to validate the keys
|
||||
AuthSecret::Scram(scram_secret) => {
|
||||
let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?;
|
||||
let outcome =
|
||||
crate::scram::exchange(pool, endpoint, role, &scram_secret, password).await?;
|
||||
|
||||
let client_key = match outcome {
|
||||
sasl::Outcome::Success(client_key) => client_key,
|
||||
|
||||
@@ -29,7 +29,7 @@ use crate::config::{
|
||||
};
|
||||
use crate::control_plane::locks::ApiLocks;
|
||||
use crate::http::health_server::AppMetrics;
|
||||
use crate::metrics::{Metrics, ServiceInfo, ThreadPoolMetrics};
|
||||
use crate::metrics::{Metrics, ServiceInfo};
|
||||
use crate::rate_limiter::{EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo};
|
||||
use crate::scram::threadpool::ThreadPool;
|
||||
use crate::serverless::cancel_set::CancelSet;
|
||||
@@ -114,8 +114,6 @@ pub async fn run() -> anyhow::Result<()> {
|
||||
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
|
||||
Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
|
||||
|
||||
// TODO: refactor these to use labels
|
||||
debug!("Version: {GIT_VERSION}");
|
||||
debug!("Build_tag: {BUILD_TAG}");
|
||||
@@ -284,7 +282,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
|
||||
http_config,
|
||||
authentication_config: AuthenticationConfig {
|
||||
jwks_cache: JwkCache::default(),
|
||||
thread_pool: ThreadPool::new(0),
|
||||
scram_thread_pool: ThreadPool::new(0),
|
||||
scram_protocol_timeout: Duration::from_secs(10),
|
||||
ip_allowlist_check_enabled: true,
|
||||
is_vpc_acccess_proxy: false,
|
||||
|
||||
@@ -26,7 +26,7 @@ use utils::project_git_version;
|
||||
use utils::sentry_init::init_sentry;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::{Metrics, ServiceInfo, ThreadPoolMetrics};
|
||||
use crate::metrics::{Metrics, ServiceInfo};
|
||||
use crate::pglb::TlsRequired;
|
||||
use crate::pqproto::FeStartupPacket;
|
||||
use crate::protocol2::ConnectionInfo;
|
||||
@@ -80,8 +80,6 @@ pub async fn run() -> anyhow::Result<()> {
|
||||
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
|
||||
Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
|
||||
|
||||
let args = cli().get_matches();
|
||||
let destination: String = args
|
||||
.get_one::<String>("dest")
|
||||
|
||||
@@ -617,7 +617,12 @@ pub async fn run() -> anyhow::Result<()> {
|
||||
/// ProxyConfig is created at proxy startup, and lives forever.
|
||||
fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
let thread_pool = ThreadPool::new(args.scram_thread_pool_size);
|
||||
Metrics::install(thread_pool.metrics.clone());
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.scram_pool
|
||||
.0
|
||||
.set(thread_pool.metrics.clone())
|
||||
.ok();
|
||||
|
||||
let tls_config = match (&args.tls_key, &args.tls_cert) {
|
||||
(Some(key_path), Some(cert_path)) => Some(config::configure_tls(
|
||||
@@ -690,12 +695,15 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
};
|
||||
let authentication_config = AuthenticationConfig {
|
||||
jwks_cache: JwkCache::default(),
|
||||
thread_pool,
|
||||
scram_thread_pool: thread_pool,
|
||||
scram_protocol_timeout: args.scram_protocol_timeout,
|
||||
ip_allowlist_check_enabled: !args.is_private_access_proxy,
|
||||
is_vpc_acccess_proxy: args.is_private_access_proxy,
|
||||
is_auth_broker: args.is_auth_broker,
|
||||
#[cfg(not(feature = "rest_broker"))]
|
||||
accept_jwts: args.is_auth_broker,
|
||||
#[cfg(feature = "rest_broker")]
|
||||
accept_jwts: args.is_auth_broker || args.is_rest_broker,
|
||||
console_redirect_confirmation_timeout: args.webauth_confirmation_timeout,
|
||||
};
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ use futures::{FutureExt, TryFutureExt};
|
||||
use itertools::Itertools;
|
||||
use postgres_client::config::{AuthKeys, ChannelBinding, SslMode};
|
||||
use postgres_client::connect_raw::StartupStream;
|
||||
use postgres_client::error::SqlState;
|
||||
use postgres_client::maybe_tls_stream::MaybeTlsStream;
|
||||
use postgres_client::tls::MakeTlsConnect;
|
||||
use thiserror::Error;
|
||||
@@ -22,7 +23,7 @@ use crate::context::RequestContext;
|
||||
use crate::control_plane::client::ApiLockError;
|
||||
use crate::control_plane::errors::WakeComputeError;
|
||||
use crate::control_plane::messages::MetricsAuxInfo;
|
||||
use crate::error::{ReportableError, UserFacingError};
|
||||
use crate::error::{ErrorKind, ReportableError, UserFacingError};
|
||||
use crate::metrics::{Metrics, NumDbConnectionsGuard};
|
||||
use crate::pqproto::StartupMessageParams;
|
||||
use crate::proxy::connect_compute::TlsNegotiation;
|
||||
@@ -65,12 +66,13 @@ impl UserFacingError for PostgresError {
|
||||
}
|
||||
|
||||
impl ReportableError for PostgresError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
fn get_error_kind(&self) -> ErrorKind {
|
||||
match self {
|
||||
PostgresError::Postgres(e) if e.as_db_error().is_some() => {
|
||||
crate::error::ErrorKind::Postgres
|
||||
}
|
||||
PostgresError::Postgres(_) => crate::error::ErrorKind::Compute,
|
||||
PostgresError::Postgres(err) => match err.as_db_error() {
|
||||
Some(err) if err.code() == &SqlState::INVALID_CATALOG_NAME => ErrorKind::User,
|
||||
Some(_) => ErrorKind::Postgres,
|
||||
None => ErrorKind::Compute,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -110,9 +112,9 @@ impl UserFacingError for ConnectionError {
|
||||
}
|
||||
|
||||
impl ReportableError for ConnectionError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
fn get_error_kind(&self) -> ErrorKind {
|
||||
match self {
|
||||
ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute,
|
||||
ConnectionError::TlsError(_) => ErrorKind::Compute,
|
||||
ConnectionError::WakeComputeError(e) => e.get_error_kind(),
|
||||
ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(),
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -19,7 +19,7 @@ use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings};
|
||||
use crate::ext::TaskExt;
|
||||
use crate::intern::RoleNameInt;
|
||||
use crate::rate_limiter::{RateLimitAlgorithm, RateLimiterConfig};
|
||||
use crate::scram::threadpool::ThreadPool;
|
||||
use crate::scram;
|
||||
use crate::serverless::GlobalConnPoolOptions;
|
||||
use crate::serverless::cancel_set::CancelSet;
|
||||
#[cfg(feature = "rest_broker")]
|
||||
@@ -75,7 +75,7 @@ pub struct HttpConfig {
|
||||
}
|
||||
|
||||
pub struct AuthenticationConfig {
|
||||
pub thread_pool: Arc<ThreadPool>,
|
||||
pub scram_thread_pool: Arc<scram::threadpool::ThreadPool>,
|
||||
pub scram_protocol_timeout: tokio::time::Duration,
|
||||
pub ip_allowlist_check_enabled: bool,
|
||||
pub is_vpc_acccess_proxy: bool,
|
||||
|
||||
@@ -5,6 +5,7 @@ use measured::label::{
|
||||
FixedCardinalitySet, LabelGroupSet, LabelGroupVisitor, LabelName, LabelSet, LabelValue,
|
||||
StaticLabelSet,
|
||||
};
|
||||
use measured::metric::group::Encoding;
|
||||
use measured::metric::histogram::Thresholds;
|
||||
use measured::metric::name::MetricName;
|
||||
use measured::{
|
||||
@@ -18,10 +19,10 @@ use crate::control_plane::messages::ColdStartInfo;
|
||||
use crate::error::ErrorKind;
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
|
||||
#[metric(new())]
|
||||
pub struct Metrics {
|
||||
#[metric(namespace = "proxy")]
|
||||
#[metric(init = ProxyMetrics::new(thread_pool))]
|
||||
#[metric(init = ProxyMetrics::new())]
|
||||
pub proxy: ProxyMetrics,
|
||||
|
||||
#[metric(namespace = "wake_compute_lock")]
|
||||
@@ -34,34 +35,27 @@ pub struct Metrics {
|
||||
pub cache: CacheMetrics,
|
||||
}
|
||||
|
||||
static SELF: OnceLock<Metrics> = OnceLock::new();
|
||||
impl Metrics {
|
||||
pub fn install(thread_pool: Arc<ThreadPoolMetrics>) {
|
||||
let mut metrics = Metrics::new(thread_pool);
|
||||
|
||||
metrics.proxy.errors_total.init_all_dense();
|
||||
metrics.proxy.redis_errors_total.init_all_dense();
|
||||
metrics.proxy.redis_events_count.init_all_dense();
|
||||
metrics.proxy.retries_metric.init_all_dense();
|
||||
metrics.proxy.connection_failures_total.init_all_dense();
|
||||
|
||||
SELF.set(metrics)
|
||||
.ok()
|
||||
.expect("proxy metrics must not be installed more than once");
|
||||
}
|
||||
|
||||
#[track_caller]
|
||||
pub fn get() -> &'static Self {
|
||||
#[cfg(test)]
|
||||
return SELF.get_or_init(|| Metrics::new(Arc::new(ThreadPoolMetrics::new(0))));
|
||||
static SELF: OnceLock<Metrics> = OnceLock::new();
|
||||
|
||||
#[cfg(not(test))]
|
||||
SELF.get()
|
||||
.expect("proxy metrics must be installed by the main() function")
|
||||
SELF.get_or_init(|| {
|
||||
let mut metrics = Metrics::new();
|
||||
|
||||
metrics.proxy.errors_total.init_all_dense();
|
||||
metrics.proxy.redis_errors_total.init_all_dense();
|
||||
metrics.proxy.redis_events_count.init_all_dense();
|
||||
metrics.proxy.retries_metric.init_all_dense();
|
||||
metrics.proxy.connection_failures_total.init_all_dense();
|
||||
|
||||
metrics
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
|
||||
#[metric(new())]
|
||||
pub struct ProxyMetrics {
|
||||
#[metric(flatten)]
|
||||
pub db_connections: CounterPairVec<NumDbConnectionsGauge>,
|
||||
@@ -134,6 +128,9 @@ pub struct ProxyMetrics {
|
||||
/// Number of TLS handshake failures
|
||||
pub tls_handshake_failures: Counter,
|
||||
|
||||
/// Number of SHA 256 rounds executed.
|
||||
pub sha_rounds: Counter,
|
||||
|
||||
/// HLL approximate cardinality of endpoints that are connecting
|
||||
pub connecting_endpoints: HyperLogLogVec<StaticLabelSet<Protocol>, 32>,
|
||||
|
||||
@@ -151,8 +148,25 @@ pub struct ProxyMetrics {
|
||||
pub connect_compute_lock: ApiLockMetrics,
|
||||
|
||||
#[metric(namespace = "scram_pool")]
|
||||
#[metric(init = thread_pool)]
|
||||
pub scram_pool: Arc<ThreadPoolMetrics>,
|
||||
pub scram_pool: OnceLockWrapper<Arc<ThreadPoolMetrics>>,
|
||||
}
|
||||
|
||||
/// A Wrapper over [`OnceLock`] to implement [`MetricGroup`].
|
||||
pub struct OnceLockWrapper<T>(pub OnceLock<T>);
|
||||
|
||||
impl<T> Default for OnceLockWrapper<T> {
|
||||
fn default() -> Self {
|
||||
Self(OnceLock::new())
|
||||
}
|
||||
}
|
||||
|
||||
impl<Enc: Encoding, T: MetricGroup<Enc>> MetricGroup<Enc> for OnceLockWrapper<T> {
|
||||
fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> {
|
||||
if let Some(inner) = self.0.get() {
|
||||
inner.collect_group_into(enc)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
@@ -719,6 +733,7 @@ pub enum CacheKind {
|
||||
ProjectInfoEndpoints,
|
||||
ProjectInfoRoles,
|
||||
Schema,
|
||||
Pbkdf2,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
|
||||
|
||||
84
proxy/src/scram/cache.rs
Normal file
@@ -0,0 +1,84 @@
|
||||
use tokio::time::Instant;
|
||||
use zeroize::Zeroize as _;
|
||||
|
||||
use super::pbkdf2;
|
||||
use crate::cache::Cached;
|
||||
use crate::cache::common::{Cache, count_cache_insert, count_cache_outcome, eviction_listener};
|
||||
use crate::intern::{EndpointIdInt, RoleNameInt};
|
||||
use crate::metrics::{CacheKind, Metrics};
|
||||
|
||||
pub(crate) struct Pbkdf2Cache(moka::sync::Cache<(EndpointIdInt, RoleNameInt), Pbkdf2CacheEntry>);
|
||||
pub(crate) type CachedPbkdf2<'a> = Cached<&'a Pbkdf2Cache>;
|
||||
|
||||
impl Cache for Pbkdf2Cache {
|
||||
type Key = (EndpointIdInt, RoleNameInt);
|
||||
type Value = Pbkdf2CacheEntry;
|
||||
|
||||
fn invalidate(&self, info: &(EndpointIdInt, RoleNameInt)) {
|
||||
self.0.invalidate(info);
|
||||
}
|
||||
}
|
||||
|
||||
/// To speed up password hashing for more active customers, we store the tail results of the
/// PBKDF2 algorithm. If the output of PBKDF2 is U1 ^ U2 ^ ⋯ ^ Uc, then we store
/// suffix = U17 ^ U18 ^ ⋯ ^ Uc. We only need to calculate U1 ^ U2 ^ ⋯ ^ U15 ^ U16
/// to determine the final result.
///
/// The suffix alone isn't enough to crack the password. The stored_key is still required.
/// While both are cached in memory, keeping them in different locations makes it much
/// harder to exploit, even if such a memory exploit exists in proxy.
#[derive(Clone)]
|
||||
pub struct Pbkdf2CacheEntry {
|
||||
/// corresponds to [`super::ServerSecret::cached_at`]
|
||||
pub(super) cached_from: Instant,
|
||||
pub(super) suffix: pbkdf2::Block,
|
||||
}
|
||||
|
||||
impl Drop for Pbkdf2CacheEntry {
|
||||
fn drop(&mut self) {
|
||||
self.suffix.zeroize();
|
||||
}
|
||||
}
|
||||
|
||||
impl Pbkdf2Cache {
|
||||
pub fn new() -> Self {
|
||||
const SIZE: u64 = 100;
|
||||
const TTL: std::time::Duration = std::time::Duration::from_secs(60);
|
||||
|
||||
let builder = moka::sync::Cache::builder()
|
||||
.name("pbkdf2")
|
||||
.max_capacity(SIZE)
|
||||
// We use time_to_live so we don't refresh the lifetime for an invalid password attempt.
|
||||
.time_to_live(TTL);
|
||||
|
||||
Metrics::get()
|
||||
.cache
|
||||
.capacity
|
||||
.set(CacheKind::Pbkdf2, SIZE as i64);
|
||||
|
||||
let builder =
|
||||
builder.eviction_listener(|_k, _v, cause| eviction_listener(CacheKind::Pbkdf2, cause));
|
||||
|
||||
Self(builder.build())
|
||||
}
|
||||
|
||||
pub fn insert(&self, endpoint: EndpointIdInt, role: RoleNameInt, value: Pbkdf2CacheEntry) {
|
||||
count_cache_insert(CacheKind::Pbkdf2);
|
||||
self.0.insert((endpoint, role), value);
|
||||
}
|
||||
|
||||
fn get(&self, endpoint: EndpointIdInt, role: RoleNameInt) -> Option<Pbkdf2CacheEntry> {
|
||||
count_cache_outcome(CacheKind::Pbkdf2, self.0.get(&(endpoint, role)))
|
||||
}
|
||||
|
||||
pub fn get_entry(
|
||||
&self,
|
||||
endpoint: EndpointIdInt,
|
||||
role: RoleNameInt,
|
||||
) -> Option<CachedPbkdf2<'_>> {
|
||||
self.get(endpoint, role).map(|value| Cached {
|
||||
token: Some((self, (endpoint, role))),
|
||||
value,
|
||||
})
|
||||
}
|
||||
}
|
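For orientation, the Pbkdf2CacheEntry comment above relies on PBKDF2's output being an XOR of per-round blocks, so the cached suffix and the cheap 16-round prefix recombine with plain XOR; with the common PostgreSQL iteration count of 4096, a cache hit therefore redoes only 16 of the 4096 rounds. A minimal Rust sketch of that arithmetic (hypothetical helper names, not code from this PR), assuming 32-byte HMAC-SHA-256 blocks:

// Full PBKDF2 output with c iterations is F = U1 ^ U2 ^ ... ^ Uc.
// The cache stores S = U17 ^ ... ^ Uc; a hit recomputes only P = U1 ^ ... ^ U16
// and recovers F = P ^ S. After a full (miss) computation, S = F ^ P.
fn xor32(mut a: [u8; 32], b: &[u8; 32]) -> [u8; 32] {
    for (x, y) in a.iter_mut().zip(b) {
        *x ^= y;
    }
    a
}

fn recover_full(prefix: [u8; 32], cached_suffix: &[u8; 32]) -> [u8; 32] {
    xor32(prefix, cached_suffix)
}

fn suffix_to_cache(full: [u8; 32], prefix: &[u8; 32]) -> [u8; 32] {
    xor32(full, prefix)
}

Because XOR is its own inverse, the same helper serves both directions, and the cached suffix never has to store the salted password itself.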
||||
@@ -4,10 +4,8 @@ use std::convert::Infallible;
|
||||
|
||||
use base64::Engine as _;
|
||||
use base64::prelude::BASE64_STANDARD;
|
||||
use hmac::{Hmac, Mac};
|
||||
use sha2::Sha256;
|
||||
use tracing::{debug, trace};
|
||||
|
||||
use super::ScramKey;
|
||||
use super::messages::{
|
||||
ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
|
||||
};
|
||||
@@ -15,8 +13,10 @@ use super::pbkdf2::Pbkdf2;
|
||||
use super::secret::ServerSecret;
|
||||
use super::signature::SignatureBuilder;
|
||||
use super::threadpool::ThreadPool;
|
||||
use crate::intern::EndpointIdInt;
|
||||
use super::{ScramKey, pbkdf2};
|
||||
use crate::intern::{EndpointIdInt, RoleNameInt};
|
||||
use crate::sasl::{self, ChannelBinding, Error as SaslError};
|
||||
use crate::scram::cache::Pbkdf2CacheEntry;
|
||||
|
||||
/// The only channel binding mode we currently support.
|
||||
#[derive(Debug)]
|
||||
@@ -77,46 +77,113 @@ impl<'a> Exchange<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L236-L248>
|
||||
async fn derive_client_key(
|
||||
pool: &ThreadPool,
|
||||
endpoint: EndpointIdInt,
|
||||
password: &[u8],
|
||||
salt: &[u8],
|
||||
iterations: u32,
|
||||
) -> ScramKey {
|
||||
let salted_password = pool
|
||||
.spawn_job(endpoint, Pbkdf2::start(password, salt, iterations))
|
||||
.await;
|
||||
|
||||
let make_key = |name| {
|
||||
let key = Hmac::<Sha256>::new_from_slice(&salted_password)
|
||||
.expect("HMAC is able to accept all key sizes")
|
||||
.chain_update(name)
|
||||
.finalize();
|
||||
|
||||
<[u8; 32]>::from(key.into_bytes())
|
||||
};
|
||||
|
||||
make_key(b"Client Key").into()
|
||||
) -> pbkdf2::Block {
|
||||
pool.spawn_job(endpoint, Pbkdf2::start(password, salt, iterations))
|
||||
.await
|
||||
}
|
||||
|
||||
/// For cleartext flow, we need to derive the client key to
|
||||
/// 1. authenticate the client.
|
||||
/// 2. authenticate with compute.
|
||||
pub(crate) async fn exchange(
|
||||
pool: &ThreadPool,
|
||||
endpoint: EndpointIdInt,
|
||||
role: RoleNameInt,
|
||||
secret: &ServerSecret,
|
||||
password: &[u8],
|
||||
) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
|
||||
if secret.iterations > CACHED_ROUNDS {
|
||||
exchange_with_cache(pool, endpoint, role, secret, password).await
|
||||
} else {
|
||||
let salt = BASE64_STANDARD.decode(&*secret.salt_base64)?;
|
||||
let hash = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await;
|
||||
Ok(validate_pbkdf2(secret, &hash))
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the client key using a cache. We cache the suffix of the pbkdf2 result only,
|
||||
/// which is not enough by itself to perform an offline brute force.
|
||||
async fn exchange_with_cache(
|
||||
pool: &ThreadPool,
|
||||
endpoint: EndpointIdInt,
|
||||
role: RoleNameInt,
|
||||
secret: &ServerSecret,
|
||||
password: &[u8],
|
||||
) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
|
||||
let salt = BASE64_STANDARD.decode(&*secret.salt_base64)?;
|
||||
let client_key = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await;
|
||||
|
||||
debug_assert!(
|
||||
secret.iterations > CACHED_ROUNDS,
|
||||
"we should not cache password data if there isn't enough rounds needed"
|
||||
);
|
||||
|
||||
// compute the prefix of the pbkdf2 output.
|
||||
let prefix = derive_client_key(pool, endpoint, password, &salt, CACHED_ROUNDS).await;
|
||||
|
||||
if let Some(entry) = pool.cache.get_entry(endpoint, role) {
|
||||
// hot path: let's check the threadpool cache
|
||||
if secret.cached_at == entry.cached_from {
|
||||
// cache is valid. compute the full hash by adding the prefix to the suffix.
|
||||
let mut hash = prefix;
|
||||
pbkdf2::xor_assign(&mut hash, &entry.suffix);
|
||||
let outcome = validate_pbkdf2(secret, &hash);
|
||||
|
||||
if matches!(outcome, sasl::Outcome::Success(_)) {
|
||||
trace!("password validated from cache");
|
||||
}
|
||||
|
||||
return Ok(outcome);
|
||||
}
|
||||
|
||||
// cached key is no longer valid.
|
||||
debug!("invalidating cached password");
|
||||
entry.invalidate();
|
||||
}
|
||||
|
||||
// slow path: full password hash.
|
||||
let hash = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await;
|
||||
let outcome = validate_pbkdf2(secret, &hash);
|
||||
|
||||
let client_key = match outcome {
|
||||
sasl::Outcome::Success(client_key) => client_key,
|
||||
sasl::Outcome::Failure(_) => return Ok(outcome),
|
||||
};
|
||||
|
||||
trace!("storing cached password");
|
||||
|
||||
// time to cache, compute the suffix by subtracting the prefix from the hash.
|
||||
let mut suffix = hash;
|
||||
pbkdf2::xor_assign(&mut suffix, &prefix);
|
||||
|
||||
pool.cache.insert(
|
||||
endpoint,
|
||||
role,
|
||||
Pbkdf2CacheEntry {
|
||||
cached_from: secret.cached_at,
|
||||
suffix,
|
||||
},
|
||||
);
|
||||
|
||||
Ok(sasl::Outcome::Success(client_key))
|
||||
}
|
||||
|
||||
fn validate_pbkdf2(secret: &ServerSecret, hash: &pbkdf2::Block) -> sasl::Outcome<ScramKey> {
|
||||
let client_key = super::ScramKey::client_key(&(*hash).into());
|
||||
if secret.is_password_invalid(&client_key).into() {
|
||||
Ok(sasl::Outcome::Failure("password doesn't match"))
|
||||
sasl::Outcome::Failure("password doesn't match")
|
||||
} else {
|
||||
Ok(sasl::Outcome::Success(client_key))
|
||||
sasl::Outcome::Success(client_key)
|
||||
}
|
||||
}
|
||||
|
||||
const CACHED_ROUNDS: u32 = 16;
|
||||
|
||||
impl SaslInitial {
|
||||
fn transition(
|
||||
&self,
|
||||
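For orientation, the quantities that validate_pbkdf2 and the cached exchange above manipulate are the standard SCRAM-SHA-256 key material (RFC 5802); the proxy keeps only the stored key, so a candidate password is checked by re-deriving the client key and hashing it:

\begin{aligned}
\mathrm{SaltedPassword} &= \mathrm{PBKDF2}_{\mathrm{HMAC\text{-}SHA\text{-}256}}(\mathit{password},\ \mathit{salt},\ i)\\
\mathrm{ClientKey} &= \mathrm{HMAC}(\mathrm{SaltedPassword},\ \text{"Client Key"})\\
\mathrm{StoredKey} &= \mathrm{SHA\text{-}256}(\mathrm{ClientKey})
\end{aligned}

The server-side check then reduces to comparing the recomputed StoredKey against the stored one in constant time, which is why validate_pbkdf2 only ever needs the single PBKDF2 output block.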
|
||||
@@ -1,6 +1,12 @@
|
||||
//! Tools for client/server/stored key management.
|
||||
|
||||
use hmac::Mac as _;
|
||||
use sha2::Digest as _;
|
||||
use subtle::ConstantTimeEq;
|
||||
use zeroize::Zeroize as _;
|
||||
|
||||
use crate::metrics::Metrics;
|
||||
use crate::scram::pbkdf2::Prf;
|
||||
|
||||
/// Faithfully taken from PostgreSQL.
|
||||
pub(crate) const SCRAM_KEY_LEN: usize = 32;
|
||||
@@ -14,6 +20,12 @@ pub(crate) struct ScramKey {
|
||||
bytes: [u8; SCRAM_KEY_LEN],
|
||||
}
|
||||
|
||||
impl Drop for ScramKey {
|
||||
fn drop(&mut self) {
|
||||
self.bytes.zeroize();
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for ScramKey {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.ct_eq(other).into()
|
||||
@@ -28,12 +40,26 @@ impl ConstantTimeEq for ScramKey {
|
||||
|
||||
impl ScramKey {
|
||||
pub(crate) fn sha256(&self) -> Self {
|
||||
super::sha256([self.as_ref()]).into()
|
||||
Metrics::get().proxy.sha_rounds.inc_by(1);
|
||||
Self {
|
||||
bytes: sha2::Sha256::digest(self.as_bytes()).into(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn as_bytes(&self) -> [u8; SCRAM_KEY_LEN] {
|
||||
self.bytes
|
||||
}
|
||||
|
||||
pub(crate) fn client_key(b: &[u8; 32]) -> Self {
|
||||
// Prf::new_from_slice will run 2 sha256 rounds.
|
||||
// Update + Finalize run 2 sha256 rounds.
|
||||
Metrics::get().proxy.sha_rounds.inc_by(4);
|
||||
|
||||
let mut prf = Prf::new_from_slice(b).expect("HMAC is able to accept all key sizes");
|
||||
prf.update(b"Client Key");
|
||||
let client_key: [u8; 32] = prf.finalize().into_bytes().into();
|
||||
client_key.into()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<[u8; SCRAM_KEY_LEN]> for ScramKey {
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
//! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/backend/libpq/auth-scram.c>
|
||||
//! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/interfaces/libpq/fe-auth-scram.c>
|
||||
|
||||
mod cache;
|
||||
mod countmin;
|
||||
mod exchange;
|
||||
mod key;
|
||||
@@ -18,10 +19,8 @@ pub mod threadpool;
|
||||
use base64::Engine as _;
|
||||
use base64::prelude::BASE64_STANDARD;
|
||||
pub(crate) use exchange::{Exchange, exchange};
|
||||
use hmac::{Hmac, Mac};
|
||||
pub(crate) use key::ScramKey;
|
||||
pub(crate) use secret::ServerSecret;
|
||||
use sha2::{Digest, Sha256};
|
||||
|
||||
const SCRAM_SHA_256: &str = "SCRAM-SHA-256";
|
||||
const SCRAM_SHA_256_PLUS: &str = "SCRAM-SHA-256-PLUS";
|
||||
@@ -42,29 +41,13 @@ fn base64_decode_array<const N: usize>(input: impl AsRef<[u8]>) -> Option<[u8; N
|
||||
Some(bytes)
|
||||
}
|
||||
|
||||
/// This function essentially is `Hmac(sha256, key, input)`.
|
||||
/// Further reading: <https://datatracker.ietf.org/doc/html/rfc2104>.
|
||||
fn hmac_sha256<'a>(key: &[u8], parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
|
||||
let mut mac = Hmac::<Sha256>::new_from_slice(key).expect("bad key size");
|
||||
parts.into_iter().for_each(|s| mac.update(s));
|
||||
|
||||
mac.finalize().into_bytes().into()
|
||||
}
|
||||
|
||||
fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
|
||||
let mut hasher = Sha256::new();
|
||||
parts.into_iter().for_each(|s| hasher.update(s));
|
||||
|
||||
hasher.finalize().into()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::threadpool::ThreadPool;
|
||||
use super::{Exchange, ServerSecret};
|
||||
use crate::intern::EndpointIdInt;
|
||||
use crate::intern::{EndpointIdInt, RoleNameInt};
|
||||
use crate::sasl::{Mechanism, Step};
|
||||
use crate::types::EndpointId;
|
||||
use crate::types::{EndpointId, RoleName};
|
||||
|
||||
#[test]
|
||||
fn snapshot() {
|
||||
@@ -114,23 +97,34 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
async fn run_round_trip_test(server_password: &str, client_password: &str) {
|
||||
let pool = ThreadPool::new(1);
|
||||
|
||||
async fn check(
|
||||
pool: &ThreadPool,
|
||||
scram_secret: &ServerSecret,
|
||||
password: &[u8],
|
||||
) -> Result<(), &'static str> {
|
||||
let ep = EndpointId::from("foo");
|
||||
let ep = EndpointIdInt::from(ep);
|
||||
let role = RoleName::from("user");
|
||||
let role = RoleNameInt::from(&role);
|
||||
|
||||
let scram_secret = ServerSecret::build(server_password).await.unwrap();
|
||||
let outcome = super::exchange(&pool, ep, &scram_secret, client_password.as_bytes())
|
||||
let outcome = super::exchange(pool, ep, role, scram_secret, password)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
match outcome {
|
||||
crate::sasl::Outcome::Success(_) => {}
|
||||
crate::sasl::Outcome::Failure(r) => panic!("{r}"),
|
||||
crate::sasl::Outcome::Success(_) => Ok(()),
|
||||
crate::sasl::Outcome::Failure(r) => Err(r),
|
||||
}
|
||||
}
|
||||
|
||||
async fn run_round_trip_test(server_password: &str, client_password: &str) {
|
||||
let pool = ThreadPool::new(1);
|
||||
let scram_secret = ServerSecret::build(server_password).await.unwrap();
|
||||
check(&pool, &scram_secret, client_password.as_bytes())
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn round_trip() {
|
||||
run_round_trip_test("pencil", "pencil").await;
|
||||
@@ -141,4 +135,27 @@ mod tests {
|
||||
async fn failure() {
|
||||
run_round_trip_test("pencil", "eraser").await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[tracing_test::traced_test]
|
||||
async fn password_cache() {
|
||||
let pool = ThreadPool::new(1);
|
||||
let scram_secret = ServerSecret::build("password").await.unwrap();
|
||||
|
||||
// wrong passwords are not added to cache
|
||||
check(&pool, &scram_secret, b"wrong").await.unwrap_err();
|
||||
assert!(!logs_contain("storing cached password"));
|
||||
|
||||
// correct passwords get cached
|
||||
check(&pool, &scram_secret, b"password").await.unwrap();
|
||||
assert!(logs_contain("storing cached password"));
|
||||
|
||||
// wrong passwords do not match the cache
|
||||
check(&pool, &scram_secret, b"wrong").await.unwrap_err();
|
||||
assert!(!logs_contain("password validated from cache"));
|
||||
|
||||
// correct passwords match the cache
|
||||
check(&pool, &scram_secret, b"password").await.unwrap();
|
||||
assert!(logs_contain("password validated from cache"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,25 +1,50 @@
|
||||
//! For postgres password authentication, we need to perform a PBKDF2 using
|
||||
//! PRF=HMAC-SHA2-256, producing only 1 block (32 bytes) of output key.
|
||||
|
||||
use hmac::Mac as _;
|
||||
use hmac::digest::consts::U32;
|
||||
use hmac::digest::generic_array::GenericArray;
|
||||
use hmac::{Hmac, Mac};
|
||||
use sha2::Sha256;
|
||||
use zeroize::Zeroize as _;
|
||||
|
||||
use crate::metrics::Metrics;
|
||||
|
||||
/// The pseudo-random function used during PBKDF2 and the SCRAM-SHA-256 handshake.
|
||||
pub type Prf = hmac::Hmac<sha2::Sha256>;
|
||||
pub(crate) type Block = GenericArray<u8, U32>;
|
||||
|
||||
pub(crate) struct Pbkdf2 {
|
||||
hmac: Hmac<Sha256>,
|
||||
prev: GenericArray<u8, U32>,
|
||||
hi: GenericArray<u8, U32>,
|
||||
hmac: Prf,
|
||||
/// U{r-1} for whatever iteration r we are currently on.
|
||||
prev: Block,
|
||||
/// the output of `fold(xor, U{1}..U{r})` for whatever iteration r we are currently on.
|
||||
hi: Block,
|
||||
/// number of iterations left
|
||||
iterations: u32,
|
||||
}
|
||||
|
||||
impl Drop for Pbkdf2 {
|
||||
fn drop(&mut self) {
|
||||
self.prev.zeroize();
|
||||
self.hi.zeroize();
|
||||
}
|
||||
}
|
||||
|
||||
// inspired from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
|
||||
impl Pbkdf2 {
|
||||
pub(crate) fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self {
|
||||
pub(crate) fn start(pw: &[u8], salt: &[u8], iterations: u32) -> Self {
|
||||
// key the HMAC and derive the first block in-place
|
||||
let mut hmac =
|
||||
Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
|
||||
let mut hmac = Prf::new_from_slice(pw).expect("HMAC is able to accept all key sizes");
|
||||
|
||||
// U1 = PRF(Password, Salt + INT_32_BE(i))
|
||||
// i = 1 since we only need 1 block of output.
|
||||
hmac.update(salt);
|
||||
hmac.update(&1u32.to_be_bytes());
|
||||
let init_block = hmac.finalize_reset().into_bytes();
|
||||
|
||||
// Prf::new_from_slice will run 2 sha256 rounds.
|
||||
// Our update + finalize run 2 sha256 rounds for each pbkdf2 round.
|
||||
Metrics::get().proxy.sha_rounds.inc_by(4);
|
||||
|
||||
Self {
|
||||
hmac,
|
||||
// one iteration spent above
|
||||
@@ -33,7 +58,11 @@ impl Pbkdf2 {
|
||||
(self.iterations).clamp(0, 4096)
|
||||
}
|
||||
|
||||
pub(crate) fn turn(&mut self) -> std::task::Poll<[u8; 32]> {
|
||||
/// For "fairness", we implement PBKDF2 with cooperative yielding, which is why we use this `turn`
|
||||
/// function that only executes a fixed number of iterations before continuing.
|
||||
///
|
||||
/// Task must be rescheuled if this returns [`std::task::Poll::Pending`].
|
||||
pub(crate) fn turn(&mut self) -> std::task::Poll<Block> {
|
||||
let Self {
|
||||
hmac,
|
||||
prev,
|
||||
@@ -44,25 +73,37 @@ impl Pbkdf2 {
|
||||
// only do up to 4096 iterations per turn for fairness
|
||||
let n = (*iterations).clamp(0, 4096);
|
||||
for _ in 0..n {
|
||||
hmac.update(prev);
|
||||
let block = hmac.finalize_reset().into_bytes();
|
||||
|
||||
for (hi_byte, &b) in hi.iter_mut().zip(block.iter()) {
|
||||
*hi_byte ^= b;
|
||||
}
|
||||
|
||||
*prev = block;
|
||||
let next = single_round(hmac, prev);
|
||||
xor_assign(hi, &next);
|
||||
*prev = next;
|
||||
}
|
||||
|
||||
// Our update + finalize run 2 sha256 rounds for each pbkdf2 round.
|
||||
Metrics::get().proxy.sha_rounds.inc_by(2 * n as u64);
|
||||
|
||||
*iterations -= n;
|
||||
if *iterations == 0 {
|
||||
std::task::Poll::Ready((*hi).into())
|
||||
std::task::Poll::Ready(*hi)
|
||||
} else {
|
||||
std::task::Poll::Pending
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn xor_assign(x: &mut Block, y: &Block) {
|
||||
for (x, &y) in std::iter::zip(x, y) {
|
||||
*x ^= y;
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn single_round(prf: &mut Prf, ui: &Block) -> Block {
|
||||
// Ui = PRF(Password, Ui-1)
|
||||
prf.update(ui);
|
||||
prf.finalize_reset().into_bytes()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pbkdf2::pbkdf2_hmac_array;
|
||||
@@ -76,11 +117,11 @@ mod tests {
|
||||
let pass = b"Ne0n_!5_50_C007";
|
||||
|
||||
let mut job = Pbkdf2::start(pass, salt, 60000);
|
||||
let hash = loop {
|
||||
let hash: [u8; 32] = loop {
|
||||
let std::task::Poll::Ready(hash) = job.turn() else {
|
||||
continue;
|
||||
};
|
||||
break hash;
|
||||
break hash.into();
|
||||
};
|
||||
|
||||
let expected = pbkdf2_hmac_array::<Sha256, 32>(pass, salt, 60000);
|
||||
|
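For orientation, the turn() doc comment above describes cooperative yielding: each call burns a bounded number of rounds and the caller reschedules on Pending. A hypothetical Rust sketch of such a driver (ChunkedJob and the 4096-round budget are illustrative stand-ins, not the proxy's types):

use std::task::Poll;

// Stand-in for a chunked PBKDF2 job: `turn()` runs at most 4096 rounds,
// then reports whether the full iteration count has been consumed.
struct ChunkedJob {
    rounds_left: u32,
}

impl ChunkedJob {
    fn turn(&mut self) -> Poll<()> {
        let n = self.rounds_left.min(4096);
        // ... perform `n` PBKDF2 rounds here ...
        self.rounds_left -= n;
        if self.rounds_left == 0 { Poll::Ready(()) } else { Poll::Pending }
    }
}

// Yielding between turns keeps one expensive password hash from starving
// the other jobs queued on the same worker thread.
async fn drive(mut job: ChunkedJob) {
    while job.turn().is_pending() {
        tokio::task::yield_now().await;
    }
}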
||||
@@ -3,6 +3,7 @@
|
||||
use base64::Engine as _;
|
||||
use base64::prelude::BASE64_STANDARD;
|
||||
use subtle::{Choice, ConstantTimeEq};
|
||||
use tokio::time::Instant;
|
||||
|
||||
use super::base64_decode_array;
|
||||
use super::key::ScramKey;
|
||||
@@ -11,6 +12,9 @@ use super::key::ScramKey;
|
||||
/// and is used throughout the authentication process.
|
||||
#[derive(Clone, Eq, PartialEq, Debug)]
|
||||
pub(crate) struct ServerSecret {
|
||||
/// When this secret was cached.
|
||||
pub(crate) cached_at: Instant,
|
||||
|
||||
/// Number of iterations for `PBKDF2` function.
|
||||
pub(crate) iterations: u32,
|
||||
/// Salt used to hash user's password.
|
||||
@@ -34,6 +38,7 @@ impl ServerSecret {
|
||||
params.split_once(':').zip(keys.split_once(':'))?;
|
||||
|
||||
let secret = ServerSecret {
|
||||
cached_at: Instant::now(),
|
||||
iterations: iterations.parse().ok()?,
|
||||
salt_base64: salt.into(),
|
||||
stored_key: base64_decode_array(stored_key)?.into(),
|
||||
@@ -54,6 +59,7 @@ impl ServerSecret {
|
||||
/// See `auth-scram.c : mock_scram_secret` for details.
|
||||
pub(crate) fn mock(nonce: [u8; 32]) -> Self {
|
||||
Self {
|
||||
cached_at: Instant::now(),
|
||||
// this doesn't reveal much information as we're going to use
|
||||
// iteration count 1 for our generated passwords going forward.
|
||||
// PG16 users can set iteration count=1 already today.
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
//! Tools for client/server signature management.
|
||||
|
||||
use hmac::Mac as _;
|
||||
|
||||
use super::key::{SCRAM_KEY_LEN, ScramKey};
|
||||
use crate::metrics::Metrics;
|
||||
use crate::scram::pbkdf2::Prf;
|
||||
|
||||
/// A collection of message parts needed to derive the client's signature.
|
||||
#[derive(Debug)]
|
||||
@@ -12,15 +16,18 @@ pub(crate) struct SignatureBuilder<'a> {
|
||||
|
||||
impl SignatureBuilder<'_> {
|
||||
pub(crate) fn build(&self, key: &ScramKey) -> Signature {
|
||||
let parts = [
|
||||
self.client_first_message_bare.as_bytes(),
|
||||
b",",
|
||||
self.server_first_message.as_bytes(),
|
||||
b",",
|
||||
self.client_final_message_without_proof.as_bytes(),
|
||||
];
|
||||
// don't know exactly. this is a rough approx
|
||||
Metrics::get().proxy.sha_rounds.inc_by(8);
|
||||
|
||||
super::hmac_sha256(key.as_ref(), parts).into()
|
||||
let mut mac = Prf::new_from_slice(key.as_ref()).expect("HMAC accepts all key sizes");
|
||||
mac.update(self.client_first_message_bare.as_bytes());
|
||||
mac.update(b",");
|
||||
mac.update(self.server_first_message.as_bytes());
|
||||
mac.update(b",");
|
||||
mac.update(self.client_final_message_without_proof.as_bytes());
|
||||
Signature {
|
||||
bytes: mac.finalize().into_bytes().into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,8 @@ use futures::FutureExt;
|
||||
use rand::rngs::SmallRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
|
||||
use super::cache::Pbkdf2Cache;
|
||||
use super::pbkdf2;
|
||||
use super::pbkdf2::Pbkdf2;
|
||||
use crate::intern::EndpointIdInt;
|
||||
use crate::metrics::{ThreadPoolMetrics, ThreadPoolWorkerId};
|
||||
@@ -23,6 +25,10 @@ use crate::scram::countmin::CountMinSketch;
|
||||
pub struct ThreadPool {
|
||||
runtime: Option<tokio::runtime::Runtime>,
|
||||
pub metrics: Arc<ThreadPoolMetrics>,
|
||||
|
||||
// we hash a lot of passwords.
|
||||
// we keep a cache of partial hashes for faster validation.
|
||||
pub(super) cache: Pbkdf2Cache,
|
||||
}
|
||||
|
||||
/// How often to reset the sketch values
|
||||
@@ -68,6 +74,7 @@ impl ThreadPool {
|
||||
Self {
|
||||
runtime: Some(runtime),
|
||||
metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)),
|
||||
cache: Pbkdf2Cache::new(),
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -130,7 +137,7 @@ struct JobSpec {
|
||||
}
|
||||
|
||||
impl Future for JobSpec {
|
||||
type Output = [u8; 32];
|
||||
type Output = pbkdf2::Block;
|
||||
|
||||
fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
|
||||
STATE.with_borrow_mut(|state| {
|
||||
@@ -166,10 +173,10 @@ impl Future for JobSpec {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct JobHandle(tokio::task::JoinHandle<[u8; 32]>);
|
||||
pub(crate) struct JobHandle(tokio::task::JoinHandle<pbkdf2::Block>);
|
||||
|
||||
impl Future for JobHandle {
|
||||
type Output = [u8; 32];
|
||||
type Output = pbkdf2::Block;
|
||||
|
||||
fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
|
||||
match self.0.poll_unpin(cx) {
|
||||
@@ -203,10 +210,10 @@ mod tests {
|
||||
.spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096))
|
||||
.await;
|
||||
|
||||
let expected = [
|
||||
let expected = &[
|
||||
10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242,
|
||||
178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140,
|
||||
];
|
||||
assert_eq!(actual, expected);
|
||||
assert_eq!(actual.as_slice(), expected);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ use std::time::Duration;
|
||||
use ed25519_dalek::SigningKey;
|
||||
use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer};
|
||||
use jose_jwk::jose_b64;
|
||||
use postgres_client::error::SqlState;
|
||||
use postgres_client::maybe_tls_stream::MaybeTlsStream;
|
||||
use rand_core::OsRng;
|
||||
use tracing::field::display;
|
||||
@@ -26,7 +27,7 @@ use crate::context::RequestContext;
|
||||
use crate::control_plane::client::ApiLockError;
|
||||
use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError};
|
||||
use crate::error::{ErrorKind, ReportableError, UserFacingError};
|
||||
use crate::intern::EndpointIdInt;
|
||||
use crate::intern::{EndpointIdInt, RoleNameInt};
|
||||
use crate::pqproto::StartupMessageParams;
|
||||
use crate::proxy::{connect_auth, connect_compute};
|
||||
use crate::rate_limiter::EndpointRateLimiter;
|
||||
@@ -76,9 +77,11 @@ impl PoolingBackend {
|
||||
};
|
||||
|
||||
let ep = EndpointIdInt::from(&user_info.endpoint);
|
||||
let role = RoleNameInt::from(&user_info.user);
|
||||
let auth_outcome = crate::auth::validate_password_and_exchange(
|
||||
&self.config.authentication_config.thread_pool,
|
||||
&self.config.authentication_config.scram_thread_pool,
|
||||
ep,
|
||||
role,
|
||||
password,
|
||||
secret,
|
||||
)
|
||||
@@ -457,15 +460,14 @@ impl ReportableError for HttpConnError {
|
||||
match self {
|
||||
HttpConnError::ConnectError(_) => ErrorKind::Compute,
|
||||
HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute,
|
||||
HttpConnError::PostgresConnectionError(p) => {
|
||||
if p.as_db_error().is_some() {
|
||||
// postgres rejected the connection
|
||||
ErrorKind::Postgres
|
||||
} else {
|
||||
// couldn't even reach postgres
|
||||
ErrorKind::Compute
|
||||
}
|
||||
}
|
||||
HttpConnError::PostgresConnectionError(p) => match p.as_db_error() {
|
||||
// user provided a wrong database name
|
||||
Some(err) if err.code() == &SqlState::INVALID_CATALOG_NAME => ErrorKind::User,
|
||||
// postgres rejected the connection
|
||||
Some(_) => ErrorKind::Postgres,
|
||||
// couldn't even reach postgres
|
||||
None => ErrorKind::Compute,
|
||||
},
|
||||
HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute,
|
||||
HttpConnError::ComputeCtl(_) => ErrorKind::Service,
|
||||
HttpConnError::JwtPayloadError(_) => ErrorKind::User,
|
||||
|
||||
@@ -192,34 +192,29 @@ pub(crate) async fn handle(
|
||||
let line = get(db_error, |db| db.line().map(|l| l.to_string()));
|
||||
let routine = get(db_error, |db| db.routine());
|
||||
|
||||
match &e {
|
||||
SqlOverHttpError::Postgres(e)
|
||||
if e.as_db_error().is_some() && error_kind == ErrorKind::User =>
|
||||
{
|
||||
// this error contains too much info, and it's not an error we care about.
|
||||
if tracing::enabled!(Level::DEBUG) {
|
||||
tracing::debug!(
|
||||
kind=error_kind.to_metric_label(),
|
||||
error=%e,
|
||||
msg=message,
|
||||
"forwarding error to user"
|
||||
);
|
||||
} else {
|
||||
tracing::info!(
|
||||
kind = error_kind.to_metric_label(),
|
||||
error = "bad query",
|
||||
"forwarding error to user"
|
||||
);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
tracing::info!(
|
||||
if db_error.is_some() && error_kind == ErrorKind::User {
|
||||
// this error contains too much info, and it's not an error we care about.
|
||||
if tracing::enabled!(Level::DEBUG) {
|
||||
debug!(
|
||||
kind=error_kind.to_metric_label(),
|
||||
error=%e,
|
||||
msg=message,
|
||||
"forwarding error to user"
|
||||
);
|
||||
} else {
|
||||
info!(
|
||||
kind = error_kind.to_metric_label(),
|
||||
error = "bad query",
|
||||
"forwarding error to user"
|
||||
);
|
||||
}
|
||||
} else {
|
||||
info!(
|
||||
kind=error_kind.to_metric_label(),
|
||||
error=%e,
|
||||
msg=message,
|
||||
"forwarding error to user"
|
||||
);
|
||||
}
|
||||
|
||||
json_response(
|
||||
|
||||
@@ -387,6 +387,7 @@ pub fn get_filesystem_usage(path: &std::path::Path) -> u64 {
|
||||
critical_timeline!(
|
||||
placeholder_ttid.tenant_id,
|
||||
placeholder_ttid.timeline_id,
|
||||
None::<&AtomicBool>,
|
||||
"Global disk usage watcher failed to read filesystem usage: {:?}",
|
||||
e
|
||||
);
|
||||
|
||||
@@ -518,6 +518,7 @@ pub async fn time_io_closure<E: Into<anyhow::Error>>(
|
||||
pub struct FullTimelineInfo {
|
||||
pub ttid: TenantTimelineId,
|
||||
pub ps_feedback_count: u64,
|
||||
pub ps_corruption_detected: bool,
|
||||
pub last_ps_feedback: PageserverFeedback,
|
||||
pub wal_backup_active: bool,
|
||||
pub timeline_is_active: bool,
|
||||
@@ -547,6 +548,7 @@ pub struct TimelineCollector {
|
||||
ps_last_received_lsn: GenericGaugeVec<AtomicU64>,
|
||||
feedback_last_time_seconds: GenericGaugeVec<AtomicU64>,
|
||||
ps_feedback_count: GenericGaugeVec<AtomicU64>,
|
||||
ps_corruption_detected: IntGaugeVec,
|
||||
timeline_active: GenericGaugeVec<AtomicU64>,
|
||||
wal_backup_active: GenericGaugeVec<AtomicU64>,
|
||||
connected_computes: IntGaugeVec,
|
||||
@@ -654,6 +656,15 @@ impl TimelineCollector {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let ps_corruption_detected = IntGaugeVec::new(
|
||||
Opts::new(
|
||||
"safekeeper_ps_corruption_detected",
|
||||
"1 if corruption was detected in the timeline according to feedback from the pageserver, 0 otherwise",
|
||||
),
|
||||
&["tenant_id", "timeline_id"],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let timeline_active = GenericGaugeVec::new(
|
||||
Opts::new(
|
||||
"safekeeper_timeline_active",
|
||||
@@ -774,6 +785,7 @@ impl TimelineCollector {
|
||||
ps_last_received_lsn,
|
||||
feedback_last_time_seconds,
|
||||
ps_feedback_count,
|
||||
ps_corruption_detected,
|
||||
timeline_active,
|
||||
wal_backup_active,
|
||||
connected_computes,
|
||||
@@ -892,6 +904,9 @@ impl Collector for TimelineCollector {
|
||||
self.ps_feedback_count
|
||||
.with_label_values(labels)
|
||||
.set(tli.ps_feedback_count);
|
||||
self.ps_corruption_detected
|
||||
.with_label_values(labels)
|
||||
.set(tli.ps_corruption_detected as i64);
|
||||
if let Ok(unix_time) = tli
|
||||
.last_ps_feedback
|
||||
.replytime
|
||||
@@ -925,6 +940,7 @@ impl Collector for TimelineCollector {
|
||||
mfs.extend(self.ps_last_received_lsn.collect());
|
||||
mfs.extend(self.feedback_last_time_seconds.collect());
|
||||
mfs.extend(self.ps_feedback_count.collect());
|
||||
mfs.extend(self.ps_corruption_detected.collect());
|
||||
mfs.extend(self.timeline_active.collect());
|
||||
mfs.extend(self.wal_backup_active.collect());
|
||||
mfs.extend(self.connected_computes.collect());
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, anyhow};
|
||||
@@ -305,6 +306,9 @@ impl InterpretedWalReader {
|
||||
critical_timeline!(
|
||||
ttid.tenant_id,
|
||||
ttid.timeline_id,
|
||||
// Hadron: The corruption flag is only used in PS so that it can feed this information back to SKs.
|
||||
// We do not use these flags in SKs.
|
||||
None::<&AtomicBool>,
|
||||
"failed to read WAL record: {err:?}"
|
||||
);
|
||||
}
|
||||
@@ -375,6 +379,9 @@ impl InterpretedWalReader {
|
||||
critical_timeline!(
|
||||
ttid.tenant_id,
|
||||
ttid.timeline_id,
|
||||
// Hadron: The corruption flag is only used in PS so that it can feed this information back to SKs.
|
||||
// We do not use these flags in SKs.
|
||||
None::<&AtomicBool>,
|
||||
"failed to decode WAL record: {err:?}"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -55,6 +55,7 @@ pub struct WalSenders {

pub struct WalSendersTimelineMetricValues {
pub ps_feedback_counter: u64,
pub ps_corruption_detected: bool,
pub last_ps_feedback: PageserverFeedback,
pub interpreted_wal_reader_tasks: usize,
}
@@ -193,6 +194,7 @@ impl WalSenders {

WalSendersTimelineMetricValues {
ps_feedback_counter: shared.ps_feedback_counter,
ps_corruption_detected: shared.ps_corruption_detected,
last_ps_feedback: shared.last_ps_feedback,
interpreted_wal_reader_tasks,
}
@@ -209,6 +211,9 @@ impl WalSenders {
*shared.get_slot_mut(id).get_mut_feedback() = ReplicationFeedback::Pageserver(*feedback);
shared.last_ps_feedback = *feedback;
shared.ps_feedback_counter += 1;
if feedback.corruption_detected {
shared.ps_corruption_detected = true;
}
drop(shared);

RECEIVED_PS_FEEDBACKS.inc();
@@ -278,6 +283,9 @@ struct WalSendersShared {
last_ps_feedback: PageserverFeedback,
// total counter of pageserver feedbacks received
ps_feedback_counter: u64,
// Hadron: true iff we received a pageserver feedback that indicated
// data corruption in the timeline
ps_corruption_detected: bool,
slots: Vec<Option<WalSenderState>>,
}

@@ -328,6 +336,7 @@ impl WalSendersShared {
agg_standby_feedback: StandbyFeedback::empty(),
last_ps_feedback: PageserverFeedback::empty(),
ps_feedback_counter: 0,
ps_corruption_detected: false,
slots: Vec::new(),
}
}
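The handler above treats ps_corruption_detected as a sticky latch: one feedback message with corruption_detected set flips it to true, and nothing in this struct ever clears it. A minimal sketch of that latch semantics in isolation (illustrative types using atomics, not the mutex-guarded shared state the safekeeper actually uses):

use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};

/// Illustrative stand-in for the shared per-timeline feedback state.
#[derive(Default)]
struct FeedbackState {
    ps_feedback_counter: AtomicU64,
    ps_corruption_detected: AtomicBool,
}

impl FeedbackState {
    fn record_feedback(&self, corruption_detected: bool) {
        self.ps_feedback_counter.fetch_add(1, Ordering::Relaxed);
        // fetch_or makes the flag sticky: once true, later clean feedback cannot clear it.
        self.ps_corruption_detected
            .fetch_or(corruption_detected, Ordering::Relaxed);
    }
}

fn main() {
    let state = FeedbackState::default();
    state.record_feedback(false);
    state.record_feedback(true);
    state.record_feedback(false);
    assert!(state.ps_corruption_detected.load(Ordering::Relaxed));
    assert_eq!(state.ps_feedback_counter.load(Ordering::Relaxed), 3);
}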
@@ -839,6 +839,7 @@ impl Timeline {

let WalSendersTimelineMetricValues {
ps_feedback_counter,
ps_corruption_detected,
last_ps_feedback,
interpreted_wal_reader_tasks,
} = self.walsenders.info_for_metrics();
@@ -847,6 +848,7 @@ impl Timeline {
Some(FullTimelineInfo {
ttid: self.ttid,
ps_feedback_count: ps_feedback_counter,
ps_corruption_detected,
last_ps_feedback,
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
timeline_is_active: self.broker_active.load(Ordering::Relaxed),
@@ -12,7 +12,7 @@ use futures::stream::{self, FuturesOrdered};
use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
use remote_storage::{
DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata,
DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata,
};
use safekeeper_api::models::PeerInfo;
use tokio::fs::File;
@@ -607,6 +607,9 @@ pub(crate) async fn copy_partial_segment(
storage.copy_object(source, destination, &cancel).await
}

const WAL_READ_WARN_THRESHOLD: u32 = 2;
const WAL_READ_MAX_RETRIES: u32 = 3;

pub async fn read_object(
storage: &GenericRemoteStorage,
file_path: &RemotePath,
@@ -620,12 +623,23 @@ pub async fn read_object(
byte_start: std::ops::Bound::Included(offset),
..Default::default()
};
let download = storage
.download(file_path, &opts, &cancel)
.await
.with_context(|| {
format!("Failed to open WAL segment download stream for remote path {file_path:?}")
})?;

// This retry only handles connection errors: subsequent reads can still fail, since this
// function returns a stream.
let download = backoff::retry(
|| async { storage.download(file_path, &opts, &cancel).await },
DownloadError::is_permanent,
WAL_READ_WARN_THRESHOLD,
WAL_READ_MAX_RETRIES,
"download WAL segment",
&cancel,
)
.await
.ok_or_else(|| DownloadError::Cancelled)
.and_then(|x| x)
.with_context(|| {
format!("Failed to open WAL segment download stream for remote path {file_path:?}")
})?;

let reader = tokio_util::io::StreamReader::new(download.download_stream);
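The .ok_or_else(|| DownloadError::Cancelled).and_then(|x| x) chain above flattens the shape a cancellable retry helper typically yields: None when cancelled, Some(Err(..)) when retries were exhausted on a permanent error, Some(Ok(..)) on success. A small self-contained sketch of that flattening, with a hypothetical stand-in error type (plain Rust, no neon crates):

#[derive(Debug, PartialEq)]
enum DownloadError {
    Cancelled,
    Other(String),
}

/// Flatten the `Option<Result<_, _>>` returned by a cancellable retry helper
/// into a single `Result`, mapping cancellation to an explicit error.
fn flatten_retry<T>(outcome: Option<Result<T, DownloadError>>) -> Result<T, DownloadError> {
    outcome
        .ok_or(DownloadError::Cancelled) // None => the retry loop was cancelled
        .and_then(|result| result) // Some(Ok(..)) / Some(Err(..)) pass through unchanged
}

fn main() {
    assert_eq!(flatten_retry(Some(Ok(42))), Ok(42));
    assert_eq!(flatten_retry::<u32>(None), Err(DownloadError::Cancelled));
    assert_eq!(
        flatten_retry::<u32>(Some(Err(DownloadError::Other("boom".into())))),
        Err(DownloadError::Other("boom".into()))
    );
}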
@@ -6,13 +6,16 @@ use std::time::Duration;

use anyhow::Context;
use compute_api::spec::PageserverProtocol;
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
use compute_api::spec::PageserverShardInfo;
use control_plane::endpoint::{
ComputeControlPlane, EndpointStatus, PageserverConnectionInfo, PageserverShardConnectionInfo,
};
use control_plane::local_env::LocalEnv;
use futures::StreamExt;
use hyper::StatusCode;
use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT;
use pageserver_api::controller_api::AvailabilityZone;
use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId};
use pageserver_api::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId};
use postgres_connection::parse_host_port;
use safekeeper_api::membership::SafekeeperGeneration;
use serde::{Deserialize, Serialize};
@@ -506,27 +509,64 @@ impl ApiMethod for ComputeHookTenant {
if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("Reconfiguring pageservers for endpoint {endpoint_name}");

let pageservers = shards
.iter()
.map(|shard| {
let ps_conf = env
.get_pageserver_conf(shard.node_id)
.expect("Unknown pageserver");
if endpoint.grpc {
let addr = ps_conf.listen_grpc_addr.as_ref().expect("no gRPC address");
let (host, port) = parse_host_port(addr).expect("invalid gRPC address");
let port = port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT);
(PageserverProtocol::Grpc, host, port)
} else {
let (host, port) = parse_host_port(&ps_conf.listen_pg_addr)
.expect("Unable to parse listen_pg_addr");
(PageserverProtocol::Libpq, host, port.unwrap_or(5432))
}
})
.collect::<Vec<_>>();
let shard_count = match shards.len() {
1 => ShardCount::unsharded(),
n => ShardCount(n.try_into().expect("too many shards")),
};

let mut shard_infos: HashMap<ShardIndex, PageserverShardInfo> = HashMap::new();

let prefer_protocol = if endpoint.grpc {
PageserverProtocol::Grpc
} else {
PageserverProtocol::Libpq
};

for shard in shards.iter() {
let ps_conf = env
.get_pageserver_conf(shard.node_id)
.expect("Unknown pageserver");

let libpq_url = Some({
let (host, port) = parse_host_port(&ps_conf.listen_pg_addr)
.expect("Unable to parse listen_pg_addr");
let port = port.unwrap_or(5432);
format!("postgres://no_user@{host}:{port}")
});
let grpc_url = if let Some(grpc_addr) = &ps_conf.listen_grpc_addr {
let (host, port) =
parse_host_port(grpc_addr).expect("invalid gRPC address");
let port = port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT);
Some(format!("grpc://no_user@{host}:{port}"))
} else {
None
};
let pageserver = PageserverShardConnectionInfo {
id: Some(shard.node_id),
libpq_url,
grpc_url,
};
let shard_info = PageserverShardInfo {
pageservers: vec![pageserver],
};
shard_infos.insert(
ShardIndex {
shard_number: shard.shard_number,
shard_count,
},
shard_info,
);
}

let pageserver_conninfo = PageserverConnectionInfo {
shard_count,
stripe_size: stripe_size.map(|val| ShardStripeSize(val.0)),
shards: shard_infos,
prefer_protocol,
};

endpoint
.reconfigure_pageservers(pageservers, *stripe_size)
.reconfigure_pageservers(&pageserver_conninfo)
.await
.map_err(NotifyError::NeonLocal)?;
}
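A standalone sketch of the per-shard URL construction above, with the defaults factored out. The gRPC default port below is an assumed placeholder; the real value comes from pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT:

/// Placeholder defaults for illustration only.
const DEFAULT_PG_PORT: u16 = 5432;
const DEFAULT_GRPC_PORT: u16 = 51051; // assumed placeholder

/// Build the per-shard connection URLs the compute is reconfigured with:
/// always a libpq URL, plus a gRPC URL when a gRPC listen address is configured.
fn shard_urls(
    pg_host: &str,
    pg_port: Option<u16>,
    grpc_addr: Option<(&str, Option<u16>)>,
) -> (String, Option<String>) {
    let libpq_url = format!(
        "postgres://no_user@{}:{}",
        pg_host,
        pg_port.unwrap_or(DEFAULT_PG_PORT)
    );
    let grpc_url = grpc_addr.map(|(host, port)| {
        format!("grpc://no_user@{}:{}", host, port.unwrap_or(DEFAULT_GRPC_PORT))
    });
    (libpq_url, grpc_url)
}

fn main() {
    let (libpq, grpc) = shard_urls("127.0.0.1", None, Some(("127.0.0.1", None)));
    println!("{libpq} {grpc:?}");
}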
@@ -46,11 +46,31 @@ impl TenantShardDrain {
&self,
tenants: &BTreeMap<TenantShardId, TenantShard>,
scheduler: &Scheduler,
) -> Option<NodeId> {
let tenant_shard = tenants.get(&self.tenant_shard_id)?;
) -> TenantShardDrainAction {
let Some(tenant_shard) = tenants.get(&self.tenant_shard_id) else {
return TenantShardDrainAction::Skip;
};

if *tenant_shard.intent.get_attached() != Some(self.drained_node) {
return None;
// If the intent attached node is not the drained node, check the observed state
// of the shard on the drained node. If it is Attached*, it means the shard is
// being migrated from the drained node. The drain loop needs to wait for the
// reconciliation to complete for a smooth draining.

use pageserver_api::models::LocationConfigMode::*;

let attach_mode = tenant_shard
.observed
.locations
.get(&self.drained_node)
.and_then(|observed| observed.conf.as_ref().map(|conf| conf.mode));

return match (attach_mode, tenant_shard.intent.get_attached()) {
(Some(AttachedSingle | AttachedMulti | AttachedStale), Some(intent_node_id)) => {
TenantShardDrainAction::Reconcile(*intent_node_id)
}
_ => TenantShardDrainAction::Skip,
};
}

// Only tenants with a normal (Active) scheduling policy are proactively moved
@@ -63,19 +83,19 @@ impl TenantShardDrain {
}
ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
// If we have been asked to avoid rescheduling this shard, then do not migrate it during a drain
return None;
return TenantShardDrainAction::Skip;
}
}

match tenant_shard.preferred_secondary(scheduler) {
Some(node) => Some(node),
Some(node) => TenantShardDrainAction::RescheduleToSecondary(node),
None => {
tracing::warn!(
tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
"No eligible secondary while draining {}", self.drained_node
);

None
TenantShardDrainAction::Skip
}
}
}
@@ -138,3 +158,17 @@ impl TenantShardDrain {
}
}
}

/// Action to take when draining a tenant shard.
pub(crate) enum TenantShardDrainAction {
/// The tenant shard is on the draining node.
/// Reschedule the tenant shard to a secondary location.
/// Holds a destination node id to reschedule to.
RescheduleToSecondary(NodeId),
/// The tenant shard is being migrated from the draining node.
/// Wait for the reconciliation to complete.
/// Holds the intent attached node id.
Reconcile(NodeId),
/// The tenant shard is not eligible for draining, skip it.
Skip,
}
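A simplified, self-contained model of the three-way decision that TenantShardDrainAction encodes; it ignores the scheduling-policy and secondary-availability checks above, and the types are stand-ins, not the storage controller's:

#[derive(Debug, PartialEq)]
enum DrainAction {
    RescheduleToSecondary,
    Reconcile,
    Skip,
}

/// `intent_on_drained`: the scheduler still intends the shard to be attached
/// on the node being drained.
/// `observed_attached_on_drained`: the drained node currently reports an
/// Attached* location config for the shard.
fn drain_action(intent_on_drained: bool, observed_attached_on_drained: bool) -> DrainAction {
    if intent_on_drained {
        DrainAction::RescheduleToSecondary
    } else if observed_attached_on_drained {
        // Migration away from the drained node is already in flight; wait for it.
        DrainAction::Reconcile
    } else {
        DrainAction::Skip
    }
}

fn main() {
    assert_eq!(drain_action(true, true), DrainAction::RescheduleToSecondary);
    assert_eq!(drain_action(false, true), DrainAction::Reconcile);
    assert_eq!(drain_action(false, false), DrainAction::Skip);
}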
@@ -79,7 +79,7 @@ use crate::id_lock_map::{
use crate::leadership::Leadership;
use crate::metrics;
use crate::node::{AvailabilityTransition, Node};
use crate::operation_utils::{self, TenantShardDrain};
use crate::operation_utils::{self, TenantShardDrain, TenantShardDrainAction};
use crate::pageserver_client::PageserverClient;
use crate::peer_client::GlobalObservedState;
use crate::persistence::split_state::SplitState;
@@ -1274,7 +1274,7 @@ impl Service {
// Always attempt autosplits. Sharding is crucial for bulk ingest performance, so we
// must be responsive when new projects begin ingesting and reach the threshold.
self.autosplit_tenants().await;
}
},
_ = self.reconcilers_cancel.cancelled() => return
}
}
@@ -8876,6 +8876,9 @@ impl Service {
for (_tenant_id, schedule_context, shards) in
TenantShardExclusiveIterator::new(tenants, ScheduleMode::Speculative)
{
if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS {
break;
}
for shard in shards {
if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS {
break;
@@ -9640,16 +9643,16 @@ impl Service {
tenant_shard_id: tid,
};

let dest_node_id = {
let drain_action = {
let locked = self.inner.read().unwrap();
tid_drain.tenant_shard_eligible_for_drain(&locked.tenants, &locked.scheduler)
};

match tid_drain
.tenant_shard_eligible_for_drain(&locked.tenants, &locked.scheduler)
{
Some(node_id) => node_id,
None => {
continue;
}
let dest_node_id = match drain_action {
TenantShardDrainAction::RescheduleToSecondary(dest_node_id) => dest_node_id,
TenantShardDrainAction::Reconcile(intent_node_id) => intent_node_id,
TenantShardDrainAction::Skip => {
continue;
}
};

@@ -9684,14 +9687,16 @@ impl Service {
{
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
let rescheduled = tid_drain.reschedule_to_secondary(
dest_node_id,
tenants,
scheduler,
nodes,
)?;

if let Some(tenant_shard) = rescheduled {
let tenant_shard = match drain_action {
TenantShardDrainAction::RescheduleToSecondary(dest_node_id) => tid_drain
.reschedule_to_secondary(dest_node_id, tenants, scheduler, nodes)?,
TenantShardDrainAction::Reconcile(_) => tenants.get_mut(&tid),
// Note: Unreachable, handled above.
TenantShardDrainAction::Skip => None,
};

if let Some(tenant_shard) = tenant_shard {
let waiter = self.maybe_configured_reconcile_shard(
tenant_shard,
nodes,
@@ -812,8 +812,6 @@ impl TenantShard {
/// if the swap is not possible and leaves the intent state in its original state.
///
/// Arguments:
/// `attached_to`: the currently attached location matching the intent state (may be None if the
/// shard is not attached)
/// `promote_to`: an optional secondary location of this tenant shard. If set to None, we ask
/// the scheduler to recommend a node
pub(crate) fn reschedule_to_secondary(
@@ -5280,16 +5280,32 @@ class EndpointFactory:
)

def stop_all(self, fail_on_error=True) -> Self:
exception = None
for ep in self.endpoints:
"""
Stop all the endpoints in parallel.
"""

# Note: raising an exception from a task in a task group cancels
# all the other tasks. We don't want that, hence the 'stop_one'
# function catches exceptions and puts them on the 'exceptions'
# list for later processing.
exceptions = []

async def stop_one(ep):
try:
ep.stop()
await asyncio.to_thread(ep.stop)
except Exception as e:
log.error(f"Failed to stop endpoint {ep.endpoint_id}: {e}")
exception = e
exceptions.append(e)

if fail_on_error and exception is not None:
raise exception
async def async_stop_all():
async with asyncio.TaskGroup() as tg:
for ep in self.endpoints:
tg.create_task(stop_one(ep))

asyncio.run(async_stop_all())

if fail_on_error and exceptions:
raise ExceptionGroup("stopping an endpoint failed", exceptions)

return self
@@ -2,13 +2,15 @@ from __future__ import annotations

from typing import TYPE_CHECKING

import pytest
from fixtures.utils import wait_until

if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnvBuilder


def test_basebackup_cache(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("grpc", [True, False])
def test_basebackup_cache(neon_env_builder: NeonEnvBuilder, grpc: bool):
"""
Simple test for basebackup cache.
1. Check that we always hit the cache after compute restart.
@@ -22,7 +24,7 @@ def test_basebackup_cache(neon_env_builder: NeonEnvBuilder):
"""

env = neon_env_builder.init_start()
ep = env.endpoints.create("main")
ep = env.endpoints.create("main", grpc=grpc)
ps = env.pageserver
ps_http = ps.http_client()
@@ -863,6 +863,88 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder):
assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*")


@pytest.mark.skip(reason="Lakebase mode")
def test_ps_corruption_detection_feedback(neon_env_builder: NeonEnvBuilder):
"""
Test that when the pageserver detects corruption during image layer creation,
it sends corruption feedback to the safekeeper which gets recorded in its
safekeeper_ps_corruption_detected metric.
"""
# Configure tenant with aggressive compaction settings to easily trigger compaction
TENANT_CONF = {
# Small checkpoint distance to create many layers
"checkpoint_distance": 1024 * 128,
# Compact small layers
"compaction_target_size": 1024 * 128,
# Create image layers eagerly
"image_creation_threshold": 1,
"image_layer_creation_check_threshold": 0,
# Force frequent compaction
"compaction_period": "1s",
}

env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
# We are simulating compaction failures so we should allow these error messages.
env.pageserver.allowed_errors.append(".*Compaction failed.*")
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline

pageserver_http = env.pageserver.http_client()
workload = Workload(env, tenant_id, timeline_id)
workload.init()

# Enable the failpoint that will cause image layer creation to fail due to a (simulated) detected
# corruption.
pageserver_http.configure_failpoints(("create-image-layer-fail-simulated-corruption", "return"))

# Write some data to trigger compaction and image layer creation
log.info("Writing data to trigger compaction...")
workload.write_rows(1024 * 64, upload=False)
workload.write_rows(1024 * 64, upload=False)

# Returns True if the corruption signal from PS is propagated to the SK according to the "safekeeper_ps_corruption_detected" metric.
# Raises an exception otherwise.
def check_corruption_signal_propagated_to_sk():
# Get metrics from all safekeepers
for sk in env.safekeepers:
sk_metrics = sk.http_client().get_metrics()
# Look for our corruption detected metric with the right tenant and timeline
corruption_metrics = sk_metrics.query_all("safekeeper_ps_corruption_detected")

for metric in corruption_metrics:
# Check if there's a metric for our tenant and timeline that has value 1
if (
metric.labels.get("tenant_id") == str(tenant_id)
and metric.labels.get("timeline_id") == str(timeline_id)
and metric.value == 1
):
log.info(f"Corruption detected by safekeeper {sk.id}: {metric}")
return True
raise Exception("Corruption detection feedback not found in any safekeeper metrics")

# Returns True if the corruption signal from PS is propagated to the PG according to the "ps_corruption_detected" metric
# in "neon_perf_counters".
# Raises an exception otherwise.
def check_corruption_signal_propagated_to_pg():
endpoint = workload.endpoint()
results = endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon")
results = endpoint.safe_psql(
"SELECT value FROM neon_perf_counters WHERE metric = 'ps_corruption_detected'"
)
log.info("Query corruption detection metric, results: %s", results)
if results[0][0] == 1:
log.info("Corruption detection signal is raised on Postgres")
return True
raise Exception("Corruption detection signal is not raised on Postgres")

# Confirm that the corruption signal propagates to both the safekeeper and Postgres
wait_until(check_corruption_signal_propagated_to_sk, timeout=10, interval=0.1)
wait_until(check_corruption_signal_propagated_to_pg, timeout=10, interval=0.1)

# Cleanup the failpoint
pageserver_http.configure_failpoints(("create-image-layer-fail-simulated-corruption", "off"))


@pytest.mark.parametrize("enabled", [True, False])
def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool):
tenant_conf = {
@@ -129,7 +129,10 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
Test static endpoint is protected from GC by acquiring and renewing lsn leases.
"""

LSN_LEASE_LENGTH = 8
LSN_LEASE_LENGTH = (
14  # This value needs to be large enough for compute_ctl to send two lease requests.
)

neon_env_builder.num_pageservers = 2
# GC is triggered manually.
env = neon_env_builder.init_start(
@@ -230,6 +233,15 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
log.info(f"`SELECT` query succeed after GC, {ctx=}")
return offset

# It's not reliable to let the compute renew the lease in this test case as we have a very tight
# lease timeout. Therefore, the test case itself will renew the lease.
#
# This is a workaround to make the test case more deterministic.
def renew_lease(env: NeonEnv, lease_lsn: Lsn):
env.storage_controller.pageserver_api().timeline_lsn_lease(
env.initial_tenant, env.initial_timeline, lease_lsn
)

# Insert some records on main branch
with env.endpoints.create_start("main", config_lines=["shared_buffers=1MB"]) as ep_main:
with ep_main.cursor() as cur:
@@ -242,6 +254,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
XLOG_BLCKSZ = 8192
lsn = Lsn((int(lsn) // XLOG_BLCKSZ) * XLOG_BLCKSZ)

# We need to mock the way cplane works: it gets a lease for a branch before starting the compute.
renew_lease(env, lsn)

with env.endpoints.create_start(
branch_name="main",
endpoint_id="static",
@@ -251,9 +266,6 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
cur.execute("SELECT count(*) FROM t0")
assert cur.fetchone() == (ROW_COUNT,)

# Wait for static compute to renew lease at least once.
time.sleep(LSN_LEASE_LENGTH / 2)

generate_updates_on_main(env, ep_main, 3, end=100)

offset = trigger_gc_and_select(
@@ -263,10 +275,10 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
# Trigger Pageserver restarts
for ps in env.pageservers:
ps.stop()
# Static compute should have at least one lease request failure due to connection.
time.sleep(LSN_LEASE_LENGTH / 2)
ps.start()

renew_lease(env, lsn)

trigger_gc_and_select(
env,
ep_static,
@@ -282,6 +294,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
)
env.storage_controller.reconcile_until_idle()

# Wait for static compute to renew lease on the new pageserver.
time.sleep(LSN_LEASE_LENGTH + 3)

trigger_gc_and_select(
env,
ep_static,
@@ -292,7 +307,6 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):

# Do some update so we can increment gc_cutoff
generate_updates_on_main(env, ep_main, i, end=100)

# Wait for the existing lease to expire.
time.sleep(LSN_LEASE_LENGTH + 1)
# Now trigger GC again, layers should be removed.
@@ -1,5 +1,6 @@
from __future__ import annotations

import os
from concurrent.futures import ThreadPoolExecutor
from typing import TYPE_CHECKING

@@ -768,6 +769,14 @@ def test_lsn_lease_storcon(neon_env_builder: NeonEnvBuilder):
"compaction_period": "0s",
}
env = neon_env_builder.init_start(initial_tenant_conf=conf)
# ShardSplit is slow in debug builds, so ignore the warning
if os.getenv("BUILD_TYPE", "debug") == "debug":
env.storage_controller.allowed_errors.extend(
[
".*Exclusive lock by ShardSplit was held.*",
]
)

with env.endpoints.create_start(
"main",
) as ep: