mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-16 20:50:37 +00:00
Compare commits
73 Commits
problame/s
...
cloneable/
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef737e7d7c | ||
|
|
5c934efb29 | ||
|
|
5c9c3b3317 | ||
|
|
921a4f2009 | ||
|
|
eb93c3e3c6 | ||
|
|
7a7ab2a1d1 | ||
|
|
ff526a1051 | ||
|
|
9a2456bea5 | ||
|
|
a456e818af | ||
|
|
3e6fdb0aa6 | ||
|
|
f8d3f86f58 | ||
|
|
f67a8a173e | ||
|
|
2288efae66 | ||
|
|
4fedcbc0ac | ||
|
|
eb830fa547 | ||
|
|
a203f9829a | ||
|
|
42ab34dc36 | ||
|
|
30b877074c | ||
|
|
f18cc808f0 | ||
|
|
d14d8271b8 | ||
|
|
fecb707b19 | ||
|
|
296c9190b2 | ||
|
|
a5fe67f361 | ||
|
|
ee7bb1a667 | ||
|
|
9bba31bf68 | ||
|
|
380d167b7c | ||
|
|
cb991fba42 | ||
|
|
4566b12a22 | ||
|
|
63ca084696 | ||
|
|
379259bdd7 | ||
|
|
3300207523 | ||
|
|
a0a7733b5a | ||
|
|
f4245403b3 | ||
|
|
a8db7ebffb | ||
|
|
154f6dc59c | ||
|
|
15f633922a | ||
|
|
c34d36d8a2 | ||
|
|
cec0543b51 | ||
|
|
8aa9540a05 | ||
|
|
b91f821e8b | ||
|
|
44ea17b7b2 | ||
|
|
1b7339b53e | ||
|
|
3593fe195a | ||
|
|
c5aaf1ae21 | ||
|
|
13b5e7b26f | ||
|
|
dcdfe80bf0 | ||
|
|
8630d37f5e | ||
|
|
2fc77c836b | ||
|
|
2c6b327be6 | ||
|
|
be5bbaecad | ||
|
|
d33b3c7457 | ||
|
|
ffeede085e | ||
|
|
bdca5b500b | ||
|
|
f4b03ddd7b | ||
|
|
08b19f001c | ||
|
|
1a45b2ec90 | ||
|
|
13e38a58a1 | ||
|
|
2edd59aefb | ||
|
|
0b639ba608 | ||
|
|
28f604d628 | ||
|
|
fe0ddb7169 | ||
|
|
4bbabc092a | ||
|
|
12c26243fc | ||
|
|
2f71eda00f | ||
|
|
5ec82105cc | ||
|
|
78a6daa874 | ||
|
|
5c0de4ee8c | ||
|
|
bc6a756f1c | ||
|
|
8f3351fa91 | ||
|
|
e7d18bc188 | ||
|
|
4ee0da0a20 | ||
|
|
7049003cf7 | ||
|
|
3915995530 |
@@ -27,4 +27,4 @@
|
||||
!storage_controller/
|
||||
!vendor/postgres-*/
|
||||
!workspace_hack/
|
||||
!build_tools/patches
|
||||
!build-tools/patches
|
||||
|
||||
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -31,6 +31,7 @@ config-variables:
|
||||
- NEON_PROD_AWS_ACCOUNT_ID
|
||||
- PGREGRESS_PG16_PROJECT_ID
|
||||
- PGREGRESS_PG17_PROJECT_ID
|
||||
- PREWARM_PGBENCH_SIZE
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||
- REMOTE_STORAGE_AZURE_REGION
|
||||
- SLACK_CICD_CHANNEL_ID
|
||||
|
||||
@@ -176,7 +176,11 @@ runs:
|
||||
fi
|
||||
|
||||
if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
# We don't use code coverage for regression tests (the step is disabled),
|
||||
# so there's no need to collect it.
|
||||
# Ref https://github.com/neondatabase/neon/issues/4540
|
||||
# cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
cov_prefix=()
|
||||
else
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
@@ -150,7 +150,7 @@ jobs:
|
||||
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
|
||||
use-fallback: false
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
@@ -162,7 +162,7 @@ jobs:
|
||||
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
|
||||
use-fallback: false
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
@@ -174,7 +174,7 @@ jobs:
|
||||
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
|
||||
use-fallback: false
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
|
||||
|
||||
- name: Cache postgres v17 build
|
||||
id: cache_pg_17
|
||||
@@ -186,7 +186,7 @@ jobs:
|
||||
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
|
||||
use-fallback: false
|
||||
path: pg_install/v17
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
|
||||
|
||||
- name: Build all
|
||||
# Note: the Makefile picks up BUILD_TYPE and CARGO_PROFILE from the env variables
|
||||
|
||||
72
.github/workflows/benchmarking.yml
vendored
72
.github/workflows/benchmarking.yml
vendored
@@ -219,6 +219,7 @@ jobs:
|
||||
--ignore test_runner/performance/test_cumulative_statistics_persistence.py
|
||||
--ignore test_runner/performance/test_perf_many_relations.py
|
||||
--ignore test_runner/performance/test_perf_oltp_large_tenant.py
|
||||
--ignore test_runner/performance/test_lfc_prewarm.py
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -410,6 +411,77 @@ jobs:
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
prewarm-test:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
env:
|
||||
PGBENCH_SIZE: ${{ vars.PREWARM_PGBENCH_SIZE }}
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 17
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: "neon-staging"
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
|
||||
credentials:
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Run prewarm benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance/test_lfc_prewarm.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 5400
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
generate-matrices:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
|
||||
|
||||
@@ -72,7 +72,7 @@ jobs:
|
||||
ARCHS: ${{ inputs.archs || '["x64","arm64"]' }}
|
||||
DEBIANS: ${{ inputs.debians || '["bullseye","bookworm"]' }}
|
||||
IMAGE_TAG: |
|
||||
${{ hashFiles('build-tools.Dockerfile',
|
||||
${{ hashFiles('build-tools/Dockerfile',
|
||||
'.github/workflows/build-build-tools-image.yml') }}
|
||||
run: |
|
||||
echo "archs=${ARCHS}" | tee -a ${GITHUB_OUTPUT}
|
||||
@@ -144,7 +144,7 @@ jobs:
|
||||
|
||||
- uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0
|
||||
with:
|
||||
file: build-tools.Dockerfile
|
||||
file: build-tools/Dockerfile
|
||||
context: .
|
||||
provenance: false
|
||||
push: true
|
||||
|
||||
18
.github/workflows/build_and_test.yml
vendored
18
.github/workflows/build_and_test.yml
vendored
@@ -87,6 +87,24 @@ jobs:
|
||||
uses: ./.github/workflows/build-build-tools-image.yml
|
||||
secrets: inherit
|
||||
|
||||
lint-openapi-spec:
|
||||
runs-on: ubuntu-22.04
|
||||
needs: [ meta, check-permissions ]
|
||||
# We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
- uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
- run: make lint-openapi-spec
|
||||
|
||||
check-codestyle-python:
|
||||
needs: [ meta, check-permissions, build-build-tools-image ]
|
||||
# No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
|
||||
8
.gitmodules
vendored
8
.gitmodules
vendored
@@ -1,16 +1,16 @@
|
||||
[submodule "vendor/postgres-v14"]
|
||||
path = vendor/postgres-v14
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
url = ../postgres.git
|
||||
branch = REL_14_STABLE_neon
|
||||
[submodule "vendor/postgres-v15"]
|
||||
path = vendor/postgres-v15
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
url = ../postgres.git
|
||||
branch = REL_15_STABLE_neon
|
||||
[submodule "vendor/postgres-v16"]
|
||||
path = vendor/postgres-v16
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
url = ../postgres.git
|
||||
branch = REL_16_STABLE_neon
|
||||
[submodule "vendor/postgres-v17"]
|
||||
path = vendor/postgres-v17
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
url = ../postgres.git
|
||||
branch = REL_17_STABLE_neon
|
||||
|
||||
17
Cargo.lock
generated
17
Cargo.lock
generated
@@ -1348,6 +1348,7 @@ dependencies = [
|
||||
"p256 0.13.2",
|
||||
"pageserver_page_api",
|
||||
"postgres",
|
||||
"postgres-types",
|
||||
"postgres_initdb",
|
||||
"postgres_versioninfo",
|
||||
"regex",
|
||||
@@ -4293,7 +4294,9 @@ dependencies = [
|
||||
"humantime-serde",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"pageserver_client_grpc",
|
||||
"pageserver_page_api",
|
||||
"pprof",
|
||||
"rand 0.8.5",
|
||||
"reqwest",
|
||||
"serde",
|
||||
@@ -4322,6 +4325,7 @@ dependencies = [
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"remote_storage",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"svg_fmt",
|
||||
"thiserror 1.0.69",
|
||||
@@ -4498,10 +4502,15 @@ name = "pageserver_client_grpc"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arc-swap",
|
||||
"bytes",
|
||||
"compute_api",
|
||||
"futures",
|
||||
"pageserver_api",
|
||||
"pageserver_page_api",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tonic 0.13.1",
|
||||
"tracing",
|
||||
"utils",
|
||||
@@ -5281,6 +5290,7 @@ dependencies = [
|
||||
"async-trait",
|
||||
"atomic-take",
|
||||
"aws-config",
|
||||
"aws-credential-types",
|
||||
"aws-sdk-iam",
|
||||
"aws-sigv4",
|
||||
"base64 0.22.1",
|
||||
@@ -5293,6 +5303,7 @@ dependencies = [
|
||||
"clashmap",
|
||||
"compute_api",
|
||||
"consumption_metrics",
|
||||
"criterion",
|
||||
"ecdsa 0.16.9",
|
||||
"ed25519-dalek",
|
||||
"env_logger",
|
||||
@@ -5320,6 +5331,7 @@ dependencies = [
|
||||
"itoa",
|
||||
"jose-jwa",
|
||||
"jose-jwk",
|
||||
"json",
|
||||
"lasso",
|
||||
"measured",
|
||||
"metrics",
|
||||
@@ -6200,6 +6212,7 @@ dependencies = [
|
||||
"postgres-protocol",
|
||||
"postgres_backend",
|
||||
"postgres_ffi",
|
||||
"postgres_ffi_types",
|
||||
"postgres_versioninfo",
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
@@ -6244,7 +6257,7 @@ dependencies = [
|
||||
"anyhow",
|
||||
"const_format",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"postgres_ffi_types",
|
||||
"postgres_versioninfo",
|
||||
"pq_proto",
|
||||
"serde",
|
||||
@@ -6983,6 +6996,7 @@ dependencies = [
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"reqwest",
|
||||
"safekeeper_api",
|
||||
"serde_json",
|
||||
"storage_controller_client",
|
||||
"tokio",
|
||||
@@ -7552,6 +7566,7 @@ dependencies = [
|
||||
"futures-core",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
@@ -201,7 +201,7 @@ tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.g
|
||||
tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.12.0"
|
||||
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
|
||||
tokio-stream = "0.1"
|
||||
tokio-stream = { version = "0.1", features = ["sync"] }
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
|
||||
toml = "0.8"
|
||||
@@ -262,6 +262,7 @@ neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
|
||||
pageserver = { path = "./pageserver" }
|
||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
||||
pageserver_client = { path = "./pageserver/client" }
|
||||
pageserver_client_grpc = { path = "./pageserver/client_grpc" }
|
||||
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
|
||||
pageserver_page_api = { path = "./pageserver/page_api" }
|
||||
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
|
||||
|
||||
9
Makefile
9
Makefile
@@ -220,6 +220,15 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
|
||||
setup-pre-commit-hook:
|
||||
ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
|
||||
|
||||
.PHONY: lint-openapi-spec
|
||||
lint-openapi-spec:
|
||||
# operation-2xx-response: pageserver timeline delete returns 404 on success
|
||||
find . -iname "openapi_spec.y*ml" -exec\
|
||||
docker run --rm -v ${PWD}:/spec ghcr.io/redocly/cli:1.34.4\
|
||||
--skip-rule=operation-operationId --skip-rule=operation-summary --extends=minimal\
|
||||
--skip-rule=no-server-example.com --skip-rule=operation-2xx-response\
|
||||
lint {} \+
|
||||
|
||||
# Targets for building PostgreSQL are defined in postgres.mk.
|
||||
#
|
||||
# But if the caller has indicated that PostgreSQL is already
|
||||
|
||||
@@ -35,7 +35,7 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
|
||||
echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
|
||||
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
|
||||
|
||||
COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
|
||||
COPY build-tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
|
||||
|
||||
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
|
||||
set -e && \
|
||||
@@ -9,7 +9,7 @@
|
||||
#
|
||||
# build-tools: This contains Rust compiler toolchain and other tools needed at compile
|
||||
# time. This is also used for the storage builds. This image is defined in
|
||||
# build-tools.Dockerfile.
|
||||
# build-tools/Dockerfile.
|
||||
#
|
||||
# build-deps: Contains C compiler, other build tools, and compile-time dependencies
|
||||
# needed to compile PostgreSQL and most extensions. (Some extensions need
|
||||
@@ -115,7 +115,7 @@ ARG EXTENSIONS=all
|
||||
FROM $BASE_IMAGE_SHA AS build-deps
|
||||
ARG DEBIAN_VERSION
|
||||
|
||||
# Keep in sync with build-tools.Dockerfile
|
||||
# Keep in sync with build-tools/Dockerfile
|
||||
ENV PROTOC_VERSION=25.1
|
||||
|
||||
# Use strict mode for bash to catch errors early
|
||||
@@ -1790,7 +1790,7 @@ RUN set -e \
|
||||
#########################################################################################
|
||||
FROM build-deps AS exporters
|
||||
ARG TARGETARCH
|
||||
# Keep sql_exporter version same as in build-tools.Dockerfile and
|
||||
# Keep sql_exporter version same as in build-tools/Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py
|
||||
# See comment on the top of the file regading `echo`, `-e` and `\n`
|
||||
RUN if [ "$TARGETARCH" = "amd64" ]; then\
|
||||
|
||||
@@ -66,7 +66,7 @@ url.workspace = true
|
||||
uuid.workspace = true
|
||||
walkdir.workspace = true
|
||||
x509-cert.workspace = true
|
||||
|
||||
postgres-types.workspace = true
|
||||
postgres_versioninfo.workspace = true
|
||||
postgres_initdb.workspace = true
|
||||
compute_api.workspace = true
|
||||
|
||||
@@ -46,11 +46,14 @@ stateDiagram-v2
|
||||
Configuration --> Failed : Failed to configure the compute
|
||||
Configuration --> Running : Compute has been configured
|
||||
Empty --> Init : Compute spec is immediately available
|
||||
Empty --> TerminationPending : Requested termination
|
||||
Empty --> TerminationPendingFast : Requested termination
|
||||
Empty --> TerminationPendingImmediate : Requested termination
|
||||
Init --> Failed : Failed to start Postgres
|
||||
Init --> Running : Started Postgres
|
||||
Running --> TerminationPending : Requested termination
|
||||
TerminationPending --> Terminated : Terminated compute
|
||||
Running --> TerminationPendingFast : Requested termination
|
||||
Running --> TerminationPendingImmediate : Requested termination
|
||||
TerminationPendingFast --> Terminated compute with 30s delay for cplane to inspect status
|
||||
TerminationPendingImmediate --> Terminated : Terminated compute immediately
|
||||
Failed --> [*] : Compute exited
|
||||
Terminated --> [*] : Compute exited
|
||||
```
|
||||
|
||||
@@ -3,7 +3,7 @@ use chrono::{DateTime, Utc};
|
||||
use compute_api::privilege::Privilege;
|
||||
use compute_api::responses::{
|
||||
ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
|
||||
LfcPrewarmState, TlsConfig,
|
||||
LfcPrewarmState, PromoteState, TlsConfig,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
|
||||
@@ -29,8 +29,7 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
|
||||
use std::sync::{Arc, Condvar, Mutex, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{env, fs};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::{spawn, time};
|
||||
use tokio::{spawn, sync::watch, task::JoinHandle, time};
|
||||
use tracing::{Instrument, debug, error, info, instrument, warn};
|
||||
use url::Url;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -175,6 +174,7 @@ pub struct ComputeState {
|
||||
/// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
|
||||
/// mode == ComputeMode::Primary. None otherwise
|
||||
pub terminate_flush_lsn: Option<Lsn>,
|
||||
pub promote_state: Option<watch::Receiver<PromoteState>>,
|
||||
|
||||
pub metrics: ComputeMetrics,
|
||||
}
|
||||
@@ -192,6 +192,7 @@ impl ComputeState {
|
||||
lfc_prewarm_state: LfcPrewarmState::default(),
|
||||
lfc_offload_state: LfcOffloadState::default(),
|
||||
terminate_flush_lsn: None,
|
||||
promote_state: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -955,14 +956,20 @@ impl ComputeNode {
|
||||
None
|
||||
};
|
||||
|
||||
let mut delay_exit = false;
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.terminate_flush_lsn = lsn;
|
||||
if let ComputeStatus::TerminationPending { mode } = state.status {
|
||||
|
||||
let delay_exit = state.status == ComputeStatus::TerminationPendingFast;
|
||||
if state.status == ComputeStatus::TerminationPendingFast
|
||||
|| state.status == ComputeStatus::TerminationPendingImmediate
|
||||
{
|
||||
info!(
|
||||
"Changing compute status from {} to {}",
|
||||
state.status,
|
||||
ComputeStatus::Terminated
|
||||
);
|
||||
state.status = ComputeStatus::Terminated;
|
||||
self.state_changed.notify_all();
|
||||
// we were asked to terminate gracefully, don't exit to avoid restart
|
||||
delay_exit = mode == compute_api::responses::TerminateMode::Fast
|
||||
}
|
||||
drop(state);
|
||||
|
||||
@@ -1033,6 +1040,8 @@ impl ComputeNode {
|
||||
PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
|
||||
};
|
||||
|
||||
self.fix_zenith_signal_neon_signal()?;
|
||||
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.pageserver_connect_micros =
|
||||
connected.duration_since(started).as_micros() as u64;
|
||||
@@ -1042,6 +1051,27 @@ impl ComputeNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Move the Zenith signal file to Neon signal file location.
|
||||
/// This makes Compute compatible with older PageServers that don't yet
|
||||
/// know about the Zenith->Neon rename.
|
||||
fn fix_zenith_signal_neon_signal(&self) -> Result<()> {
|
||||
let datadir = Path::new(&self.params.pgdata);
|
||||
|
||||
let neonsig = datadir.join("neon.signal");
|
||||
|
||||
if neonsig.is_file() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let zenithsig = datadir.join("zenith.signal");
|
||||
|
||||
if zenithsig.is_file() {
|
||||
fs::copy(zenithsig, neonsig)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
|
||||
/// the connection was established, and the (compressed) size of the basebackup.
|
||||
fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
|
||||
@@ -1256,9 +1286,7 @@ impl ComputeNode {
|
||||
|
||||
// In case of error, log and fail the check, but don't crash.
|
||||
// We're playing it safe because these errors could be transient
|
||||
// and we don't yet retry. Also being careful here allows us to
|
||||
// be backwards compatible with safekeepers that don't have the
|
||||
// TIMELINE_STATUS API yet.
|
||||
// and we don't yet retry.
|
||||
if responses.len() < quorum {
|
||||
error!(
|
||||
"failed sync safekeepers check {:?} {:?} {:?}",
|
||||
@@ -1804,6 +1832,8 @@ impl ComputeNode {
|
||||
tls_config,
|
||||
)?;
|
||||
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
if !spec.skip_pg_catalog_updates {
|
||||
let max_concurrent_connections = spec.reconfigure_concurrency;
|
||||
// Temporarily reset max_cluster_size in config
|
||||
@@ -1823,10 +1853,9 @@ impl ComputeNode {
|
||||
|
||||
Ok(())
|
||||
})?;
|
||||
self.pg_reload_conf()?;
|
||||
}
|
||||
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
let unknown_op = "unknown".to_string();
|
||||
let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
|
||||
info!(
|
||||
@@ -1899,7 +1928,8 @@ impl ComputeNode {
|
||||
|
||||
// exit loop
|
||||
ComputeStatus::Failed
|
||||
| ComputeStatus::TerminationPending { .. }
|
||||
| ComputeStatus::TerminationPendingFast
|
||||
| ComputeStatus::TerminationPendingImmediate
|
||||
| ComputeStatus::Terminated => break 'cert_update,
|
||||
|
||||
// wait
|
||||
@@ -2455,7 +2485,7 @@ pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> {
|
||||
serde_json::to_string(&extensions).expect("failed to serialize extensions list")
|
||||
);
|
||||
}
|
||||
Err(err) => error!("could not get installed extensions: {err:?}"),
|
||||
Err(err) => error!("could not get installed extensions: {err}"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -70,7 +70,7 @@ impl ComputeNode {
|
||||
}
|
||||
};
|
||||
let row = match client
|
||||
.query_one("select * from get_prewarm_info()", &[])
|
||||
.query_one("select * from neon.get_prewarm_info()", &[])
|
||||
.await
|
||||
{
|
||||
Ok(row) => row,
|
||||
@@ -146,7 +146,7 @@ impl ComputeNode {
|
||||
ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
||||
.await
|
||||
.context("connecting to postgres")?
|
||||
.query_one("select prewarm_local_cache($1)", &[&uncompressed])
|
||||
.query_one("select neon.prewarm_local_cache($1)", &[&uncompressed])
|
||||
.await
|
||||
.context("loading LFC state into postgres")
|
||||
.map(|_| ())
|
||||
@@ -196,7 +196,7 @@ impl ComputeNode {
|
||||
ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
||||
.await
|
||||
.context("connecting to postgres")?
|
||||
.query_one("select get_local_cache_state()", &[])
|
||||
.query_one("select neon.get_local_cache_state()", &[])
|
||||
.await
|
||||
.context("querying LFC state")?
|
||||
.try_get::<usize, &[u8]>(0)
|
||||
|
||||
132
compute_tools/src/compute_promote.rs
Normal file
132
compute_tools/src/compute_promote.rs
Normal file
@@ -0,0 +1,132 @@
|
||||
use crate::compute::ComputeNode;
|
||||
use anyhow::{Context, Result, bail};
|
||||
use compute_api::{
|
||||
responses::{LfcPrewarmState, PromoteState, SafekeepersLsn},
|
||||
spec::ComputeMode,
|
||||
};
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use tokio::time::sleep;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
impl ComputeNode {
|
||||
/// Returns only when promote fails or succeeds. If a network error occurs
|
||||
/// and http client disconnects, this does not stop promotion, and subsequent
|
||||
/// calls block until promote finishes.
|
||||
/// Called by control plane on secondary after primary endpoint is terminated
|
||||
pub async fn promote(self: &Arc<Self>, safekeepers_lsn: SafekeepersLsn) -> PromoteState {
|
||||
let cloned = self.clone();
|
||||
let start_promotion = || {
|
||||
let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted);
|
||||
tokio::spawn(async move {
|
||||
tx.send(match cloned.promote_impl(safekeepers_lsn).await {
|
||||
Ok(_) => PromoteState::Completed,
|
||||
Err(err) => {
|
||||
tracing::error!(%err, "promoting");
|
||||
PromoteState::Failed {
|
||||
error: err.to_string(),
|
||||
}
|
||||
}
|
||||
})
|
||||
});
|
||||
rx
|
||||
};
|
||||
|
||||
let mut task;
|
||||
// self.state is unlocked after block ends so we lock it in promote_impl
|
||||
// and task.changed() is reached
|
||||
{
|
||||
task = self
|
||||
.state
|
||||
.lock()
|
||||
.unwrap()
|
||||
.promote_state
|
||||
.get_or_insert_with(start_promotion)
|
||||
.clone()
|
||||
}
|
||||
task.changed().await.expect("promote sender dropped");
|
||||
task.borrow().clone()
|
||||
}
|
||||
|
||||
// Why do we have to supply safekeepers?
|
||||
// For secondary we use primary_connection_conninfo so safekeepers field is empty
|
||||
async fn promote_impl(&self, safekeepers_lsn: SafekeepersLsn) -> Result<()> {
|
||||
{
|
||||
let state = self.state.lock().unwrap();
|
||||
let mode = &state.pspec.as_ref().unwrap().spec.mode;
|
||||
if *mode != ComputeMode::Replica {
|
||||
bail!("{} is not replica", mode.to_type_str());
|
||||
}
|
||||
|
||||
// we don't need to query Postgres so not self.lfc_prewarm_state()
|
||||
match &state.lfc_prewarm_state {
|
||||
LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => {
|
||||
bail!("prewarm not requested or pending")
|
||||
}
|
||||
LfcPrewarmState::Failed { error } => {
|
||||
tracing::warn!(%error, "replica prewarm failed")
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
||||
.await
|
||||
.context("connecting to postgres")?;
|
||||
|
||||
let primary_lsn = safekeepers_lsn.wal_flush_lsn;
|
||||
let mut last_wal_replay_lsn: Lsn = Lsn::INVALID;
|
||||
const RETRIES: i32 = 20;
|
||||
for i in 0..=RETRIES {
|
||||
let row = client
|
||||
.query_one("SELECT pg_last_wal_replay_lsn()", &[])
|
||||
.await
|
||||
.context("getting last replay lsn")?;
|
||||
let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
|
||||
last_wal_replay_lsn = lsn.into();
|
||||
if last_wal_replay_lsn >= primary_lsn {
|
||||
break;
|
||||
}
|
||||
tracing::info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}");
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
if last_wal_replay_lsn < primary_lsn {
|
||||
bail!("didn't catch up with primary in {RETRIES} retries");
|
||||
}
|
||||
|
||||
// using $1 doesn't work with ALTER SYSTEM SET
|
||||
let safekeepers_sql = format!(
|
||||
"ALTER SYSTEM SET neon.safekeepers='{}'",
|
||||
safekeepers_lsn.safekeepers
|
||||
);
|
||||
client
|
||||
.query(&safekeepers_sql, &[])
|
||||
.await
|
||||
.context("setting safekeepers")?;
|
||||
client
|
||||
.query("SELECT pg_reload_conf()", &[])
|
||||
.await
|
||||
.context("reloading postgres config")?;
|
||||
let row = client
|
||||
.query_one("SELECT * FROM pg_promote()", &[])
|
||||
.await
|
||||
.context("pg_promote")?;
|
||||
if !row.get::<usize, bool>(0) {
|
||||
bail!("pg_promote() returned false");
|
||||
}
|
||||
|
||||
let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
||||
.await
|
||||
.context("connecting to postgres")?;
|
||||
let row = client
|
||||
.query_one("SHOW transaction_read_only", &[])
|
||||
.await
|
||||
.context("getting transaction_read_only")?;
|
||||
if row.get::<usize, &str>(0) == "on" {
|
||||
bail!("replica in read only mode after promotion");
|
||||
}
|
||||
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.pspec.as_mut().unwrap().spec.mode = ComputeMode::Primary;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -83,6 +83,87 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/DbsAndRoles"
|
||||
|
||||
/promote:
|
||||
post:
|
||||
tags:
|
||||
- Promotion
|
||||
summary: Promote secondary replica to primary
|
||||
description: ""
|
||||
operationId: promoteReplica
|
||||
requestBody:
|
||||
description: Promote requests data
|
||||
required: true
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SafekeepersLsn"
|
||||
responses:
|
||||
200:
|
||||
description: Promote succeeded or wasn't started
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PromoteState"
|
||||
500:
|
||||
description: Promote failed
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PromoteState"
|
||||
|
||||
/lfc/prewarm:
|
||||
post:
|
||||
summary: Request LFC Prewarm
|
||||
parameters:
|
||||
- name: from_endpoint
|
||||
in: query
|
||||
schema:
|
||||
type: string
|
||||
description: ""
|
||||
operationId: lfcPrewarm
|
||||
responses:
|
||||
202:
|
||||
description: LFC prewarm started
|
||||
429:
|
||||
description: LFC prewarm ongoing
|
||||
get:
|
||||
tags:
|
||||
- Prewarm
|
||||
summary: Get LFC prewarm state
|
||||
description: ""
|
||||
operationId: getLfcPrewarmState
|
||||
responses:
|
||||
200:
|
||||
description: Prewarm state
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/LfcPrewarmState"
|
||||
|
||||
/lfc/offload:
|
||||
post:
|
||||
summary: Request LFC offload
|
||||
description: ""
|
||||
operationId: lfcOffload
|
||||
responses:
|
||||
202:
|
||||
description: LFC offload started
|
||||
429:
|
||||
description: LFC offload ongoing
|
||||
get:
|
||||
tags:
|
||||
- Prewarm
|
||||
summary: Get LFC offloading state
|
||||
description: ""
|
||||
operationId: getLfcOffloadState
|
||||
responses:
|
||||
200:
|
||||
description: Offload state
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/LfcOffloadState"
|
||||
|
||||
/database_schema:
|
||||
get:
|
||||
tags:
|
||||
@@ -290,9 +371,28 @@ paths:
|
||||
summary: Terminate Postgres and wait for it to exit
|
||||
description: ""
|
||||
operationId: terminate
|
||||
parameters:
|
||||
- name: mode
|
||||
in: query
|
||||
description: "Terminate mode: fast (wait 30s before returning) and immediate"
|
||||
required: false
|
||||
schema:
|
||||
type: string
|
||||
enum: ["fast", "immediate"]
|
||||
default: fast
|
||||
responses:
|
||||
200:
|
||||
description: Result
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TerminateResponse"
|
||||
201:
|
||||
description: Result if compute is already terminated
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TerminateResponse"
|
||||
412:
|
||||
description: "wrong state"
|
||||
content:
|
||||
@@ -335,15 +435,6 @@ components:
|
||||
total_startup_ms:
|
||||
type: integer
|
||||
|
||||
Info:
|
||||
type: object
|
||||
description: Information about VM/Pod.
|
||||
required:
|
||||
- num_cpus
|
||||
properties:
|
||||
num_cpus:
|
||||
type: integer
|
||||
|
||||
DbsAndRoles:
|
||||
type: object
|
||||
description: Databases and Roles
|
||||
@@ -458,11 +549,14 @@ components:
|
||||
type: string
|
||||
enum:
|
||||
- empty
|
||||
- init
|
||||
- failed
|
||||
- running
|
||||
- configuration_pending
|
||||
- init
|
||||
- running
|
||||
- configuration
|
||||
- failed
|
||||
- termination_pending_fast
|
||||
- termination_pending_immediate
|
||||
- terminated
|
||||
example: running
|
||||
|
||||
ExtensionInstallRequest:
|
||||
@@ -497,25 +591,69 @@ components:
|
||||
type: string
|
||||
example: "1.0.0"
|
||||
|
||||
InstalledExtensions:
|
||||
SafekeepersLsn:
|
||||
type: object
|
||||
required:
|
||||
- safekeepers
|
||||
- wal_flush_lsn
|
||||
properties:
|
||||
extensions:
|
||||
description: Contains list of installed extensions.
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
extname:
|
||||
type: string
|
||||
version:
|
||||
type: string
|
||||
items:
|
||||
type: string
|
||||
n_databases:
|
||||
type: integer
|
||||
owned_by_superuser:
|
||||
type: integer
|
||||
safekeepers:
|
||||
description: Primary replica safekeepers
|
||||
type: string
|
||||
wal_flush_lsn:
|
||||
description: Primary last WAL flush LSN
|
||||
type: string
|
||||
|
||||
LfcPrewarmState:
|
||||
type: object
|
||||
required:
|
||||
- status
|
||||
- total
|
||||
- prewarmed
|
||||
- skipped
|
||||
properties:
|
||||
status:
|
||||
description: Lfc prewarm status
|
||||
enum: [not_prewarmed, prewarming, completed, failed]
|
||||
type: string
|
||||
error:
|
||||
description: Lfc prewarm error, if any
|
||||
type: string
|
||||
total:
|
||||
description: Total pages processed
|
||||
type: integer
|
||||
prewarmed:
|
||||
description: Total pages prewarmed
|
||||
type: integer
|
||||
skipped:
|
||||
description: Pages processed but not prewarmed
|
||||
type: integer
|
||||
|
||||
LfcOffloadState:
|
||||
type: object
|
||||
required:
|
||||
- status
|
||||
properties:
|
||||
status:
|
||||
description: Lfc offload status
|
||||
enum: [not_offloaded, offloading, completed, failed]
|
||||
type: string
|
||||
error:
|
||||
description: Lfc offload error, if any
|
||||
type: string
|
||||
|
||||
PromoteState:
|
||||
type: object
|
||||
required:
|
||||
- status
|
||||
properties:
|
||||
status:
|
||||
description: Promote result
|
||||
enum: [not_promoted, completed, failed]
|
||||
type: string
|
||||
error:
|
||||
description: Promote error, if any
|
||||
type: string
|
||||
|
||||
SetRoleGrantsRequest:
|
||||
type: object
|
||||
@@ -544,6 +682,17 @@ components:
|
||||
description: Role name.
|
||||
example: "neon"
|
||||
|
||||
TerminateResponse:
|
||||
type: object
|
||||
required:
|
||||
- lsn
|
||||
properties:
|
||||
lsn:
|
||||
type: string
|
||||
nullable: true
|
||||
description: "last WAL flush LSN"
|
||||
example: "0/028F10D8"
|
||||
|
||||
SetRoleGrantsResponse:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -14,6 +14,7 @@ pub(in crate::http) mod insights;
|
||||
pub(in crate::http) mod lfc;
|
||||
pub(in crate::http) mod metrics;
|
||||
pub(in crate::http) mod metrics_json;
|
||||
pub(in crate::http) mod promote;
|
||||
pub(in crate::http) mod status;
|
||||
pub(in crate::http) mod terminate;
|
||||
|
||||
|
||||
14
compute_tools/src/http/routes/promote.rs
Normal file
14
compute_tools/src/http/routes/promote.rs
Normal file
@@ -0,0 +1,14 @@
|
||||
use crate::http::JsonResponse;
|
||||
use axum::Form;
|
||||
use http::StatusCode;
|
||||
|
||||
pub(in crate::http) async fn promote(
|
||||
compute: axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>,
|
||||
Form(safekeepers_lsn): Form<compute_api::responses::SafekeepersLsn>,
|
||||
) -> axum::response::Response {
|
||||
let state = compute.promote(safekeepers_lsn).await;
|
||||
if let compute_api::responses::PromoteState::Failed { error } = state {
|
||||
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, error);
|
||||
}
|
||||
JsonResponse::success(StatusCode::OK, state)
|
||||
}
|
||||
@@ -3,7 +3,7 @@ use crate::http::JsonResponse;
|
||||
use axum::extract::State;
|
||||
use axum::response::Response;
|
||||
use axum_extra::extract::OptionalQuery;
|
||||
use compute_api::responses::{ComputeStatus, TerminateResponse};
|
||||
use compute_api::responses::{ComputeStatus, TerminateMode, TerminateResponse};
|
||||
use http::StatusCode;
|
||||
use serde::Deserialize;
|
||||
use std::sync::Arc;
|
||||
@@ -12,7 +12,7 @@ use tracing::info;
|
||||
|
||||
#[derive(Deserialize, Default)]
|
||||
pub struct TerminateQuery {
|
||||
mode: compute_api::responses::TerminateMode,
|
||||
mode: TerminateMode,
|
||||
}
|
||||
|
||||
/// Terminate the compute.
|
||||
@@ -24,16 +24,16 @@ pub(in crate::http) async fn terminate(
|
||||
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if state.status == ComputeStatus::Terminated {
|
||||
return JsonResponse::success(StatusCode::CREATED, state.terminate_flush_lsn);
|
||||
let response = TerminateResponse {
|
||||
lsn: state.terminate_flush_lsn,
|
||||
};
|
||||
return JsonResponse::success(StatusCode::CREATED, response);
|
||||
}
|
||||
|
||||
if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
|
||||
return JsonResponse::invalid_status(state.status);
|
||||
}
|
||||
state.set_status(
|
||||
ComputeStatus::TerminationPending { mode },
|
||||
&compute.state_changed,
|
||||
);
|
||||
state.set_status(mode.into(), &compute.state_changed);
|
||||
}
|
||||
|
||||
forward_termination_signal(false);
|
||||
|
||||
@@ -23,7 +23,7 @@ use super::{
|
||||
middleware::authorize::Authorize,
|
||||
routes::{
|
||||
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
|
||||
grants, insights, lfc, metrics, metrics_json, status, terminate,
|
||||
grants, insights, lfc, metrics, metrics_json, promote, status, terminate,
|
||||
},
|
||||
};
|
||||
use crate::compute::ComputeNode;
|
||||
@@ -87,6 +87,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
|
||||
let authenticated_router = Router::<Arc<ComputeNode>>::new()
|
||||
.route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
|
||||
.route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
|
||||
.route("/promote", post(promote::promote))
|
||||
.route("/check_writability", post(check_writability::is_writable))
|
||||
.route("/configure", post(configure::configure))
|
||||
.route("/database_schema", get(database_schema::get_schema_dump))
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::collections::HashMap;
|
||||
|
||||
use anyhow::Result;
|
||||
use compute_api::responses::{InstalledExtension, InstalledExtensions};
|
||||
use tokio_postgres::error::Error as PostgresError;
|
||||
use tokio_postgres::{Client, Config, NoTls};
|
||||
|
||||
use crate::metrics::INSTALLED_EXTENSIONS;
|
||||
@@ -10,7 +11,7 @@ use crate::metrics::INSTALLED_EXTENSIONS;
|
||||
/// and to make database listing query here more explicit.
|
||||
///
|
||||
/// Limit the number of databases to 500 to avoid excessive load.
|
||||
async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
|
||||
async fn list_dbs(client: &mut Client) -> Result<Vec<String>, PostgresError> {
|
||||
// `pg_database.datconnlimit = -2` means that the database is in the
|
||||
// invalid state
|
||||
let databases = client
|
||||
@@ -37,7 +38,9 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
|
||||
/// Same extension can be installed in multiple databases with different versions,
|
||||
/// so we report a separate metric (number of databases where it is installed)
|
||||
/// for each extension version.
|
||||
pub async fn get_installed_extensions(mut conf: Config) -> Result<InstalledExtensions> {
|
||||
pub async fn get_installed_extensions(
|
||||
mut conf: Config,
|
||||
) -> Result<InstalledExtensions, PostgresError> {
|
||||
conf.application_name("compute_ctl:get_installed_extensions");
|
||||
let databases: Vec<String> = {
|
||||
let (mut client, connection) = conf.connect(NoTls).await?;
|
||||
|
||||
@@ -12,6 +12,7 @@ pub mod logger;
|
||||
pub mod catalog;
|
||||
pub mod compute;
|
||||
pub mod compute_prewarm;
|
||||
pub mod compute_promote;
|
||||
pub mod disk_quota;
|
||||
pub mod extension_server;
|
||||
pub mod installed_extensions;
|
||||
|
||||
@@ -108,7 +108,7 @@ pub(crate) static LFC_PREWARMS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static LFC_PREWARM_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"compute_ctl_lfc_prewarm_errors_total",
|
||||
"Total number of LFC prewarms errors requested by compute_ctl or autoprewarm option",
|
||||
"Total number of LFC prewarm errors",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -124,7 +124,7 @@ pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static LFC_OFFLOAD_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"compute_ctl_lfc_offload_errors_total",
|
||||
"Total number of LFC offload errors requested by compute_ctl or lfc_offload_period_seconds option",
|
||||
"Total number of LFC offload errors",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
@@ -1,3 +1,16 @@
|
||||
-- On December 8th, 2023, an engineering escalation (INC-110) was opened after
|
||||
-- it was found that BYPASSRLS was being applied to all roles.
|
||||
--
|
||||
-- PR that introduced the issue: https://github.com/neondatabase/neon/pull/5657
|
||||
-- Subsequent commit on main: https://github.com/neondatabase/neon/commit/ad99fa5f0393e2679e5323df653c508ffa0ac072
|
||||
--
|
||||
-- NOBYPASSRLS and INHERIT are the defaults for a Postgres role, but because it
|
||||
-- isn't easy to know if a Postgres cluster is affected by the issue, we need to
|
||||
-- keep the migration around for a long time, if not indefinitely, so any
|
||||
-- cluster can be fixed.
|
||||
--
|
||||
-- Branching is the gift that keeps on giving...
|
||||
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name text;
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION;
|
||||
@@ -7,13 +7,17 @@ BEGIN
|
||||
INTO monitor
|
||||
FROM pg_auth_members
|
||||
WHERE roleid = 'pg_monitor'::regrole
|
||||
AND member = 'pg_monitor'::regrole;
|
||||
AND member = 'neon_superuser'::regrole;
|
||||
|
||||
IF NOT monitor.member THEN
|
||||
IF monitor IS NULL THEN
|
||||
RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';
|
||||
END IF;
|
||||
|
||||
IF monitor.admin IS NULL OR NOT monitor.member THEN
|
||||
RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor';
|
||||
END IF;
|
||||
|
||||
IF NOT monitor.admin THEN
|
||||
IF monitor.admin IS NULL OR NOT monitor.admin THEN
|
||||
RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor';
|
||||
END IF;
|
||||
END $$;
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
DO $$
|
||||
DECLARE
|
||||
signal_backend record;
|
||||
BEGIN
|
||||
SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
|
||||
admin_option AS admin
|
||||
INTO signal_backend
|
||||
FROM pg_auth_members
|
||||
WHERE roleid = 'pg_signal_backend'::regrole
|
||||
AND member = 'neon_superuser'::regrole;
|
||||
|
||||
IF signal_backend IS NULL THEN
|
||||
RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_signal_backend';
|
||||
END IF;
|
||||
|
||||
IF signal_backend.member IS NULL OR NOT signal_backend.member THEN
|
||||
RAISE EXCEPTION 'neon_superuser is not a member of pg_signal_backend';
|
||||
END IF;
|
||||
|
||||
IF signal_backend.admin IS NULL OR NOT signal_backend.admin THEN
|
||||
RAISE EXCEPTION 'neon_superuser cannot grant pg_signal_backend';
|
||||
END IF;
|
||||
END $$;
|
||||
@@ -84,7 +84,8 @@ impl ComputeMonitor {
|
||||
if matches!(
|
||||
compute_status,
|
||||
ComputeStatus::Terminated
|
||||
| ComputeStatus::TerminationPending { .. }
|
||||
| ComputeStatus::TerminationPendingFast
|
||||
| ComputeStatus::TerminationPendingImmediate
|
||||
| ComputeStatus::Failed
|
||||
) {
|
||||
info!(
|
||||
|
||||
@@ -197,6 +197,7 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
include_str!(
|
||||
"./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
|
||||
),
|
||||
include_str!("./migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql"),
|
||||
];
|
||||
|
||||
MigrationRunner::new(client, &migrations)
|
||||
|
||||
@@ -36,7 +36,7 @@ impl StorageBroker {
|
||||
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
let broker = &self.env.broker;
|
||||
|
||||
print!("Starting neon broker at {}", broker.client_url());
|
||||
println!("Starting neon broker at {}", broker.client_url());
|
||||
|
||||
let mut args = Vec::new();
|
||||
|
||||
|
||||
@@ -32,7 +32,8 @@
|
||||
//! config.json - passed to `compute_ctl`
|
||||
//! pgdata/
|
||||
//! postgresql.conf - copy of postgresql.conf created by `compute_ctl`
|
||||
//! zenith.signal
|
||||
//! neon.signal
|
||||
//! zenith.signal - copy of neon.signal, for backward compatibility
|
||||
//! <other PostgreSQL files>
|
||||
//! ```
|
||||
//!
|
||||
@@ -463,7 +464,7 @@ impl Endpoint {
|
||||
conf.append("max_connections", "100");
|
||||
conf.append("wal_level", "logical");
|
||||
// wal_sender_timeout is the maximum time to wait for WAL replication.
|
||||
// It also defines how often the walreciever will send a feedback message to the wal sender.
|
||||
// It also defines how often the walreceiver will send a feedback message to the wal sender.
|
||||
conf.append("wal_sender_timeout", "5s");
|
||||
conf.append("listen_addresses", &self.pg_address.ip().to_string());
|
||||
conf.append("port", &self.pg_address.port().to_string());
|
||||
@@ -922,7 +923,8 @@ impl Endpoint {
|
||||
ComputeStatus::Empty
|
||||
| ComputeStatus::ConfigurationPending
|
||||
| ComputeStatus::Configuration
|
||||
| ComputeStatus::TerminationPending { .. }
|
||||
| ComputeStatus::TerminationPendingFast
|
||||
| ComputeStatus::TerminationPendingImmediate
|
||||
| ComputeStatus::Terminated => {
|
||||
bail!("unexpected compute status: {:?}", state.status)
|
||||
}
|
||||
|
||||
@@ -217,6 +217,9 @@ pub struct NeonStorageControllerConf {
|
||||
pub posthog_config: Option<PostHogConfig>,
|
||||
|
||||
pub kick_secondary_downloads: Option<bool>,
|
||||
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub shard_split_request_timeout: Option<Duration>,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerConf {
|
||||
@@ -250,6 +253,7 @@ impl Default for NeonStorageControllerConf {
|
||||
timeline_safekeeper_count: None,
|
||||
posthog_config: None,
|
||||
kick_secondary_downloads: None,
|
||||
shard_split_request_timeout: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -303,7 +303,7 @@ impl PageServerNode {
|
||||
async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
println!(
|
||||
"Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
|
||||
self.conf.id,
|
||||
self.pg_connection_config.raw_address(),
|
||||
@@ -452,6 +452,12 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
|
||||
// HADRON
|
||||
image_layer_force_creation_period: settings
|
||||
.remove("image_layer_force_creation_period")
|
||||
.map(humantime::parse_duration)
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_layer_force_creation_period' as duration")?,
|
||||
image_layer_creation_check_threshold: settings
|
||||
.remove("image_layer_creation_check_threshold")
|
||||
.map(|x| x.parse::<u8>())
|
||||
|
||||
@@ -127,7 +127,7 @@ impl SafekeeperNode {
|
||||
extra_opts: &[String],
|
||||
retry_timeout: &Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
print!(
|
||||
println!(
|
||||
"Starting safekeeper at '{}' in '{}', retrying for {:?}",
|
||||
self.pg_connection_config.raw_address(),
|
||||
self.datadir_path().display(),
|
||||
|
||||
@@ -648,6 +648,13 @@ impl StorageController {
|
||||
args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
|
||||
}
|
||||
|
||||
if let Some(duration) = self.config.shard_split_request_timeout {
|
||||
args.push(format!(
|
||||
"--shard-split-request-timeout={}",
|
||||
humantime::Duration::from(duration)
|
||||
));
|
||||
}
|
||||
|
||||
let mut envs = vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
@@ -660,7 +667,7 @@ impl StorageController {
|
||||
));
|
||||
}
|
||||
|
||||
println!("Starting storage controller");
|
||||
println!("Starting storage controller at {scheme}://{host}:{listen_port}");
|
||||
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
|
||||
@@ -14,6 +14,7 @@ humantime.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
reqwest.workspace = true
|
||||
safekeeper_api.workspace=true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
storage_controller_client.workspace = true
|
||||
tokio.workspace = true
|
||||
|
||||
@@ -11,7 +11,7 @@ use pageserver_api::controller_api::{
|
||||
PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest,
|
||||
ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
|
||||
SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse, TimelineSafekeeperMigrateRequest,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig,
|
||||
@@ -21,6 +21,7 @@ use pageserver_api::models::{
|
||||
use pageserver_api::shard::{ShardStripeSize, TenantShardId};
|
||||
use pageserver_client::mgmt_api::{self};
|
||||
use reqwest::{Certificate, Method, StatusCode, Url};
|
||||
use safekeeper_api::models::TimelineLocateResponse;
|
||||
use storage_controller_client::control_api::Client;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
@@ -279,6 +280,23 @@ enum Command {
|
||||
#[arg(long)]
|
||||
concurrency: Option<usize>,
|
||||
},
|
||||
/// Locate safekeepers for a timeline from the storcon DB.
|
||||
TimelineLocate {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
timeline_id: TimelineId,
|
||||
},
|
||||
/// Migrate a timeline to a new set of safekeepers
|
||||
TimelineSafekeeperMigrate {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
timeline_id: TimelineId,
|
||||
/// Example: --new-sk-set 1,2,3
|
||||
#[arg(long, required = true, value_delimiter = ',')]
|
||||
new_sk_set: Vec<NodeId>,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -458,6 +476,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
listen_http_port,
|
||||
listen_https_port,
|
||||
availability_zone_id: AvailabilityZone(availability_zone_id),
|
||||
node_ip_addr: None,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
@@ -1324,7 +1343,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
concurrency,
|
||||
} => {
|
||||
let mut path = format!(
|
||||
"/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
|
||||
"v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
|
||||
);
|
||||
|
||||
if let Some(c) = concurrency {
|
||||
@@ -1335,6 +1354,41 @@ async fn main() -> anyhow::Result<()> {
|
||||
.dispatch::<(), ()>(Method::POST, path, None)
|
||||
.await?;
|
||||
}
|
||||
Command::TimelineLocate {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} => {
|
||||
let path = format!("debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate");
|
||||
|
||||
let resp = storcon_client
|
||||
.dispatch::<(), TimelineLocateResponse>(Method::GET, path, None)
|
||||
.await?;
|
||||
|
||||
let sk_set = resp.sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
|
||||
let new_sk_set = resp
|
||||
.new_sk_set
|
||||
.as_ref()
|
||||
.map(|ids| ids.iter().map(|id| id.0 as i64).collect::<Vec<_>>());
|
||||
|
||||
println!("generation = {}", resp.generation);
|
||||
println!("sk_set = {sk_set:?}");
|
||||
println!("new_sk_set = {new_sk_set:?}");
|
||||
}
|
||||
Command::TimelineSafekeeperMigrate {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
new_sk_set,
|
||||
} => {
|
||||
let path = format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate");
|
||||
|
||||
storcon_client
|
||||
.dispatch::<_, ()>(
|
||||
Method::POST,
|
||||
path,
|
||||
Some(TimelineSafekeeperMigrateRequest { new_sk_set }),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -129,9 +129,10 @@ segment to bootstrap the WAL writing, but it doesn't contain the checkpoint reco
|
||||
changes in xlog.c, to allow starting the compute node without reading the last checkpoint record
|
||||
from WAL.
|
||||
|
||||
This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start
|
||||
at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last
|
||||
checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo.
|
||||
This includes code to read the `neon.signal` (also `zenith.signal`) file, which tells the startup
|
||||
code the LSN to start at. When the `neon.signal` file is present, the startup uses that LSN
|
||||
instead of the last checkpoint's LSN. The system is known to be consistent at that LSN, without
|
||||
any WAL redo.
|
||||
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
@@ -75,7 +75,7 @@ CLI examples:
|
||||
* AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"`
|
||||
|
||||
For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS.
|
||||
For local S3 installations, refer to the their documentation for name format and credentials.
|
||||
For local S3 installations, refer to their documentation for name format and credentials.
|
||||
|
||||
Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets.
|
||||
Required sections are:
|
||||
|
||||
@@ -46,7 +46,7 @@ pub struct ExtensionInstallResponse {
|
||||
pub version: ExtVersion,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default, Debug, Clone)]
|
||||
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
pub enum LfcPrewarmState {
|
||||
#[default]
|
||||
@@ -58,6 +58,17 @@ pub enum LfcPrewarmState {
|
||||
},
|
||||
}
|
||||
|
||||
impl Display for LfcPrewarmState {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
|
||||
LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
|
||||
LfcPrewarmState::Completed => f.write_str("Completed"),
|
||||
LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
pub enum LfcOffloadState {
|
||||
@@ -70,6 +81,23 @@ pub enum LfcOffloadState {
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, Clone, PartialEq)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
/// Response of /promote
|
||||
pub enum PromoteState {
|
||||
NotPromoted,
|
||||
Completed,
|
||||
Failed { error: String },
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Default, Debug, Clone)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
/// Result of /safekeepers_lsn
|
||||
pub struct SafekeepersLsn {
|
||||
pub safekeepers: String,
|
||||
pub wal_flush_lsn: utils::lsn::Lsn,
|
||||
}
|
||||
|
||||
/// Response of the /status API
|
||||
#[derive(Serialize, Debug, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
@@ -93,6 +121,15 @@ pub enum TerminateMode {
|
||||
Immediate,
|
||||
}
|
||||
|
||||
impl From<TerminateMode> for ComputeStatus {
|
||||
fn from(mode: TerminateMode) -> Self {
|
||||
match mode {
|
||||
TerminateMode::Fast => ComputeStatus::TerminationPendingFast,
|
||||
TerminateMode::Immediate => ComputeStatus::TerminationPendingImmediate,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ComputeStatus {
|
||||
@@ -113,7 +150,9 @@ pub enum ComputeStatus {
|
||||
// control-plane to terminate it.
|
||||
Failed,
|
||||
// Termination requested
|
||||
TerminationPending { mode: TerminateMode },
|
||||
TerminationPendingFast,
|
||||
// Termination requested, without waiting 30s before returning from /terminate
|
||||
TerminationPendingImmediate,
|
||||
// Terminated Postgres
|
||||
Terminated,
|
||||
}
|
||||
@@ -132,7 +171,10 @@ impl Display for ComputeStatus {
|
||||
ComputeStatus::Running => f.write_str("running"),
|
||||
ComputeStatus::Configuration => f.write_str("configuration"),
|
||||
ComputeStatus::Failed => f.write_str("failed"),
|
||||
ComputeStatus::TerminationPending { .. } => f.write_str("termination-pending"),
|
||||
ComputeStatus::TerminationPendingFast => f.write_str("termination-pending-fast"),
|
||||
ComputeStatus::TerminationPendingImmediate => {
|
||||
f.write_str("termination-pending-immediate")
|
||||
}
|
||||
ComputeStatus::Terminated => f.write_str("terminated"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -442,7 +442,7 @@ pub struct JwksSettings {
|
||||
}
|
||||
|
||||
/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
|
||||
pub enum PageserverProtocol {
|
||||
/// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
|
||||
#[default]
|
||||
|
||||
@@ -20,6 +20,7 @@ use tokio_stream::wrappers::ReceiverStream;
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::{Instrument, debug, info, info_span, warn};
|
||||
use utils::auth::{AuthError, Claims, SwappableJwtAuth};
|
||||
use utils::metrics_collector::{METRICS_COLLECTOR, METRICS_STALE_MILLIS};
|
||||
|
||||
use crate::error::{ApiError, api_error_handler, route_error_handler};
|
||||
use crate::request::{get_query_param, parse_query_param};
|
||||
@@ -250,9 +251,28 @@ impl std::io::Write for ChannelWriter {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
pub async fn prometheus_metrics_handler(
|
||||
req: Request<Body>,
|
||||
force_metric_collection_on_scrape: bool,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
SERVE_METRICS_COUNT.inc();
|
||||
|
||||
// HADRON
|
||||
let requested_use_latest = parse_query_param(&req, "use_latest")?;
|
||||
|
||||
let use_latest = match requested_use_latest {
|
||||
None => force_metric_collection_on_scrape,
|
||||
Some(true) => true,
|
||||
Some(false) => {
|
||||
if force_metric_collection_on_scrape {
|
||||
// We don't cache in this case
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
let (tx, rx) = mpsc::channel(1);
|
||||
@@ -277,12 +297,18 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
|
||||
|
||||
let _span = span.entered();
|
||||
|
||||
let metrics = metrics::gather();
|
||||
// HADRON
|
||||
let collected = if use_latest {
|
||||
// Skip caching the results if we always force metric collection on scrape.
|
||||
METRICS_COLLECTOR.run_once(!force_metric_collection_on_scrape)
|
||||
} else {
|
||||
METRICS_COLLECTOR.last_collected()
|
||||
};
|
||||
|
||||
let gathered_at = std::time::Instant::now();
|
||||
|
||||
let res = encoder
|
||||
.encode(&metrics, &mut writer)
|
||||
.encode(&collected.metrics, &mut writer)
|
||||
.and_then(|_| writer.flush().map_err(|e| e.into()));
|
||||
|
||||
// this instant is not when we finally got the full response sent, sending is done by hyper
|
||||
@@ -295,6 +321,10 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
|
||||
let encoded_in = encoded_at - gathered_at - writer.wait_time();
|
||||
let total = encoded_at - started_at;
|
||||
|
||||
// HADRON
|
||||
let staleness_ms = (encoded_at - collected.collected_at).as_millis();
|
||||
METRICS_STALE_MILLIS.set(staleness_ms as i64);
|
||||
|
||||
match res {
|
||||
Ok(()) => {
|
||||
tracing::info!(
|
||||
@@ -303,6 +333,7 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
|
||||
spawning_ms = spawned_in.as_millis(),
|
||||
collection_ms = collected_in.as_millis(),
|
||||
encoding_ms = encoded_in.as_millis(),
|
||||
stalenss_ms = staleness_ms,
|
||||
"responded /metrics"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -226,6 +226,7 @@ pub struct ConfigToml {
|
||||
pub synthetic_size_calculation_interval: Duration,
|
||||
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
|
||||
pub test_remote_failures: u64,
|
||||
pub test_remote_failures_probability: u64,
|
||||
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub background_task_maximum_delay: Duration,
|
||||
@@ -271,6 +272,9 @@ pub struct ConfigToml {
|
||||
pub timeline_import_config: TimelineImportConfig,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub basebackup_cache_config: Option<BasebackupCacheConfig>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_layer_generation_large_timeline_threshold: Option<u64>,
|
||||
pub force_metric_collection_on_scrape: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -560,6 +564,11 @@ pub struct TenantConfigToml {
|
||||
pub gc_period: Duration,
|
||||
// Delta layer churn threshold to create L1 image layers.
|
||||
pub image_creation_threshold: usize,
|
||||
// HADRON
|
||||
// When the timeout is reached, PageServer will (1) force compact any remaining L0 deltas and
|
||||
// (2) create image layers if there are any L1 deltas.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub image_layer_force_creation_period: Option<Duration>,
|
||||
// Determines how much history is retained, to allow
|
||||
// branching and read replicas at an older point in time.
|
||||
// The unit is time.
|
||||
@@ -758,6 +767,7 @@ impl Default for ConfigToml {
|
||||
disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(),
|
||||
|
||||
test_remote_failures: (0),
|
||||
test_remote_failures_probability: (100),
|
||||
|
||||
ondemand_download_behavior_treat_error_as_warn: (false),
|
||||
|
||||
@@ -821,6 +831,8 @@ impl Default for ConfigToml {
|
||||
},
|
||||
basebackup_cache_config: None,
|
||||
posthog_config: None,
|
||||
image_layer_generation_large_timeline_threshold: Some(2 * 1024 * 1024 * 1024),
|
||||
force_metric_collection_on_scrape: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -914,6 +926,7 @@ impl Default for TenantConfigToml {
|
||||
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
.expect("cannot parse default gc period"),
|
||||
image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
|
||||
image_layer_force_creation_period: None,
|
||||
pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
|
||||
.expect("cannot parse default PITR interval"),
|
||||
walreceiver_connect_timeout: humantime::parse_duration(
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fmt::Display;
|
||||
use std::net::IpAddr;
|
||||
use std::str::FromStr;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
@@ -10,7 +11,7 @@ use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
|
||||
use crate::models::{PageserverUtilization, ShardParameters, TenantConfig, TimelineInfo};
|
||||
use crate::shard::{ShardStripeSize, TenantShardId};
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -60,6 +61,11 @@ pub struct NodeRegisterRequest {
|
||||
pub listen_https_port: Option<u16>,
|
||||
|
||||
pub availability_zone_id: AvailabilityZone,
|
||||
|
||||
// Reachable IP address of the PS/SK registering, if known.
|
||||
// Hadron Cluster Coordiantor will update the DNS record of the registering node
|
||||
// with this IP address.
|
||||
pub node_ip_addr: Option<IpAddr>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -126,6 +132,13 @@ pub struct TenantDescribeResponse {
|
||||
pub config: TenantConfig,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantTimelineDescribeResponse {
|
||||
pub shards: Vec<TimelineInfo>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_consistent_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct NodeShardResponse {
|
||||
pub node_id: NodeId,
|
||||
@@ -538,6 +551,39 @@ pub struct SafekeeperDescribeResponse {
|
||||
pub scheduling_policy: SkSchedulingPolicy,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct TimelineSafekeeperPeer {
|
||||
pub node_id: NodeId,
|
||||
pub listen_http_addr: String,
|
||||
pub http_port: i32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SCSafekeeperTimeline {
|
||||
// SC does not know the tenant id.
|
||||
pub timeline_id: TimelineId,
|
||||
pub peers: Vec<NodeId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SCSafekeeperTimelinesResponse {
|
||||
pub timelines: Vec<SCSafekeeperTimeline>,
|
||||
pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SafekeeperTimeline {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub peers: Vec<NodeId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SafekeeperTimelinesResponse {
|
||||
pub timelines: Vec<SafekeeperTimeline>,
|
||||
pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct SafekeeperSchedulingPolicyRequest {
|
||||
pub scheduling_policy: SkSchedulingPolicy,
|
||||
|
||||
@@ -384,7 +384,7 @@ pub struct SafekeepersInfo {
|
||||
pub safekeepers: Vec<SafekeeperInfo>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SafekeeperInfo {
|
||||
pub id: NodeId,
|
||||
pub hostname: String,
|
||||
@@ -597,6 +597,9 @@ pub struct TenantConfigPatch {
|
||||
pub gc_period: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub image_creation_threshold: FieldPatch<usize>,
|
||||
// HADRON
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub image_layer_force_creation_period: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub pitr_interval: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
@@ -700,6 +703,11 @@ pub struct TenantConfig {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
|
||||
// HADRON
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub image_layer_force_creation_period: Option<Duration>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub pitr_interval: Option<Duration>,
|
||||
@@ -798,6 +806,7 @@ impl TenantConfig {
|
||||
mut gc_horizon,
|
||||
mut gc_period,
|
||||
mut image_creation_threshold,
|
||||
mut image_layer_force_creation_period,
|
||||
mut pitr_interval,
|
||||
mut walreceiver_connect_timeout,
|
||||
mut lagging_wal_timeout,
|
||||
@@ -861,6 +870,11 @@ impl TenantConfig {
|
||||
patch
|
||||
.image_creation_threshold
|
||||
.apply(&mut image_creation_threshold);
|
||||
// HADRON
|
||||
patch
|
||||
.image_layer_force_creation_period
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut image_layer_force_creation_period);
|
||||
patch
|
||||
.pitr_interval
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
@@ -942,6 +956,7 @@ impl TenantConfig {
|
||||
gc_horizon,
|
||||
gc_period,
|
||||
image_creation_threshold,
|
||||
image_layer_force_creation_period,
|
||||
pitr_interval,
|
||||
walreceiver_connect_timeout,
|
||||
lagging_wal_timeout,
|
||||
@@ -1016,6 +1031,9 @@ impl TenantConfig {
|
||||
image_creation_threshold: self
|
||||
.image_creation_threshold
|
||||
.unwrap_or(global_conf.image_creation_threshold),
|
||||
image_layer_force_creation_period: self
|
||||
.image_layer_force_creation_period
|
||||
.or(global_conf.image_layer_force_creation_period),
|
||||
pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval),
|
||||
walreceiver_connect_timeout: self
|
||||
.walreceiver_connect_timeout
|
||||
@@ -1604,6 +1622,9 @@ pub struct TimelineInfo {
|
||||
|
||||
/// Whether the timeline is invisible in synthetic size calculations.
|
||||
pub is_invisible: Option<bool>,
|
||||
// HADRON: the largest LSN below which all page updates have been included in the image layers.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_consistent_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
|
||||
@@ -332,7 +332,11 @@ fn hash_combine(mut a: u32, mut b: u32) -> u32 {
|
||||
///
|
||||
/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
|
||||
/// and will be handled at higher levels when shards are split.
|
||||
fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
|
||||
pub fn key_to_shard_number(
|
||||
count: ShardCount,
|
||||
stripe_size: ShardStripeSize,
|
||||
key: &Key,
|
||||
) -> ShardNumber {
|
||||
// Fast path for un-sharded tenants or broadcast keys
|
||||
if count < ShardCount(2) || key_is_shard0(key) {
|
||||
return ShardNumber(0);
|
||||
|
||||
@@ -110,7 +110,6 @@ fn main() -> anyhow::Result<()> {
|
||||
.allowlist_type("XLogRecPtr")
|
||||
.allowlist_type("XLogSegNo")
|
||||
.allowlist_type("TimeLineID")
|
||||
.allowlist_type("TimestampTz")
|
||||
.allowlist_type("MultiXactId")
|
||||
.allowlist_type("MultiXactOffset")
|
||||
.allowlist_type("MultiXactStatus")
|
||||
|
||||
@@ -227,8 +227,7 @@ pub mod walrecord;
|
||||
// Export some widely used datatypes that are unlikely to change across Postgres versions
|
||||
pub use v14::bindings::{
|
||||
BlockNumber, CheckPoint, ControlFileData, MultiXactId, OffsetNumber, Oid, PageHeaderData,
|
||||
RepOriginId, TimeLineID, TimestampTz, TransactionId, XLogRecPtr, XLogRecord, XLogSegNo, uint32,
|
||||
uint64,
|
||||
RepOriginId, TimeLineID, TransactionId, XLogRecPtr, XLogRecord, XLogSegNo, uint32, uint64,
|
||||
};
|
||||
// Likewise for these, although the assumption that these don't change is a little more iffy.
|
||||
pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
|
||||
|
||||
@@ -4,13 +4,14 @@
|
||||
//! TODO: Generate separate types for each supported PG version
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
use postgres_ffi_types::TimestampTz;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, PgMajorVersion,
|
||||
RepOriginId, TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants,
|
||||
RepOriginId, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants,
|
||||
};
|
||||
|
||||
#[repr(C)]
|
||||
@@ -863,7 +864,8 @@ pub mod v17 {
|
||||
XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange,
|
||||
rm_neon,
|
||||
};
|
||||
pub use crate::{TimeLineID, TimestampTz};
|
||||
pub use crate::TimeLineID;
|
||||
pub use postgres_ffi_types::TimestampTz;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
|
||||
@@ -9,10 +9,11 @@
|
||||
|
||||
use super::super::waldecoder::WalStreamDecoder;
|
||||
use super::bindings::{
|
||||
CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz,
|
||||
CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID,
|
||||
XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
|
||||
MY_PGVERSION
|
||||
};
|
||||
use postgres_ffi_types::TimestampTz;
|
||||
use super::wal_generator::LogicalMessageGenerator;
|
||||
use crate::pg_constants;
|
||||
use crate::PG_TLI;
|
||||
|
||||
@@ -11,3 +11,4 @@ pub mod forknum;
|
||||
|
||||
pub type Oid = u32;
|
||||
pub type RepOriginId = u16;
|
||||
pub type TimestampTz = i64;
|
||||
|
||||
@@ -43,6 +43,7 @@ itertools.workspace = true
|
||||
sync_wrapper = { workspace = true, features = ["futures"] }
|
||||
|
||||
byteorder = "1.4"
|
||||
rand = "0.8.5"
|
||||
|
||||
[dev-dependencies]
|
||||
camino-tempfile.workspace = true
|
||||
|
||||
@@ -732,9 +732,15 @@ impl GenericRemoteStorage {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
|
||||
Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
|
||||
/* BEGIN_HADRON */
|
||||
pub fn unreliable_wrapper(s: Self, fail_first: u64, fail_probability: u64) -> Self {
|
||||
Self::Unreliable(Arc::new(UnreliableWrapper::new(
|
||||
s,
|
||||
fail_first,
|
||||
fail_probability,
|
||||
)))
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
/// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
|
||||
pub async fn upload_storage_object(
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
//! This module provides a wrapper around a real RemoteStorage implementation that
|
||||
//! causes the first N attempts at each upload or download operatio to fail. For
|
||||
//! testing purposes.
|
||||
use rand::Rng;
|
||||
use std::cmp;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::num::NonZeroU32;
|
||||
@@ -25,6 +27,13 @@ pub struct UnreliableWrapper {
|
||||
|
||||
// Tracks how many failed attempts of each operation has been made.
|
||||
attempts: Mutex<HashMap<RemoteOp, u64>>,
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
// This the probability of failure for each operation, ranged from [0, 100].
|
||||
// The probability is default to 100, which means that all operations will fail.
|
||||
// Storage will fail by probability up to attempts_to_fail times.
|
||||
attempt_failure_probability: u64,
|
||||
/* END_HADRON */
|
||||
}
|
||||
|
||||
/// Used to identify retries of different unique operation.
|
||||
@@ -40,7 +49,11 @@ enum RemoteOp {
|
||||
}
|
||||
|
||||
impl UnreliableWrapper {
|
||||
pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
|
||||
pub fn new(
|
||||
inner: crate::GenericRemoteStorage,
|
||||
attempts_to_fail: u64,
|
||||
attempt_failure_probability: u64,
|
||||
) -> Self {
|
||||
assert!(attempts_to_fail > 0);
|
||||
let inner = match inner {
|
||||
GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
|
||||
@@ -51,9 +64,11 @@ impl UnreliableWrapper {
|
||||
panic!("Can't wrap unreliable wrapper unreliably")
|
||||
}
|
||||
};
|
||||
let actual_attempt_failure_probability = cmp::min(attempt_failure_probability, 100);
|
||||
UnreliableWrapper {
|
||||
inner,
|
||||
attempts_to_fail,
|
||||
attempt_failure_probability: actual_attempt_failure_probability,
|
||||
attempts: Mutex::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
@@ -66,6 +81,7 @@ impl UnreliableWrapper {
|
||||
///
|
||||
fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
|
||||
let mut attempts = self.attempts.lock().unwrap();
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
match attempts.entry(op) {
|
||||
Entry::Occupied(mut e) => {
|
||||
@@ -75,15 +91,19 @@ impl UnreliableWrapper {
|
||||
*p
|
||||
};
|
||||
|
||||
if attempts_before_this >= self.attempts_to_fail {
|
||||
// let it succeed
|
||||
e.remove();
|
||||
Ok(attempts_before_this)
|
||||
} else {
|
||||
/* BEGIN_HADRON */
|
||||
// If there are more attempts to fail, fail the request by probability.
|
||||
if (attempts_before_this < self.attempts_to_fail)
|
||||
&& (rng.gen_range(0..=100) < self.attempt_failure_probability)
|
||||
{
|
||||
let error =
|
||||
anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
|
||||
Err(error)
|
||||
} else {
|
||||
e.remove();
|
||||
Ok(attempts_before_this)
|
||||
}
|
||||
/* END_HADRON */
|
||||
}
|
||||
Entry::Vacant(e) => {
|
||||
let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
|
||||
|
||||
@@ -9,7 +9,7 @@ anyhow.workspace = true
|
||||
const_format.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
postgres_ffi_types.workspace = true
|
||||
postgres_versioninfo.workspace = true
|
||||
pq_proto.workspace = true
|
||||
tokio.workspace = true
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
use std::net::SocketAddr;
|
||||
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::TimestampTz;
|
||||
use postgres_ffi_types::TimestampTz;
|
||||
use postgres_versioninfo::PgVersionId;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::time::Instant;
|
||||
@@ -11,7 +11,7 @@ use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
|
||||
use crate::membership::Configuration;
|
||||
use crate::membership::{Configuration, SafekeeperGeneration};
|
||||
use crate::{ServerInfo, Term};
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
@@ -311,3 +311,12 @@ pub struct PullTimelineResponse {
|
||||
pub safekeeper_host: Option<String>,
|
||||
// TODO: add more fields?
|
||||
}
|
||||
|
||||
/// Response to a timeline locate request.
|
||||
/// Storcon-only API.
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct TimelineLocateResponse {
|
||||
pub generation: SafekeeperGeneration,
|
||||
pub sk_set: Vec<NodeId>,
|
||||
pub new_sk_set: Option<Vec<NodeId>>,
|
||||
}
|
||||
|
||||
@@ -44,3 +44,63 @@ where
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
pub enum DeploymentMode {
|
||||
Local,
|
||||
Dev,
|
||||
Staging,
|
||||
Prod,
|
||||
}
|
||||
|
||||
pub fn get_deployment_mode() -> Option<DeploymentMode> {
|
||||
match std::env::var("DEPLOYMENT_MODE") {
|
||||
Ok(env) => match env.as_str() {
|
||||
"development" => Some(DeploymentMode::Dev),
|
||||
"staging" => Some(DeploymentMode::Staging),
|
||||
"production" => Some(DeploymentMode::Prod),
|
||||
_ => {
|
||||
tracing::error!("Unexpected DEPLOYMENT_MODE: {}", env);
|
||||
None
|
||||
}
|
||||
},
|
||||
Err(_) => {
|
||||
// tracing::error!("DEPLOYMENT_MODE not set");
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_dev_or_staging() -> bool {
|
||||
matches!(
|
||||
get_deployment_mode(),
|
||||
Some(DeploymentMode::Dev) | Some(DeploymentMode::Staging)
|
||||
)
|
||||
}
|
||||
|
||||
pub enum TestingMode {
|
||||
Chaos,
|
||||
Stress,
|
||||
}
|
||||
|
||||
pub fn get_test_mode() -> Option<TestingMode> {
|
||||
match std::env::var("HADRON_TEST_MODE") {
|
||||
Ok(env) => match env.as_str() {
|
||||
"chaos" => Some(TestingMode::Chaos),
|
||||
"stress" => Some(TestingMode::Stress),
|
||||
_ => {
|
||||
tracing::error!("Unexpected HADRON_TEST_MODE: {}", env);
|
||||
None
|
||||
}
|
||||
},
|
||||
Err(_) => {
|
||||
tracing::error!("HADRON_TEST_MODE not set");
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_chaos_testing() -> bool {
|
||||
matches!(get_test_mode(), Some(TestingMode::Chaos))
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
73
libs/utils/src/ip_address.rs
Normal file
73
libs/utils/src/ip_address.rs
Normal file
@@ -0,0 +1,73 @@
|
||||
use std::env::{VarError, var};
|
||||
use std::error::Error;
|
||||
use std::net::IpAddr;
|
||||
use std::str::FromStr;
|
||||
|
||||
/// Name of the environment variable containing the reachable IP address of the node. If set, the IP address contained in this
|
||||
/// environment variable is used as the reachable IP address of the pageserver or safekeeper node during node registration.
|
||||
/// In a Kubernetes environment, this environment variable should be set by Kubernetes to the Pod IP (specified in the Pod
|
||||
/// template).
|
||||
pub const HADRON_NODE_IP_ADDRESS: &str = "HADRON_NODE_IP_ADDRESS";
|
||||
|
||||
/// Read the reachable IP address of this page server from env var HADRON_NODE_IP_ADDRESS.
|
||||
/// In Kubernetes this environment variable is set to the Pod IP (specified in the Pod template).
|
||||
pub fn read_node_ip_addr_from_env() -> Result<Option<IpAddr>, Box<dyn Error>> {
|
||||
match var(HADRON_NODE_IP_ADDRESS) {
|
||||
Ok(v) => {
|
||||
if let Ok(addr) = IpAddr::from_str(&v) {
|
||||
Ok(Some(addr))
|
||||
} else {
|
||||
Err(format!("Invalid IP address string: {v}. Cannot be parsed as either an IPv4 or an IPv6 address.").into())
|
||||
}
|
||||
}
|
||||
Err(VarError::NotPresent) => Ok(None),
|
||||
Err(e) => Err(e.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::env;
|
||||
use std::net::{Ipv4Addr, Ipv6Addr};
|
||||
|
||||
#[test]
|
||||
fn test_read_node_ip_addr_from_env() {
|
||||
// SAFETY: test code
|
||||
unsafe {
|
||||
// Test with a valid IPv4 address
|
||||
env::set_var(HADRON_NODE_IP_ADDRESS, "192.168.1.1");
|
||||
let result = read_node_ip_addr_from_env().unwrap();
|
||||
assert_eq!(result, Some(IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1))));
|
||||
|
||||
// Test with a valid IPv6 address
|
||||
env::set_var(
|
||||
HADRON_NODE_IP_ADDRESS,
|
||||
"2001:0db8:85a3:0000:0000:8a2e:0370:7334",
|
||||
);
|
||||
}
|
||||
let result = read_node_ip_addr_from_env().unwrap();
|
||||
assert_eq!(
|
||||
result,
|
||||
Some(IpAddr::V6(
|
||||
Ipv6Addr::from_str("2001:0db8:85a3:0000:0000:8a2e:0370:7334").unwrap()
|
||||
))
|
||||
);
|
||||
|
||||
// Test with an invalid IP address
|
||||
// SAFETY: test code
|
||||
unsafe {
|
||||
env::set_var(HADRON_NODE_IP_ADDRESS, "invalid_ip");
|
||||
}
|
||||
let result = read_node_ip_addr_from_env();
|
||||
assert!(result.is_err());
|
||||
|
||||
// Test with no environment variable set
|
||||
// SAFETY: test code
|
||||
unsafe {
|
||||
env::remove_var(HADRON_NODE_IP_ADDRESS);
|
||||
}
|
||||
let result = read_node_ip_addr_from_env().unwrap();
|
||||
assert_eq!(result, None);
|
||||
}
|
||||
}
|
||||
@@ -26,6 +26,9 @@ pub mod auth;
|
||||
// utility functions and helper traits for unified unique id generation/serialization etc.
|
||||
pub mod id;
|
||||
|
||||
// utility functions to obtain reachable IP addresses in PS/SK nodes.
|
||||
pub mod ip_address;
|
||||
|
||||
pub mod shard;
|
||||
|
||||
mod hex;
|
||||
@@ -99,6 +102,8 @@ pub mod elapsed_accum;
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod linux_socket_ioctl;
|
||||
|
||||
pub mod metrics_collector;
|
||||
|
||||
// Re-export used in macro. Avoids adding git-version as dep in target crates.
|
||||
#[doc(hidden)]
|
||||
pub use git_version;
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use std::future::Future;
|
||||
use std::pin::Pin;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
@@ -7,7 +8,7 @@ use metrics::{IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
use strum_macros::{EnumString, VariantNames};
|
||||
use tokio::time::Instant;
|
||||
use tracing::info;
|
||||
use tracing::{info, warn};
|
||||
|
||||
/// Logs a critical error, similarly to `tracing::error!`. This will:
|
||||
///
|
||||
@@ -377,10 +378,11 @@ impl std::fmt::Debug for SecretString {
|
||||
///
|
||||
/// TODO: consider upgrading this to a warning, but currently it fires too often.
|
||||
#[inline]
|
||||
pub async fn log_slow<F, O>(name: &str, threshold: Duration, f: std::pin::Pin<&mut F>) -> O
|
||||
where
|
||||
F: Future<Output = O>,
|
||||
{
|
||||
pub async fn log_slow<O>(
|
||||
name: &str,
|
||||
threshold: Duration,
|
||||
f: Pin<&mut impl Future<Output = O>>,
|
||||
) -> O {
|
||||
monitor_slow_future(
|
||||
threshold,
|
||||
threshold, // period = threshold
|
||||
@@ -394,16 +396,42 @@ where
|
||||
if !is_slow {
|
||||
return;
|
||||
}
|
||||
let elapsed = elapsed_total.as_secs_f64();
|
||||
if ready {
|
||||
info!(
|
||||
"slow {name} completed after {:.3}s",
|
||||
elapsed_total.as_secs_f64()
|
||||
);
|
||||
info!("slow {name} completed after {elapsed:.3}s");
|
||||
} else {
|
||||
info!(
|
||||
"slow {name} still running after {:.3}s",
|
||||
elapsed_total.as_secs_f64()
|
||||
);
|
||||
info!("slow {name} still running after {elapsed:.3}s");
|
||||
}
|
||||
},
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Logs a periodic warning if a future is slow to complete.
|
||||
#[inline]
|
||||
pub async fn warn_slow<O>(
|
||||
name: &str,
|
||||
threshold: Duration,
|
||||
f: Pin<&mut impl Future<Output = O>>,
|
||||
) -> O {
|
||||
monitor_slow_future(
|
||||
threshold,
|
||||
threshold, // period = threshold
|
||||
f,
|
||||
|MonitorSlowFutureCallback {
|
||||
ready,
|
||||
is_slow,
|
||||
elapsed_total,
|
||||
elapsed_since_last_callback: _,
|
||||
}| {
|
||||
if !is_slow {
|
||||
return;
|
||||
}
|
||||
let elapsed = elapsed_total.as_secs_f64();
|
||||
if ready {
|
||||
warn!("slow {name} completed after {elapsed:.3}s");
|
||||
} else {
|
||||
warn!("slow {name} still running after {elapsed:.3}s");
|
||||
}
|
||||
},
|
||||
)
|
||||
@@ -416,7 +444,7 @@ where
|
||||
pub async fn monitor_slow_future<F, O>(
|
||||
threshold: Duration,
|
||||
period: Duration,
|
||||
mut fut: std::pin::Pin<&mut F>,
|
||||
mut fut: Pin<&mut F>,
|
||||
mut cb: impl FnMut(MonitorSlowFutureCallback),
|
||||
) -> O
|
||||
where
|
||||
|
||||
75
libs/utils/src/metrics_collector.rs
Normal file
75
libs/utils/src/metrics_collector.rs
Normal file
@@ -0,0 +1,75 @@
|
||||
use std::{
|
||||
sync::{Arc, RwLock},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use metrics::{IntGauge, proto::MetricFamily, register_int_gauge};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub static METRICS_STALE_MILLIS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"metrics_metrics_stale_milliseconds",
|
||||
"The current metrics stale time in milliseconds"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct CollectedMetrics {
|
||||
pub metrics: Vec<MetricFamily>,
|
||||
pub collected_at: Instant,
|
||||
}
|
||||
|
||||
impl CollectedMetrics {
|
||||
fn new(metrics: Vec<MetricFamily>) -> Self {
|
||||
Self {
|
||||
metrics,
|
||||
collected_at: Instant::now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MetricsCollector {
|
||||
last_collected: RwLock<Arc<CollectedMetrics>>,
|
||||
}
|
||||
|
||||
impl MetricsCollector {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
last_collected: RwLock::new(Arc::new(CollectedMetrics::new(vec![]))),
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(name = "metrics_collector", skip_all)]
|
||||
pub fn run_once(&self, cache_metrics: bool) -> Arc<CollectedMetrics> {
|
||||
let started = Instant::now();
|
||||
let metrics = metrics::gather();
|
||||
let collected = Arc::new(CollectedMetrics::new(metrics));
|
||||
if cache_metrics {
|
||||
let mut guard = self.last_collected.write().unwrap();
|
||||
*guard = collected.clone();
|
||||
}
|
||||
tracing::info!(
|
||||
"Collected {} metric families in {} ms",
|
||||
collected.metrics.len(),
|
||||
started.elapsed().as_millis()
|
||||
);
|
||||
collected
|
||||
}
|
||||
|
||||
pub fn last_collected(&self) -> Arc<CollectedMetrics> {
|
||||
self.last_collected.read().unwrap().clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for MetricsCollector {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
// Interval for metrics collection. Currently hard-coded to be the same as the metrics scape interval from the obs agent
|
||||
pub static METRICS_COLLECTION_INTERVAL: Duration = Duration::from_secs(30);
|
||||
|
||||
pub static METRICS_COLLECTOR: Lazy<MetricsCollector> = Lazy::new(MetricsCollector::default);
|
||||
@@ -171,6 +171,12 @@ impl std::fmt::Display for ShardNumber {
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ShardCount {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ShardSlug<'_> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
|
||||
use bytes::Bytes;
|
||||
use postgres_ffi::walrecord::{MultiXactMember, describe_postgres_wal_record};
|
||||
use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId};
|
||||
use postgres_ffi::{MultiXactId, MultiXactOffset, TransactionId};
|
||||
use postgres_ffi_types::TimestampTz;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
|
||||
|
||||
@@ -428,6 +428,12 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
|
||||
shard_number: 0,
|
||||
};
|
||||
|
||||
let empty_wal_rate_limiter = crate::bindings::WalRateLimiter {
|
||||
should_limit: crate::bindings::pg_atomic_uint32 { value: 0 },
|
||||
sent_bytes: 0,
|
||||
last_recorded_time_us: 0,
|
||||
};
|
||||
|
||||
crate::bindings::WalproposerShmemState {
|
||||
propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 },
|
||||
donor_name: [0; 64],
|
||||
@@ -441,6 +447,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
|
||||
num_shards: 0,
|
||||
replica_promote: false,
|
||||
min_ps_feedback: empty_feedback,
|
||||
wal_rate_limiter: empty_wal_rate_limiter,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::error::Error as _;
|
||||
use std::time::Duration;
|
||||
|
||||
@@ -251,6 +251,70 @@ impl Client {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_timeline_compact(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
force_image_layer_creation: bool,
|
||||
must_force_image_layer_creation: bool,
|
||||
scheduled: bool,
|
||||
wait_until_done: bool,
|
||||
) -> Result<()> {
|
||||
let mut path = reqwest::Url::parse(&format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/compact",
|
||||
self.mgmt_api_endpoint
|
||||
))
|
||||
.expect("Cannot build URL");
|
||||
|
||||
if force_image_layer_creation {
|
||||
path.query_pairs_mut()
|
||||
.append_pair("force_image_layer_creation", "true");
|
||||
}
|
||||
|
||||
if must_force_image_layer_creation {
|
||||
path.query_pairs_mut()
|
||||
.append_pair("must_force_image_layer_creation", "true");
|
||||
}
|
||||
|
||||
if scheduled {
|
||||
path.query_pairs_mut().append_pair("scheduled", "true");
|
||||
}
|
||||
if wait_until_done {
|
||||
path.query_pairs_mut()
|
||||
.append_pair("wait_until_scheduled_compaction_done", "true");
|
||||
path.query_pairs_mut()
|
||||
.append_pair("wait_until_uploaded", "true");
|
||||
}
|
||||
self.request(Method::PUT, path, ()).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
pub async fn tenant_timeline_describe(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> Result<TimelineInfo> {
|
||||
let mut path = reqwest::Url::parse(&format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
|
||||
self.mgmt_api_endpoint
|
||||
))
|
||||
.expect("Cannot build URL");
|
||||
path.query_pairs_mut()
|
||||
.append_pair("include-image-consistent-lsn", "true");
|
||||
|
||||
let response: reqwest::Response = self.request(Method::GET, path, ()).await?;
|
||||
let body = response.json().await.map_err(Error::ReceiveBody)?;
|
||||
Ok(body)
|
||||
}
|
||||
|
||||
pub async fn list_tenant_visible_size(&self) -> Result<BTreeMap<TenantShardId, u64>> {
|
||||
let uri = format!("{}/v1/list_tenant_visible_size", self.mgmt_api_endpoint);
|
||||
let resp = self.get(&uri).await?;
|
||||
resp.json().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
pub async fn tenant_scan_remote_storage(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
|
||||
@@ -4,12 +4,20 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
testing = ["pageserver_api/testing"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
arc-swap.workspace = true
|
||||
bytes.workspace = true
|
||||
compute_api.workspace = true
|
||||
futures.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_page_api.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tonic.workspace = true
|
||||
tracing.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
604
pageserver/client_grpc/src/client.rs
Normal file
604
pageserver/client_grpc/src/client.rs
Normal file
@@ -0,0 +1,604 @@
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZero;
|
||||
use std::pin::pin;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::anyhow;
|
||||
use arc_swap::ArcSwap;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::{FutureExt as _, StreamExt as _};
|
||||
use tonic::codec::CompressionEncoding;
|
||||
use tracing::{debug, instrument};
|
||||
use utils::logging::warn_slow;
|
||||
|
||||
use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
|
||||
use crate::retry::Retry;
|
||||
use crate::split::GetPageSplitter;
|
||||
use compute_api::spec::PageserverProtocol;
|
||||
use pageserver_api::shard::ShardStripeSize;
|
||||
use pageserver_page_api as page_api;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
|
||||
/// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up
|
||||
/// when full.
|
||||
///
|
||||
/// Normal requests are small, and we don't pipeline them, so we can afford a large number of
|
||||
/// streams per connection.
|
||||
///
|
||||
/// TODO: tune all of these constants, and consider making them configurable.
|
||||
const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(64).unwrap();
|
||||
|
||||
/// Max number of concurrent bulk GetPage streams per channel (i.e. TCP connection). These use a
|
||||
/// dedicated channel pool with a lower client limit, to avoid TCP-level head-of-line blocking and
|
||||
/// transmission delays. This also concentrates large window sizes on a smaller set of
|
||||
/// streams/connections, presumably reducing memory use.
|
||||
const MAX_BULK_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
|
||||
|
||||
/// The batch size threshold at which a GetPage request will use the bulk stream pool.
|
||||
///
|
||||
/// The gRPC initial window size is 64 KB. Each page is 8 KB, so let's avoid increasing the window
|
||||
/// size for the normal stream pool, and route requests for >= 5 pages (>32 KB) to the bulk pool.
|
||||
const BULK_THRESHOLD_BATCH_SIZE: usize = 5;
|
||||
|
||||
/// The overall request call timeout, including retries and pool acquisition.
|
||||
/// TODO: should we retry forever? Should the caller decide?
|
||||
const CALL_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
/// The per-request (retry attempt) timeout, including any lazy connection establishment.
|
||||
const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
/// The initial request retry backoff duration. The first retry does not back off.
|
||||
/// TODO: use a different backoff for ResourceExhausted (rate limiting)? Needs server support.
|
||||
const BASE_BACKOFF: Duration = Duration::from_millis(5);
|
||||
|
||||
/// The maximum request retry backoff duration.
|
||||
const MAX_BACKOFF: Duration = Duration::from_secs(5);
|
||||
|
||||
/// Threshold and interval for warning about slow operation.
|
||||
const SLOW_THRESHOLD: Duration = Duration::from_secs(3);
|
||||
|
||||
/// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the
|
||||
/// basic `page_api::Client` gRPC client, and supports:
|
||||
///
|
||||
/// * Sharded tenants across multiple Pageservers.
|
||||
/// * Pooling of connections, clients, and streams for efficient resource use.
|
||||
/// * Concurrent use by many callers.
|
||||
/// * Internal handling of GetPage bidirectional streams.
|
||||
/// * Automatic retries.
|
||||
/// * Observability.
|
||||
///
|
||||
/// The client has dedicated connection/client/stream pools per shard, for resource reuse. These
|
||||
/// pools are unbounded: we allow scaling out as many concurrent streams as needed to serve all
|
||||
/// concurrent callers, which mostly eliminates head-of-line blocking. Idle streams are fairly
|
||||
/// cheap: the server task currently uses 26 KB of memory, so we can comfortably fit 100,000
|
||||
/// concurrent idle streams (2.5 GB memory). The worst case degenerates to the old libpq case with
|
||||
/// one stream per backend, but without the TCP connection overhead. In the common case we expect
|
||||
/// significantly lower stream counts due to stream sharing, driven e.g. by idle backends, LFC hits,
|
||||
/// read coalescing, sharding (backends typically only talk to one shard at a time), etc.
|
||||
///
|
||||
/// TODO: this client does not support base backups or LSN leases, as these are only used by
|
||||
/// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards.
|
||||
pub struct PageserverClient {
|
||||
/// The tenant ID.
|
||||
tenant_id: TenantId,
|
||||
/// The timeline ID.
|
||||
timeline_id: TimelineId,
|
||||
/// The JWT auth token for this tenant, if any.
|
||||
auth_token: Option<String>,
|
||||
/// The compression to use, if any.
|
||||
compression: Option<CompressionEncoding>,
|
||||
/// The shards for this tenant.
|
||||
shards: ArcSwap<Shards>,
|
||||
}
|
||||
|
||||
impl PageserverClient {
|
||||
/// Creates a new Pageserver client for a given tenant and timeline. Uses the Pageservers given
|
||||
/// in the shard spec, which must be complete and must use gRPC URLs.
|
||||
pub fn new(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
shard_spec: ShardSpec,
|
||||
auth_token: Option<String>,
|
||||
compression: Option<CompressionEncoding>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let shards = Shards::new(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_spec,
|
||||
auth_token.clone(),
|
||||
compression,
|
||||
)?;
|
||||
Ok(Self {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
auth_token,
|
||||
compression,
|
||||
shards: ArcSwap::new(Arc::new(shards)),
|
||||
})
|
||||
}
|
||||
|
||||
/// Updates the shards from the given shard spec. In-flight requests will complete using the
|
||||
/// existing shards, but may retry with the new shards if they fail.
|
||||
///
|
||||
/// TODO: verify that in-flight requests are allowed to complete, and that the old pools are
|
||||
/// properly spun down and dropped afterwards.
|
||||
pub fn update_shards(&self, shard_spec: ShardSpec) -> anyhow::Result<()> {
|
||||
// Validate the shard spec. We should really use `ArcSwap::rcu` for this, to avoid races
|
||||
// with concurrent updates, but that involves creating a new `Shards` on every attempt,
|
||||
// which spins up a bunch of Tokio tasks and such. These should already be checked elsewhere
|
||||
// in the stack, and if they're violated then we already have problems elsewhere, so a
|
||||
// best-effort but possibly-racy check is okay here.
|
||||
let old = self.shards.load_full();
|
||||
if shard_spec.count < old.count {
|
||||
return Err(anyhow!(
|
||||
"can't reduce shard count from {} to {}",
|
||||
old.count,
|
||||
shard_spec.count
|
||||
));
|
||||
}
|
||||
if !old.count.is_unsharded() && shard_spec.stripe_size != old.stripe_size {
|
||||
return Err(anyhow!(
|
||||
"can't change stripe size from {} to {}",
|
||||
old.stripe_size,
|
||||
shard_spec.stripe_size
|
||||
));
|
||||
}
|
||||
|
||||
let shards = Shards::new(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
shard_spec,
|
||||
self.auth_token.clone(),
|
||||
self.compression,
|
||||
)?;
|
||||
self.shards.store(Arc::new(shards));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns whether a relation exists.
|
||||
#[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
|
||||
pub async fn check_rel_exists(
|
||||
&self,
|
||||
req: page_api::CheckRelExistsRequest,
|
||||
) -> tonic::Result<page_api::CheckRelExistsResponse> {
|
||||
debug!("sending request: {req:?}");
|
||||
let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
|
||||
// Relation metadata is only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
Self::with_timeout(REQUEST_TIMEOUT, client.check_rel_exists(req)).await
|
||||
})
|
||||
.await?;
|
||||
debug!("received response: {resp:?}");
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
/// Returns the total size of a database, as # of bytes.
|
||||
#[instrument(skip_all, fields(db_oid=%req.db_oid, lsn=%req.read_lsn))]
|
||||
pub async fn get_db_size(
|
||||
&self,
|
||||
req: page_api::GetDbSizeRequest,
|
||||
) -> tonic::Result<page_api::GetDbSizeResponse> {
|
||||
debug!("sending request: {req:?}");
|
||||
let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
|
||||
// Relation metadata is only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
Self::with_timeout(REQUEST_TIMEOUT, client.get_db_size(req)).await
|
||||
})
|
||||
.await?;
|
||||
debug!("received response: {resp:?}");
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
/// Fetches pages. The `request_id` must be unique across all in-flight requests, and the
|
||||
/// `attempt` must be 0 (incremented on retry). Automatically splits requests that straddle
|
||||
/// shard boundaries, and assembles the responses.
|
||||
///
|
||||
/// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status`
|
||||
/// errors. All responses will have `GetPageStatusCode::Ok`.
|
||||
#[instrument(skip_all, fields(
|
||||
req_id = %req.request_id,
|
||||
class = %req.request_class,
|
||||
rel = %req.rel,
|
||||
blkno = %req.block_numbers[0],
|
||||
blks = %req.block_numbers.len(),
|
||||
lsn = %req.read_lsn,
|
||||
))]
|
||||
pub async fn get_page(
|
||||
&self,
|
||||
req: page_api::GetPageRequest,
|
||||
) -> tonic::Result<page_api::GetPageResponse> {
|
||||
// Make sure we have at least one page.
|
||||
if req.block_numbers.is_empty() {
|
||||
return Err(tonic::Status::invalid_argument("no block number"));
|
||||
}
|
||||
// The request attempt must be 0. The client will increment it internally.
|
||||
if req.request_id.attempt != 0 {
|
||||
return Err(tonic::Status::invalid_argument("request attempt must be 0"));
|
||||
}
|
||||
|
||||
debug!("sending request: {req:?}");
|
||||
|
||||
// The shards may change while we're fetching pages. We execute the request using a stable
|
||||
// view of the shards (especially important for requests that span shards), but retry the
|
||||
// top-level (pre-split) request to pick up shard changes. This can lead to unnecessary
|
||||
// retries and re-splits in some cases where requests span shards, but these are expected to
|
||||
// be rare.
|
||||
//
|
||||
// TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this
|
||||
// once we figure out how to handle these.
|
||||
let resp = Self::with_retries(CALL_TIMEOUT, async |attempt| {
|
||||
let mut req = req.clone();
|
||||
req.request_id.attempt = attempt as u32;
|
||||
let shards = self.shards.load_full();
|
||||
Self::with_timeout(REQUEST_TIMEOUT, Self::get_page_with_shards(req, &shards)).await
|
||||
})
|
||||
.await?;
|
||||
|
||||
debug!("received response: {resp:?}");
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
/// Fetches pages using the given shards. This uses a stable view of the shards, regardless of
|
||||
/// concurrent shard updates. Does not retry internally, but is retried by `get_page()`.
|
||||
async fn get_page_with_shards(
|
||||
req: page_api::GetPageRequest,
|
||||
shards: &Shards,
|
||||
) -> tonic::Result<page_api::GetPageResponse> {
|
||||
// Fast path: request is for a single shard.
|
||||
if let Some(shard_id) =
|
||||
GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size)
|
||||
{
|
||||
return Self::get_page_with_shard(req, shards.get(shard_id)?).await;
|
||||
}
|
||||
|
||||
// Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and
|
||||
// reassemble the responses.
|
||||
let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size);
|
||||
|
||||
let mut shard_requests = FuturesUnordered::new();
|
||||
for (shard_id, shard_req) in splitter.drain_requests() {
|
||||
let future = Self::get_page_with_shard(shard_req, shards.get(shard_id)?)
|
||||
.map(move |result| result.map(|resp| (shard_id, resp)));
|
||||
shard_requests.push(future);
|
||||
}
|
||||
|
||||
while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? {
|
||||
splitter.add_response(shard_id, shard_response)?;
|
||||
}
|
||||
|
||||
splitter.get_response()
|
||||
}
|
||||
|
||||
/// Fetches pages on the given shard. Does not retry internally.
|
||||
async fn get_page_with_shard(
|
||||
req: page_api::GetPageRequest,
|
||||
shard: &Shard,
|
||||
) -> tonic::Result<page_api::GetPageResponse> {
|
||||
let mut stream = shard.stream(Self::is_bulk(&req)).await?;
|
||||
let resp = stream.send(req.clone()).await?;
|
||||
|
||||
// Convert per-request errors into a tonic::Status.
|
||||
if resp.status_code != page_api::GetPageStatusCode::Ok {
|
||||
return Err(tonic::Status::new(
|
||||
resp.status_code.into(),
|
||||
resp.reason.unwrap_or_else(|| String::from("unknown error")),
|
||||
));
|
||||
}
|
||||
|
||||
// Check that we received the expected pages.
|
||||
if req.rel != resp.rel {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"shard {} returned wrong relation, expected {} got {}",
|
||||
shard.id, req.rel, resp.rel
|
||||
)));
|
||||
}
|
||||
if !req
|
||||
.block_numbers
|
||||
.iter()
|
||||
.copied()
|
||||
.eq(resp.pages.iter().map(|p| p.block_number))
|
||||
{
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"shard {} returned wrong pages, expected {:?} got {:?}",
|
||||
shard.id,
|
||||
req.block_numbers,
|
||||
resp.pages
|
||||
.iter()
|
||||
.map(|page| page.block_number)
|
||||
.collect::<Vec<_>>()
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
/// Returns the size of a relation, as # of blocks.
|
||||
#[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
|
||||
pub async fn get_rel_size(
|
||||
&self,
|
||||
req: page_api::GetRelSizeRequest,
|
||||
) -> tonic::Result<page_api::GetRelSizeResponse> {
|
||||
debug!("sending request: {req:?}");
|
||||
let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
|
||||
// Relation metadata is only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
Self::with_timeout(REQUEST_TIMEOUT, client.get_rel_size(req)).await
|
||||
})
|
||||
.await?;
|
||||
debug!("received response: {resp:?}");
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
/// Fetches an SLRU segment.
|
||||
#[instrument(skip_all, fields(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn))]
|
||||
pub async fn get_slru_segment(
|
||||
&self,
|
||||
req: page_api::GetSlruSegmentRequest,
|
||||
) -> tonic::Result<page_api::GetSlruSegmentResponse> {
|
||||
debug!("sending request: {req:?}");
|
||||
let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
|
||||
// SLRU segments are only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
Self::with_timeout(REQUEST_TIMEOUT, client.get_slru_segment(req)).await
|
||||
})
|
||||
.await?;
|
||||
debug!("received response: {resp:?}");
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
/// Runs the given async closure with retries up to the given timeout. Only certain gRPC status
|
||||
/// codes are retried, see [`Retry::should_retry`]. Returns `DeadlineExceeded` on timeout.
|
||||
async fn with_retries<T, F, O>(timeout: Duration, f: F) -> tonic::Result<T>
|
||||
where
|
||||
F: FnMut(usize) -> O, // pass attempt number, starting at 0
|
||||
O: Future<Output = tonic::Result<T>>,
|
||||
{
|
||||
Retry {
|
||||
timeout: Some(timeout),
|
||||
base_backoff: BASE_BACKOFF,
|
||||
max_backoff: MAX_BACKOFF,
|
||||
}
|
||||
.with(f)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Runs the given future with a timeout. Returns `DeadlineExceeded` on timeout.
|
||||
async fn with_timeout<T>(
|
||||
timeout: Duration,
|
||||
f: impl Future<Output = tonic::Result<T>>,
|
||||
) -> tonic::Result<T> {
|
||||
let started = Instant::now();
|
||||
tokio::time::timeout(timeout, f).await.map_err(|_| {
|
||||
tonic::Status::deadline_exceeded(format!(
|
||||
"request timed out after {:.3}s",
|
||||
started.elapsed().as_secs_f64()
|
||||
))
|
||||
})?
|
||||
}
|
||||
|
||||
/// Returns true if the request is considered a bulk request and should use the bulk pool.
|
||||
fn is_bulk(req: &page_api::GetPageRequest) -> bool {
|
||||
req.block_numbers.len() >= BULK_THRESHOLD_BATCH_SIZE
|
||||
}
|
||||
}
|
||||
|
||||
/// Shard specification for a PageserverClient.
|
||||
pub struct ShardSpec {
|
||||
/// Maps shard indices to gRPC URLs.
|
||||
///
|
||||
/// INVARIANT: every shard 0..count is present, and shard 0 is always present.
|
||||
/// INVARIANT: every URL is valid and uses grpc:// scheme.
|
||||
urls: HashMap<ShardIndex, String>,
|
||||
/// The shard count.
|
||||
///
|
||||
/// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention.
|
||||
count: ShardCount,
|
||||
/// The stripe size for these shards.
|
||||
stripe_size: ShardStripeSize,
|
||||
}
|
||||
|
||||
impl ShardSpec {
|
||||
/// Creates a new shard spec with the given URLs and stripe size. All shards must be given.
|
||||
/// The stripe size may be omitted for unsharded tenants.
|
||||
pub fn new(
|
||||
urls: HashMap<ShardIndex, String>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Compute the shard count.
|
||||
let count = match urls.len() {
|
||||
0 => return Err(anyhow!("no shards provided")),
|
||||
1 => ShardCount::new(0), // NB: unsharded tenants use 0, like `ShardIndex::unsharded()`
|
||||
n if n > u8::MAX as usize => return Err(anyhow!("too many shards: {n}")),
|
||||
n => ShardCount::new(n as u8),
|
||||
};
|
||||
|
||||
// Determine the stripe size. It doesn't matter for unsharded tenants.
|
||||
if stripe_size.is_none() && !count.is_unsharded() {
|
||||
return Err(anyhow!("stripe size must be given for sharded tenants"));
|
||||
}
|
||||
let stripe_size = stripe_size.unwrap_or_default();
|
||||
|
||||
// Validate the shard spec.
|
||||
for (shard_id, url) in &urls {
|
||||
// The shard index must match the computed shard count, even for unsharded tenants.
|
||||
if shard_id.shard_count != count {
|
||||
return Err(anyhow!("invalid shard index {shard_id}, expected {count}"));
|
||||
}
|
||||
// The shard index' number and count must be consistent.
|
||||
if !shard_id.is_unsharded() && shard_id.shard_number.0 >= shard_id.shard_count.0 {
|
||||
return Err(anyhow!("invalid shard index {shard_id}"));
|
||||
}
|
||||
// The above conditions guarantee that we have all shards 0..count: len() matches count,
|
||||
// shard number < count, and numbers are unique (via hashmap).
|
||||
|
||||
// Validate the URL.
|
||||
if PageserverProtocol::from_connstring(url)? != PageserverProtocol::Grpc {
|
||||
return Err(anyhow!("invalid shard URL {url}: must use gRPC"));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
urls,
|
||||
count,
|
||||
stripe_size,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Tracks the tenant's shards.
|
||||
struct Shards {
|
||||
/// Shards by shard index.
|
||||
///
|
||||
/// INVARIANT: every shard 0..count is present.
|
||||
/// INVARIANT: shard 0 is always present.
|
||||
by_index: HashMap<ShardIndex, Shard>,
|
||||
/// The shard count.
|
||||
///
|
||||
/// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention.
|
||||
count: ShardCount,
|
||||
/// The stripe size. Only used for sharded tenants.
|
||||
stripe_size: ShardStripeSize,
|
||||
}
|
||||
|
||||
impl Shards {
|
||||
/// Creates a new set of shards based on a shard spec.
|
||||
fn new(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
shard_spec: ShardSpec,
|
||||
auth_token: Option<String>,
|
||||
compression: Option<CompressionEncoding>,
|
||||
) -> anyhow::Result<Self> {
|
||||
// NB: the shard spec has already been validated when constructed.
|
||||
let mut shards = HashMap::with_capacity(shard_spec.urls.len());
|
||||
for (shard_id, url) in shard_spec.urls {
|
||||
shards.insert(
|
||||
shard_id,
|
||||
Shard::new(
|
||||
url,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token.clone(),
|
||||
compression,
|
||||
)?,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
by_index: shards,
|
||||
count: shard_spec.count,
|
||||
stripe_size: shard_spec.stripe_size,
|
||||
})
|
||||
}
|
||||
|
||||
/// Looks up the given shard.
|
||||
#[allow(clippy::result_large_err)] // TODO: check perf impact
|
||||
fn get(&self, shard_id: ShardIndex) -> tonic::Result<&Shard> {
|
||||
self.by_index
|
||||
.get(&shard_id)
|
||||
.ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}")))
|
||||
}
|
||||
|
||||
/// Returns shard 0.
|
||||
fn get_zero(&self) -> &Shard {
|
||||
self.get(ShardIndex::new(ShardNumber(0), self.count))
|
||||
.expect("always present")
|
||||
}
|
||||
}
|
||||
|
||||
/// A single shard. Has dedicated resource pools with the following structure:
|
||||
///
|
||||
/// * Channel pool: MAX_CLIENTS_PER_CHANNEL.
|
||||
/// * Client pool: unbounded.
|
||||
/// * Stream pool: unbounded.
|
||||
/// * Bulk channel pool: MAX_BULK_CLIENTS_PER_CHANNEL.
|
||||
/// * Bulk client pool: unbounded.
|
||||
/// * Bulk stream pool: unbounded.
|
||||
///
|
||||
/// We use a separate bulk channel pool with a lower concurrency limit for large batch requests.
|
||||
/// This avoids TCP-level head-of-line blocking, and also concentrates large window sizes on a
|
||||
/// smaller set of streams/connections, which presumably reduces memory use. Neither of these pools
|
||||
/// are bounded, nor do they pipeline requests, so the latency characteristics should be mostly
|
||||
/// similar (except for TCP transmission time).
|
||||
///
|
||||
/// TODO: since we never use bounded pools, we could consider removing the pool limiters. However,
|
||||
/// the code is fairly trivial, so we may as well keep them around for now in case we need them.
|
||||
struct Shard {
|
||||
/// The shard ID.
|
||||
id: ShardIndex,
|
||||
/// Unary gRPC client pool.
|
||||
client_pool: Arc<ClientPool>,
|
||||
/// GetPage stream pool.
|
||||
stream_pool: Arc<StreamPool>,
|
||||
/// GetPage stream pool for bulk requests.
|
||||
bulk_stream_pool: Arc<StreamPool>,
|
||||
}
|
||||
|
||||
impl Shard {
|
||||
/// Creates a new shard. It has its own dedicated resource pools.
|
||||
fn new(
|
||||
url: String,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
shard_id: ShardIndex,
|
||||
auth_token: Option<String>,
|
||||
compression: Option<CompressionEncoding>,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Shard pools for unary requests and non-bulk GetPage requests.
|
||||
let client_pool = ClientPool::new(
|
||||
ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token.clone(),
|
||||
compression,
|
||||
None, // unbounded
|
||||
);
|
||||
let stream_pool = StreamPool::new(client_pool.clone(), None); // unbounded
|
||||
|
||||
// Bulk GetPage stream pool for large batches (prefetches, sequential scans, vacuum, etc.).
|
||||
let bulk_stream_pool = StreamPool::new(
|
||||
ClientPool::new(
|
||||
ChannelPool::new(url, MAX_BULK_CLIENTS_PER_CHANNEL)?,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token,
|
||||
compression,
|
||||
None, // unbounded,
|
||||
),
|
||||
None, // unbounded
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
id: shard_id,
|
||||
client_pool,
|
||||
stream_pool,
|
||||
bulk_stream_pool,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns a pooled client for this shard.
|
||||
#[instrument(skip_all)]
|
||||
async fn client(&self) -> tonic::Result<ClientGuard> {
|
||||
warn_slow(
|
||||
"client pool acquisition",
|
||||
SLOW_THRESHOLD,
|
||||
pin!(self.client_pool.get()),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk pool.
|
||||
#[instrument(skip_all, fields(bulk))]
|
||||
async fn stream(&self, bulk: bool) -> tonic::Result<StreamGuard> {
|
||||
let pool = match bulk {
|
||||
false => &self.stream_pool,
|
||||
true => &self.bulk_stream_pool,
|
||||
};
|
||||
warn_slow("stream pool acquisition", SLOW_THRESHOLD, pin!(pool.get())).await
|
||||
}
|
||||
}
|
||||
@@ -1,14 +1,6 @@
|
||||
//! A rich Pageserver gRPC client. This client is more capable than the basic `page_api::Client`
|
||||
//! gRPC client, and supports:
|
||||
//!
|
||||
//! * Sharded tenants across multiple Pageservers.
|
||||
//! * Pooling of connections, clients, and streams for efficient resource use.
|
||||
//! * Concurrent use by many callers.
|
||||
//! * Internal handling of GetPage bidirectional streams.
|
||||
//! * Automatic retries.
|
||||
//! * Observability.
|
||||
//!
|
||||
//! The client is under development, this package is just a shell.
|
||||
|
||||
#[allow(unused)]
|
||||
mod client;
|
||||
mod pool;
|
||||
mod retry;
|
||||
mod split;
|
||||
|
||||
pub use client::{PageserverClient, ShardSpec};
|
||||
|
||||
@@ -9,19 +9,36 @@
|
||||
//!
|
||||
//! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients
|
||||
//! can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a
|
||||
//! per-channel client limit. Channels may be closed when they are no longer used by any clients.
|
||||
//! per-channel client limit. Channels are closed immediately when empty, and indirectly rely on
|
||||
//! client/stream idle timeouts.
|
||||
//!
|
||||
//! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared)
|
||||
//! channel from the ChannelPool for the client's lifetime. A client can only be acquired by a
|
||||
//! single caller at a time, and is returned to the pool when dropped. Idle clients may be removed
|
||||
//! from the pool after some time, to free up the channel.
|
||||
//! single caller at a time, and is returned to the pool when dropped. Idle clients are removed
|
||||
//! from the pool after a while to free up resources.
|
||||
//!
|
||||
//! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the
|
||||
//! ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it
|
||||
//! returns a guard that can be used to send a single request, to properly enforce queue depth and
|
||||
//! route responses. Internally, the pool will reuse or spin up a suitable stream for the request,
|
||||
//! possibly pipelining multiple requests from multiple callers on the same stream (up to some
|
||||
//! queue depth). Idle streams may be removed from the pool after a while to free up the client.
|
||||
//! ClientPool for the stream's lifetime. A stream can only be acquired by a single caller at a
|
||||
//! time, and is returned to the pool when dropped. Idle streams are removed from the pool after
|
||||
//! a while to free up resources.
|
||||
//!
|
||||
//! The stream only supports sending a single, synchronous request at a time, and does not support
|
||||
//! pipelining multiple requests from different callers onto the same stream -- instead, we scale
|
||||
//! out concurrent streams to improve throughput. There are many reasons for this design choice:
|
||||
//!
|
||||
//! * It (mostly) eliminates head-of-line blocking. A single stream is processed sequentially by
|
||||
//! a single server task, which may block e.g. on layer downloads, LSN waits, etc.
|
||||
//!
|
||||
//! * Cancellation becomes trivial, by closing the stream. Otherwise, if a caller goes away
|
||||
//! (e.g. because of a timeout), the request would still be processed by the server and block
|
||||
//! requests behind it in the stream. It might even block its own timeout retry.
|
||||
//!
|
||||
//! * Stream scheduling becomes significantly simpler and cheaper.
|
||||
//!
|
||||
//! * Individual callers can still use client-side batching for pipelining.
|
||||
//!
|
||||
//! * Idle streams are cheap. Benchmarks show that an idle GetPage stream takes up about 26 KB
|
||||
//! per stream (2.5 GB for 100,000 streams), so we can afford to scale out.
|
||||
//!
|
||||
//! Each channel corresponds to one TCP connection. Each client unary request and each stream
|
||||
//! corresponds to one HTTP/2 stream and server task.
|
||||
@@ -29,46 +46,56 @@
|
||||
//! TODO: error handling (including custom error types).
|
||||
//! TODO: observability.
|
||||
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::collections::BTreeMap;
|
||||
use std::num::NonZero;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::pin::Pin;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Mutex, Weak};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use futures::StreamExt as _;
|
||||
use tokio::sync::mpsc::{Receiver, Sender};
|
||||
use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
|
||||
use futures::{Stream, StreamExt as _};
|
||||
use tokio::sync::{OwnedSemaphorePermit, Semaphore, watch};
|
||||
use tokio_stream::wrappers::WatchStream;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tonic::codec::CompressionEncoding;
|
||||
use tonic::transport::{Channel, Endpoint};
|
||||
use tracing::{error, warn};
|
||||
|
||||
use pageserver_page_api as page_api;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
/// Max number of concurrent clients per channel.
|
||||
/// Reap clients/streams that have been idle for this long. Channels are reaped immediately when
|
||||
/// empty, and indirectly rely on the client/stream idle timeouts.
|
||||
///
|
||||
/// TODO: tune these constants, and make them configurable.
|
||||
/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels
|
||||
/// with only streams.
|
||||
const CLIENTS_PER_CHANNEL: usize = 16;
|
||||
/// A stream's client will be reaped after 2x the idle threshold (first stream the client), but
|
||||
/// that's okay -- if the stream closes abruptly (e.g. due to timeout or cancellation), we want to
|
||||
/// keep its client around in the pool for a while.
|
||||
const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) {
|
||||
false => Duration::from_secs(180),
|
||||
true => Duration::from_secs(1), // exercise reaping in tests
|
||||
};
|
||||
|
||||
/// Maximum number of concurrent clients per `ClientPool`.
|
||||
const CLIENT_LIMIT: usize = 64;
|
||||
|
||||
/// Max number of pipelined requests per gRPC GetPage stream.
|
||||
const STREAM_QUEUE_DEPTH: usize = 2;
|
||||
/// Reap idle resources with this interval.
|
||||
const REAP_IDLE_INTERVAL: Duration = match cfg!(any(test, feature = "testing")) {
|
||||
false => Duration::from_secs(10),
|
||||
true => Duration::from_secs(1), // exercise reaping in tests
|
||||
};
|
||||
|
||||
/// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2
|
||||
/// stream multiplexing), up to `CLIENTS_PER_CHANNEL`. The pool does not limit the number of
|
||||
/// channels, and instead relies on `ClientPool` to limit the number of concurrent clients.
|
||||
/// stream multiplexing), up to `clients_per_channel` -- a new channel will be spun up beyond this.
|
||||
/// The pool does not limit the number of channels, and instead relies on `ClientPool` or
|
||||
/// `StreamPool` to limit the number of concurrent clients.
|
||||
///
|
||||
/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
|
||||
///
|
||||
/// TODO: reap idle channels.
|
||||
/// TODO: consider prewarming a set of channels, to avoid initial connection latency.
|
||||
/// TODO: consider adding a circuit breaker for errors and fail fast.
|
||||
pub struct ChannelPool {
|
||||
/// Pageserver endpoint to connect to.
|
||||
endpoint: Endpoint,
|
||||
/// Max number of clients per channel. Beyond this, a new channel will be created.
|
||||
max_clients_per_channel: NonZero<usize>,
|
||||
/// Open channels.
|
||||
channels: Mutex<BTreeMap<ChannelID, ChannelEntry>>,
|
||||
/// Channel ID generator.
|
||||
@@ -86,13 +113,14 @@ struct ChannelEntry {
|
||||
|
||||
impl ChannelPool {
|
||||
/// Creates a new channel pool for the given Pageserver endpoint.
|
||||
pub fn new<E>(endpoint: E) -> anyhow::Result<Arc<Self>>
|
||||
pub fn new<E>(endpoint: E, max_clients_per_channel: NonZero<usize>) -> anyhow::Result<Arc<Self>>
|
||||
where
|
||||
E: TryInto<Endpoint> + Send + Sync + 'static,
|
||||
<E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
|
||||
{
|
||||
Ok(Arc::new(Self {
|
||||
endpoint: endpoint.try_into()?,
|
||||
max_clients_per_channel,
|
||||
channels: Mutex::default(),
|
||||
next_channel_id: AtomicUsize::default(),
|
||||
}))
|
||||
@@ -116,12 +144,16 @@ impl ChannelPool {
|
||||
let mut channels = self.channels.lock().unwrap();
|
||||
|
||||
// Try to find an existing channel with available capacity. We check entries in BTreeMap
|
||||
// order, to fill up the lower-ordered channels first. The ClientPool also prefers clients
|
||||
// with lower-ordered channel IDs first. This will cluster clients in lower-ordered
|
||||
// order, to fill up the lower-ordered channels first. The client/stream pools also prefer
|
||||
// clients with lower-ordered channel IDs first. This will cluster clients in lower-ordered
|
||||
// channels, and free up higher-ordered channels such that they can be reaped.
|
||||
for (&id, entry) in channels.iter_mut() {
|
||||
assert!(entry.clients <= CLIENTS_PER_CHANNEL, "channel overflow");
|
||||
if entry.clients < CLIENTS_PER_CHANNEL {
|
||||
assert!(
|
||||
entry.clients <= self.max_clients_per_channel.get(),
|
||||
"channel overflow"
|
||||
);
|
||||
assert_ne!(entry.clients, 0, "empty channel not reaped");
|
||||
if entry.clients < self.max_clients_per_channel.get() {
|
||||
entry.clients += 1;
|
||||
return ChannelGuard {
|
||||
pool: Arc::downgrade(self),
|
||||
@@ -166,26 +198,30 @@ impl ChannelGuard {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the channel to the pool.
|
||||
/// Returns the channel to the pool. The channel is closed when empty.
|
||||
impl Drop for ChannelGuard {
|
||||
fn drop(&mut self) {
|
||||
let Some(pool) = self.pool.upgrade() else {
|
||||
return; // pool was dropped
|
||||
};
|
||||
|
||||
let mut channels = pool.channels.lock().unwrap();
|
||||
let entry = channels.get_mut(&self.id).expect("unknown channel");
|
||||
assert!(entry.clients > 0, "channel underflow");
|
||||
entry.clients -= 1;
|
||||
|
||||
// Reap empty channels immediately.
|
||||
if entry.clients == 0 {
|
||||
channels.remove(&self.id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A pool of gRPC clients for a single tenant shard. Each client acquires a channel from the inner
|
||||
/// `ChannelPool`. A client is only given out to single caller at a time. The pool limits the total
|
||||
/// number of concurrent clients to `CLIENT_LIMIT` via semaphore.
|
||||
/// number of concurrent clients to `max_clients` via semaphore.
|
||||
///
|
||||
/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
|
||||
///
|
||||
/// TODO: reap idle clients.
|
||||
pub struct ClientPool {
|
||||
/// Tenant ID.
|
||||
tenant_id: TenantId,
|
||||
@@ -195,17 +231,20 @@ pub struct ClientPool {
|
||||
shard_id: ShardIndex,
|
||||
/// Authentication token, if any.
|
||||
auth_token: Option<String>,
|
||||
/// Compression to use.
|
||||
compression: Option<CompressionEncoding>,
|
||||
/// Channel pool to acquire channels from.
|
||||
channel_pool: Arc<ChannelPool>,
|
||||
/// Limits the max number of concurrent clients for this pool.
|
||||
limiter: Arc<Semaphore>,
|
||||
/// Limits the max number of concurrent clients for this pool. None if the pool is unbounded.
|
||||
limiter: Option<Arc<Semaphore>>,
|
||||
/// Idle pooled clients. Acquired clients are removed from here and returned on drop.
|
||||
///
|
||||
/// The first client in the map will be acquired next. The map is sorted by client ID, which in
|
||||
/// turn is sorted by its channel ID, such that we prefer acquiring idle clients from
|
||||
/// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle
|
||||
/// clients are reaped.
|
||||
/// lower-ordered channels. This allows us to free up and reap higher-ordered channels.
|
||||
idle: Mutex<BTreeMap<ClientID, ClientEntry>>,
|
||||
/// Reaps idle clients.
|
||||
idle_reaper: Reaper,
|
||||
/// Unique client ID generator.
|
||||
next_client_id: AtomicUsize,
|
||||
}
|
||||
@@ -217,44 +256,53 @@ struct ClientEntry {
|
||||
client: page_api::Client,
|
||||
/// The channel guard for the channel used by the client.
|
||||
channel_guard: ChannelGuard,
|
||||
/// The client has been idle since this time. All clients in `ClientPool::idle` are idle by
|
||||
/// definition, so this is the time when it was added back to the pool.
|
||||
idle_since: Instant,
|
||||
}
|
||||
|
||||
impl ClientPool {
|
||||
/// Creates a new client pool for the given tenant shard. Channels are acquired from the given
|
||||
/// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard.
|
||||
/// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard. Allows up to
|
||||
/// `max_clients` concurrent clients, or unbounded if None.
|
||||
pub fn new(
|
||||
channel_pool: Arc<ChannelPool>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
shard_id: ShardIndex,
|
||||
auth_token: Option<String>,
|
||||
compression: Option<CompressionEncoding>,
|
||||
max_clients: Option<NonZero<usize>>,
|
||||
) -> Arc<Self> {
|
||||
Arc::new(Self {
|
||||
let pool = Arc::new(Self {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token,
|
||||
compression,
|
||||
channel_pool,
|
||||
idle: Mutex::default(),
|
||||
limiter: Arc::new(Semaphore::new(CLIENT_LIMIT)),
|
||||
idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
|
||||
limiter: max_clients.map(|max| Arc::new(Semaphore::new(max.get()))),
|
||||
next_client_id: AtomicUsize::default(),
|
||||
})
|
||||
});
|
||||
pool.idle_reaper.spawn(&pool);
|
||||
pool
|
||||
}
|
||||
|
||||
/// Gets a client from the pool, or creates a new one if necessary. Connections are established
|
||||
/// lazily and do not block, but this call can block if the pool is at `CLIENT_LIMIT`. The
|
||||
/// client is returned to the pool when the guard is dropped.
|
||||
/// lazily and do not block, but this call can block if the pool is at `max_clients`. The client
|
||||
/// is returned to the pool when the guard is dropped.
|
||||
///
|
||||
/// This is moderately performance-sensitive. It is called for every unary request, but these
|
||||
/// establish a new gRPC stream per request so they're already expensive. GetPage requests use
|
||||
/// the `StreamPool` instead.
|
||||
pub async fn get(self: &Arc<Self>) -> anyhow::Result<ClientGuard> {
|
||||
let permit = self
|
||||
.limiter
|
||||
.clone()
|
||||
.acquire_owned()
|
||||
.await
|
||||
.expect("never closed");
|
||||
pub async fn get(self: &Arc<Self>) -> tonic::Result<ClientGuard> {
|
||||
// Acquire a permit if the pool is bounded.
|
||||
let mut permit = None;
|
||||
if let Some(limiter) = self.limiter.clone() {
|
||||
permit = Some(limiter.acquire_owned().await.expect("never closed"));
|
||||
}
|
||||
|
||||
// Fast path: acquire an idle client from the pool.
|
||||
if let Some((id, entry)) = self.idle.lock().unwrap().pop_first() {
|
||||
@@ -267,7 +315,7 @@ impl ClientPool {
|
||||
});
|
||||
}
|
||||
|
||||
// Slow path: construct a new client.
|
||||
// Construct a new client.
|
||||
let mut channel_guard = self.channel_pool.get();
|
||||
let client = page_api::Client::new(
|
||||
channel_guard.take(),
|
||||
@@ -275,8 +323,9 @@ impl ClientPool {
|
||||
self.timeline_id,
|
||||
self.shard_id,
|
||||
self.auth_token.clone(),
|
||||
None,
|
||||
)?;
|
||||
self.compression,
|
||||
)
|
||||
.map_err(|err| tonic::Status::internal(format!("failed to create client: {err}")))?;
|
||||
|
||||
Ok(ClientGuard {
|
||||
pool: Arc::downgrade(self),
|
||||
@@ -291,14 +340,24 @@ impl ClientPool {
|
||||
}
|
||||
}
|
||||
|
||||
impl Reapable for ClientPool {
|
||||
/// Reaps clients that have been idle since before the cutoff.
|
||||
fn reap_idle(&self, cutoff: Instant) {
|
||||
self.idle
|
||||
.lock()
|
||||
.unwrap()
|
||||
.retain(|_, entry| entry.idle_since >= cutoff)
|
||||
}
|
||||
}
|
||||
|
||||
/// A client acquired from the pool. The inner client can be accessed via Deref. The client is
|
||||
/// returned to the pool when dropped.
|
||||
pub struct ClientGuard {
|
||||
pool: Weak<ClientPool>,
|
||||
id: ClientID,
|
||||
client: Option<page_api::Client>, // Some until dropped
|
||||
channel_guard: Option<ChannelGuard>, // Some until dropped
|
||||
permit: OwnedSemaphorePermit,
|
||||
client: Option<page_api::Client>, // Some until dropped
|
||||
channel_guard: Option<ChannelGuard>, // Some until dropped
|
||||
permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
|
||||
}
|
||||
|
||||
impl Deref for ClientGuard {
|
||||
@@ -321,9 +380,11 @@ impl Drop for ClientGuard {
|
||||
let Some(pool) = self.pool.upgrade() else {
|
||||
return; // pool was dropped
|
||||
};
|
||||
|
||||
let entry = ClientEntry {
|
||||
client: self.client.take().expect("dropped once"),
|
||||
channel_guard: self.channel_guard.take().expect("dropped once"),
|
||||
idle_since: Instant::now(),
|
||||
};
|
||||
pool.idle.lock().unwrap().insert(self.id, entry);
|
||||
|
||||
@@ -334,253 +395,268 @@ impl Drop for ClientGuard {
|
||||
/// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream
|
||||
/// acquires a client from the inner `ClientPool` for the stream's lifetime.
|
||||
///
|
||||
/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send
|
||||
/// a single request and await the response. Internally, requests are multiplexed across streams and
|
||||
/// channels. This allows proper queue depth enforcement and response routing.
|
||||
/// Individual streams only send a single request at a time, and do not pipeline multiple callers
|
||||
/// onto the same stream. Instead, we scale out the number of concurrent streams. This is primarily
|
||||
/// to eliminate head-of-line blocking. See the module documentation for more details.
|
||||
///
|
||||
/// TODO: reap idle streams.
|
||||
/// TODO: consider making this generic over request and response types; not currently needed.
|
||||
pub struct StreamPool {
|
||||
/// The client pool to acquire clients from.
|
||||
/// The client pool to acquire clients from. Must be unbounded.
|
||||
client_pool: Arc<ClientPool>,
|
||||
/// All pooled streams.
|
||||
/// Idle pooled streams. Acquired streams are removed from here and returned on drop.
|
||||
///
|
||||
/// Incoming requests will be sent over an existing stream with available capacity. If all
|
||||
/// streams are full, a new one is spun up and added to the pool (up to the `ClientPool` limit).
|
||||
/// Each stream has an associated Tokio task that processes requests and responses.
|
||||
streams: Arc<Mutex<HashMap<StreamID, StreamEntry>>>,
|
||||
/// Limits the max number of concurrent requests (not streams).
|
||||
limiter: Arc<Semaphore>,
|
||||
/// Stream ID generator.
|
||||
next_stream_id: AtomicUsize,
|
||||
/// The first stream in the map will be acquired next. The map is sorted by stream ID, which is
|
||||
/// equivalent to the client ID and in turn sorted by its channel ID. This way we prefer
|
||||
/// acquiring idle streams from lower-ordered channels, which allows us to free up and reap
|
||||
/// higher-ordered channels.
|
||||
idle: Mutex<BTreeMap<StreamID, StreamEntry>>,
|
||||
/// Limits the max number of concurrent streams. None if the pool is unbounded.
|
||||
limiter: Option<Arc<Semaphore>>,
|
||||
/// Reaps idle streams.
|
||||
idle_reaper: Reaper,
|
||||
}
|
||||
|
||||
type StreamID = usize;
|
||||
type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>;
|
||||
type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>;
|
||||
type ResponseSender = oneshot::Sender<tonic::Result<page_api::GetPageResponse>>;
|
||||
/// The stream ID. Reuses the inner client ID.
|
||||
type StreamID = ClientID;
|
||||
|
||||
/// A pooled stream.
|
||||
struct StreamEntry {
|
||||
/// Sends caller requests to the stream task. The stream task exits when this is dropped.
|
||||
sender: RequestSender,
|
||||
/// Number of in-flight requests on this stream. This is an atomic to allow decrementing it on
|
||||
/// completion without acquiring the `StreamPool::streams` lock.
|
||||
queue_depth: Arc<AtomicUsize>,
|
||||
/// The bidirectional stream.
|
||||
stream: BiStream,
|
||||
/// The time when this stream was last used, i.e. when it was put back into `StreamPool::idle`.
|
||||
idle_since: Instant,
|
||||
}
|
||||
|
||||
/// A bidirectional GetPage stream and its client. Can send requests and receive responses.
|
||||
struct BiStream {
|
||||
/// The owning client. Holds onto the channel slot while the stream is alive.
|
||||
client: ClientGuard,
|
||||
/// Stream for sending requests. Uses a watch channel, so it can only send a single request at a
|
||||
/// time, and the caller must await the response before sending another request. This is
|
||||
/// enforced by `StreamGuard::send`.
|
||||
sender: watch::Sender<page_api::GetPageRequest>,
|
||||
/// Stream for receiving responses.
|
||||
receiver: Pin<Box<dyn Stream<Item = tonic::Result<page_api::GetPageResponse>> + Send>>,
|
||||
}
|
||||
|
||||
impl StreamPool {
|
||||
/// Creates a new stream pool, using the given client pool.
|
||||
/// Creates a new stream pool, using the given client pool. It will use up to `max_streams`
|
||||
/// concurrent streams.
|
||||
///
|
||||
/// NB: the stream pool should use a dedicated client pool. Otherwise, long-lived streams may
|
||||
/// fill up the client pool and starve out unary requests. Client pools can share the same
|
||||
/// `ChannelPool` though, since the channel pool is unbounded.
|
||||
pub fn new(client_pool: Arc<ClientPool>) -> Arc<Self> {
|
||||
Arc::new(Self {
|
||||
/// The client pool must be unbounded. The stream pool will enforce its own limits, and because
|
||||
/// streams are long-lived they can cause persistent starvation if they exhaust the client pool.
|
||||
/// The stream pool should generally have its own dedicated client pool (but it can share a
|
||||
/// channel pool with others since these are always unbounded).
|
||||
pub fn new(client_pool: Arc<ClientPool>, max_streams: Option<NonZero<usize>>) -> Arc<Self> {
|
||||
assert!(client_pool.limiter.is_none(), "bounded client pool");
|
||||
let pool = Arc::new(Self {
|
||||
client_pool,
|
||||
streams: Arc::default(),
|
||||
limiter: Arc::new(Semaphore::new(CLIENT_LIMIT * STREAM_QUEUE_DEPTH)),
|
||||
next_stream_id: AtomicUsize::default(),
|
||||
})
|
||||
idle: Mutex::default(),
|
||||
limiter: max_streams.map(|max_streams| Arc::new(Semaphore::new(max_streams.get()))),
|
||||
idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
|
||||
});
|
||||
pool.idle_reaper.spawn(&pool);
|
||||
pool
|
||||
}
|
||||
|
||||
/// Acquires an available stream from the pool, or spins up a new stream async if all streams
|
||||
/// are full. Returns a guard that can be used to send a single request on the stream and await
|
||||
/// the response, with queue depth quota already acquired. Blocks if the pool is at capacity
|
||||
/// (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight).
|
||||
/// Acquires an available stream from the pool, or spins up a new stream if all streams are
|
||||
/// full. Returns a guard that can be used to send requests and await the responses. Blocks if
|
||||
/// the pool is full.
|
||||
///
|
||||
/// This is very performance-sensitive, as it is on the GetPage hot path.
|
||||
///
|
||||
/// TODO: this must do something more sophisticated for performance. We want:
|
||||
///
|
||||
/// * Cheap, concurrent access in the common case where we can use a pooled stream.
|
||||
/// * Quick acquisition of pooled streams with available capacity.
|
||||
/// * Prefer streams that belong to lower-numbered channels, to reap idle channels.
|
||||
/// * Prefer filling up existing streams' queue depth before spinning up new streams.
|
||||
/// * Don't hold a lock while spinning up new streams.
|
||||
/// * Allow concurrent clients to join onto streams while they're spun up.
|
||||
/// * Allow spinning up multiple streams concurrently, but don't overshoot limits.
|
||||
///
|
||||
/// For now, we just do something simple and functional, but very inefficient (linear scan).
|
||||
pub async fn get(&self) -> StreamGuard {
|
||||
let permit = self
|
||||
.limiter
|
||||
.clone()
|
||||
.acquire_owned()
|
||||
.await
|
||||
.expect("never closed");
|
||||
let mut streams = self.streams.lock().unwrap();
|
||||
|
||||
// Look for a pooled stream with available capacity.
|
||||
for entry in streams.values() {
|
||||
assert!(
|
||||
entry.queue_depth.load(Ordering::Relaxed) <= STREAM_QUEUE_DEPTH,
|
||||
"stream queue overflow"
|
||||
);
|
||||
if entry
|
||||
.queue_depth
|
||||
.fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| {
|
||||
// Increment the queue depth via compare-and-swap.
|
||||
// TODO: review ordering.
|
||||
(queue_depth < STREAM_QUEUE_DEPTH).then_some(queue_depth + 1)
|
||||
})
|
||||
.is_ok()
|
||||
{
|
||||
return StreamGuard {
|
||||
sender: entry.sender.clone(),
|
||||
queue_depth: entry.queue_depth.clone(),
|
||||
permit,
|
||||
};
|
||||
}
|
||||
/// TODO: is a `Mutex<BTreeMap>` performant enough? Will it become too contended? We can't
|
||||
/// trivially use e.g. DashMap or sharding, because we want to pop lower-ordered streams first
|
||||
/// to free up higher-ordered channels.
|
||||
pub async fn get(self: &Arc<Self>) -> tonic::Result<StreamGuard> {
|
||||
// Acquire a permit if the pool is bounded.
|
||||
let mut permit = None;
|
||||
if let Some(limiter) = self.limiter.clone() {
|
||||
permit = Some(limiter.acquire_owned().await.expect("never closed"));
|
||||
}
|
||||
|
||||
// No available stream, spin up a new one. We install the stream entry in the pool first and
|
||||
// return the guard, while spinning up the stream task async. This allows other callers to
|
||||
// join onto this stream and also create additional streams concurrently if this fills up.
|
||||
let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
|
||||
let queue_depth = Arc::new(AtomicUsize::new(1)); // reserve quota for this caller
|
||||
let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH);
|
||||
let entry = StreamEntry {
|
||||
sender: req_tx.clone(),
|
||||
queue_depth: queue_depth.clone(),
|
||||
};
|
||||
streams.insert(id, entry);
|
||||
// Fast path: acquire an idle stream from the pool.
|
||||
if let Some((_, entry)) = self.idle.lock().unwrap().pop_first() {
|
||||
return Ok(StreamGuard {
|
||||
pool: Arc::downgrade(self),
|
||||
stream: Some(entry.stream),
|
||||
can_reuse: true,
|
||||
permit,
|
||||
});
|
||||
}
|
||||
|
||||
// NB: make sure we don't overshoot the client limit. The semaphore limit is CLIENT_LIMIT *
|
||||
// STREAM_QUEUE_DEPTH, but if we were to misaccount queue depth we'd try to spin up more
|
||||
// streams than CLIENT_LIMIT and block on the client pool ~forever. This should not happen
|
||||
// because we only acquire queue depth under lock and after acquiring a semaphore permit.
|
||||
assert!(streams.len() <= CLIENT_LIMIT, "stream overflow");
|
||||
// Spin up a new stream. Uses a watch channel to send a single request at a time, since
|
||||
// `StreamGuard::send` enforces this anyway and it avoids unnecessary channel overhead.
|
||||
let mut client = self.client_pool.get().await?;
|
||||
|
||||
let client_pool = self.client_pool.clone();
|
||||
let streams = self.streams.clone();
|
||||
let (req_tx, req_rx) = watch::channel(page_api::GetPageRequest::default());
|
||||
let req_stream = WatchStream::from_changes(req_rx);
|
||||
let resp_stream = client.get_pages(req_stream).await?;
|
||||
|
||||
tokio::spawn(async move {
|
||||
if let Err(err) = Self::run_stream(client_pool, req_rx).await {
|
||||
error!("stream failed: {err}");
|
||||
}
|
||||
// Remove stream from pool on exit.
|
||||
let entry = streams.lock().unwrap().remove(&id);
|
||||
assert!(entry.is_some(), "unknown stream ID: {id}");
|
||||
});
|
||||
|
||||
StreamGuard {
|
||||
sender: req_tx,
|
||||
queue_depth,
|
||||
Ok(StreamGuard {
|
||||
pool: Arc::downgrade(self),
|
||||
stream: Some(BiStream {
|
||||
client,
|
||||
sender: req_tx,
|
||||
receiver: Box::pin(resp_stream),
|
||||
}),
|
||||
can_reuse: true,
|
||||
permit,
|
||||
}
|
||||
}
|
||||
|
||||
/// Runs a stream task. This acquires a client from the `ClientPool` and establishes a
|
||||
/// bidirectional GetPage stream, then forwards requests and responses between callers and the
|
||||
/// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be
|
||||
/// atomic with pool stream acquisition.
|
||||
///
|
||||
/// The task exits when the request channel is closed, or on a stream error. The caller is
|
||||
/// responsible for removing the stream from the pool on exit.
|
||||
async fn run_stream(
|
||||
client_pool: Arc<ClientPool>,
|
||||
mut caller_rx: RequestReceiver,
|
||||
) -> anyhow::Result<()> {
|
||||
// Acquire a client from the pool and create a stream.
|
||||
let mut client = client_pool.get().await?;
|
||||
|
||||
let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH);
|
||||
let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx);
|
||||
let mut resp_stream = client.get_pages(req_stream).await?;
|
||||
|
||||
// Track caller response channels by request ID. If the task returns early, these response
|
||||
// channels will be dropped and the waiting callers will receive an error.
|
||||
let mut callers = HashMap::with_capacity(STREAM_QUEUE_DEPTH);
|
||||
|
||||
// Process requests and responses.
|
||||
loop {
|
||||
// NB: this can trip if the server doesn't respond to a request, so only debug_assert.
|
||||
debug_assert!(callers.len() <= STREAM_QUEUE_DEPTH, "stream queue overflow");
|
||||
|
||||
tokio::select! {
|
||||
// Receive requests from callers and send them to the stream.
|
||||
req = caller_rx.recv() => {
|
||||
// Shut down if request channel is closed.
|
||||
let Some((req, resp_tx)) = req else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
// Store the response channel by request ID.
|
||||
if callers.contains_key(&req.request_id) {
|
||||
// Error on request ID duplicates. Ignore callers that went away.
|
||||
_ = resp_tx.send(Err(tonic::Status::invalid_argument(
|
||||
format!("duplicate request ID: {}", req.request_id),
|
||||
)));
|
||||
continue;
|
||||
}
|
||||
callers.insert(req.request_id, resp_tx);
|
||||
|
||||
// Send the request on the stream. Bail out if the send fails.
|
||||
req_tx.send(req).await.map_err(|_| {
|
||||
tonic::Status::unavailable("stream closed")
|
||||
})?;
|
||||
}
|
||||
|
||||
// Receive responses from the stream and send them to callers.
|
||||
resp = resp_stream.next() => {
|
||||
// Shut down if the stream is closed, and bail out on stream errors.
|
||||
let Some(resp) = resp.transpose()? else {
|
||||
return Ok(())
|
||||
};
|
||||
|
||||
// Send the response to the caller. Ignore errors if the caller went away.
|
||||
let Some(resp_tx) = callers.remove(&resp.request_id) else {
|
||||
warn!("received response for unknown request ID: {}", resp.request_id);
|
||||
continue;
|
||||
};
|
||||
_ = resp_tx.send(Ok(resp));
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// A pooled stream reference. Can be used to send a single request, to properly enforce queue
|
||||
/// depth. Queue depth is already reserved and will be returned on drop.
|
||||
impl Reapable for StreamPool {
|
||||
/// Reaps streams that have been idle since before the cutoff.
|
||||
fn reap_idle(&self, cutoff: Instant) {
|
||||
self.idle
|
||||
.lock()
|
||||
.unwrap()
|
||||
.retain(|_, entry| entry.idle_since >= cutoff);
|
||||
}
|
||||
}
|
||||
|
||||
/// A stream acquired from the pool. Returned to the pool when dropped, unless there are still
|
||||
/// in-flight requests on the stream, or the stream failed.
|
||||
pub struct StreamGuard {
|
||||
sender: RequestSender,
|
||||
queue_depth: Arc<AtomicUsize>,
|
||||
permit: OwnedSemaphorePermit,
|
||||
pool: Weak<StreamPool>,
|
||||
stream: Option<BiStream>, // Some until dropped
|
||||
can_reuse: bool, // returned to pool if true
|
||||
permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
|
||||
}
|
||||
|
||||
impl StreamGuard {
|
||||
/// Sends a request on the stream and awaits the response. Consumes the guard, since it's only
|
||||
/// valid for a single request (to enforce queue depth). This also drops the guard on return and
|
||||
/// returns the queue depth quota to the pool.
|
||||
/// Sends a request on the stream and awaits the response. If the future is dropped before it
|
||||
/// resolves (e.g. due to a timeout or cancellation), the stream will be closed to cancel the
|
||||
/// request and is not returned to the pool. The same is true if the stream errors, in which
|
||||
/// case the caller can't send further requests on the stream.
|
||||
///
|
||||
/// The `GetPageRequest::request_id` must be unique across in-flight requests.
|
||||
/// We only support sending a single request at a time, to eliminate head-of-line blocking. See
|
||||
/// module documentation for details.
|
||||
///
|
||||
/// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status`
|
||||
/// to avoid tearing down the stream for per-request errors. Callers must check this.
|
||||
pub async fn send(
|
||||
self,
|
||||
&mut self,
|
||||
req: page_api::GetPageRequest,
|
||||
) -> tonic::Result<page_api::GetPageResponse> {
|
||||
let (resp_tx, resp_rx) = oneshot::channel();
|
||||
let req_id = req.request_id;
|
||||
let stream = self.stream.as_mut().expect("not dropped");
|
||||
|
||||
self.sender
|
||||
.send((req, resp_tx))
|
||||
.await
|
||||
// Mark the stream as not reusable while the request is in flight. We can't return the
|
||||
// stream to the pool until we receive the response, to avoid head-of-line blocking and
|
||||
// stale responses. Failed streams can't be reused either.
|
||||
if !self.can_reuse {
|
||||
return Err(tonic::Status::internal("stream can't be reused"));
|
||||
}
|
||||
self.can_reuse = false;
|
||||
|
||||
// Send the request and receive the response.
|
||||
//
|
||||
// NB: this uses a watch channel, so it's unsafe to change this code to pipeline requests.
|
||||
stream
|
||||
.sender
|
||||
.send(req)
|
||||
.map_err(|_| tonic::Status::unavailable("stream closed"))?;
|
||||
|
||||
resp_rx
|
||||
let resp = stream
|
||||
.receiver
|
||||
.next()
|
||||
.await
|
||||
.map_err(|_| tonic::Status::unavailable("stream closed"))?
|
||||
.ok_or_else(|| tonic::Status::unavailable("stream closed"))??;
|
||||
|
||||
if resp.request_id != req_id {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"response ID {} does not match request ID {}",
|
||||
resp.request_id, req_id
|
||||
)));
|
||||
}
|
||||
|
||||
// Success, mark the stream as reusable.
|
||||
self.can_reuse = true;
|
||||
|
||||
Ok(resp)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for StreamGuard {
|
||||
fn drop(&mut self) {
|
||||
// Release the queue depth reservation on drop. This can prematurely decrement it if dropped
|
||||
// before the response is received, but that's okay.
|
||||
let prev_queue_depth = self.queue_depth.fetch_sub(1, Ordering::SeqCst);
|
||||
assert!(prev_queue_depth > 0, "stream queue underflow");
|
||||
let Some(pool) = self.pool.upgrade() else {
|
||||
return; // pool was dropped
|
||||
};
|
||||
|
||||
// If the stream isn't reusable, it can't be returned to the pool.
|
||||
if !self.can_reuse {
|
||||
return;
|
||||
}
|
||||
|
||||
// Place the idle stream back into the pool.
|
||||
let entry = StreamEntry {
|
||||
stream: self.stream.take().expect("dropped once"),
|
||||
idle_since: Instant::now(),
|
||||
};
|
||||
pool.idle
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(entry.stream.client.id, entry);
|
||||
|
||||
_ = self.permit; // returned on drop, referenced for visibility
|
||||
}
|
||||
}
|
||||
|
||||
/// Periodically reaps idle resources from a pool.
|
||||
struct Reaper {
|
||||
/// The task check interval.
|
||||
interval: Duration,
|
||||
/// The threshold for reaping idle resources.
|
||||
threshold: Duration,
|
||||
/// Cancels the reaper task. Cancelled when the reaper is dropped.
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl Reaper {
|
||||
/// Creates a new reaper.
|
||||
pub fn new(threshold: Duration, interval: Duration) -> Self {
|
||||
Self {
|
||||
cancel: CancellationToken::new(),
|
||||
threshold,
|
||||
interval,
|
||||
}
|
||||
}
|
||||
|
||||
/// Spawns a task to periodically reap idle resources from the given task pool. The task is
|
||||
/// cancelled when the reaper is dropped.
|
||||
pub fn spawn(&self, pool: &Arc<impl Reapable>) {
|
||||
// NB: hold a weak pool reference, otherwise the task will prevent dropping the pool.
|
||||
let pool = Arc::downgrade(pool);
|
||||
let cancel = self.cancel.clone();
|
||||
let (interval, threshold) = (self.interval, self.threshold);
|
||||
|
||||
tokio::spawn(async move {
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(interval) => {
|
||||
let Some(pool) = pool.upgrade() else {
|
||||
return; // pool was dropped
|
||||
};
|
||||
pool.reap_idle(Instant::now() - threshold);
|
||||
}
|
||||
|
||||
_ = cancel.cancelled() => return,
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Reaper {
|
||||
fn drop(&mut self) {
|
||||
self.cancel.cancel(); // cancel reaper task
|
||||
}
|
||||
}
|
||||
|
||||
/// A reapable resource pool.
|
||||
trait Reapable: Send + Sync + 'static {
|
||||
/// Reaps resources that have been idle since before the given cutoff.
|
||||
fn reap_idle(&self, cutoff: Instant);
|
||||
}
|
||||
|
||||
150
pageserver/client_grpc/src/retry.rs
Normal file
150
pageserver/client_grpc/src/retry.rs
Normal file
@@ -0,0 +1,150 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use futures::future::pending;
|
||||
use tokio::time::Instant;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use utils::backoff::exponential_backoff_duration;
|
||||
|
||||
/// A retry handler for Pageserver gRPC requests.
|
||||
///
|
||||
/// This is used instead of backoff::retry for better control and observability.
|
||||
pub struct Retry {
|
||||
/// Timeout across all retry attempts. If None, retries forever.
|
||||
pub timeout: Option<Duration>,
|
||||
/// The initial backoff duration. The first retry does not use a backoff.
|
||||
pub base_backoff: Duration,
|
||||
/// The maximum backoff duration.
|
||||
pub max_backoff: Duration,
|
||||
}
|
||||
|
||||
impl Retry {
|
||||
/// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
|
||||
/// using the current tracing span for context.
|
||||
///
|
||||
/// Only certain gRPC status codes are retried, see [`Self::should_retry`].
|
||||
pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
|
||||
where
|
||||
F: FnMut(usize) -> O, // pass attempt number, starting at 0
|
||||
O: Future<Output = tonic::Result<T>>,
|
||||
{
|
||||
let started = Instant::now();
|
||||
let deadline = self.timeout.map(|timeout| started + timeout);
|
||||
let mut last_error = None;
|
||||
let mut retries = 0;
|
||||
loop {
|
||||
// Set up a future to wait for the backoff, if any, and run the closure.
|
||||
let backoff_and_try = async {
|
||||
// NB: sleep() always sleeps 1ms, even when given a 0 argument. See:
|
||||
// https://github.com/tokio-rs/tokio/issues/6866
|
||||
if let Some(backoff) = self.backoff_duration(retries) {
|
||||
tokio::time::sleep(backoff).await;
|
||||
}
|
||||
|
||||
f(retries).await
|
||||
};
|
||||
|
||||
// Set up a future for the timeout, if any.
|
||||
let timeout = async {
|
||||
match deadline {
|
||||
Some(deadline) => tokio::time::sleep_until(deadline).await,
|
||||
None => pending().await,
|
||||
}
|
||||
};
|
||||
|
||||
// Wait for the backoff and request, or bail out if the timeout is exceeded.
|
||||
let result = tokio::select! {
|
||||
result = backoff_and_try => result,
|
||||
|
||||
_ = timeout => {
|
||||
let last_error = last_error.unwrap_or_else(|| {
|
||||
tonic::Status::deadline_exceeded(format!(
|
||||
"request timed out after {:.3}s",
|
||||
started.elapsed().as_secs_f64()
|
||||
))
|
||||
});
|
||||
error!(
|
||||
"giving up after {:.3}s and {retries} retries, last error {:?}: {}",
|
||||
started.elapsed().as_secs_f64(), last_error.code(), last_error.message(),
|
||||
);
|
||||
return Err(last_error);
|
||||
}
|
||||
};
|
||||
|
||||
match result {
|
||||
// Success, return the result.
|
||||
Ok(result) => {
|
||||
if retries > 0 {
|
||||
info!(
|
||||
"request succeeded after {retries} retries in {:.3}s",
|
||||
started.elapsed().as_secs_f64(),
|
||||
);
|
||||
}
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
// Error, retry or bail out.
|
||||
Err(status) => {
|
||||
let (code, message) = (status.code(), status.message());
|
||||
let attempt = retries + 1;
|
||||
|
||||
if !Self::should_retry(code) {
|
||||
// NB: include the attempt here too. This isn't necessarily the first
|
||||
// attempt, because the error may change between attempts.
|
||||
error!(
|
||||
"request failed with {code:?}: {message}, not retrying (attempt {attempt})"
|
||||
);
|
||||
return Err(status);
|
||||
}
|
||||
|
||||
warn!("request failed with {code:?}: {message}, retrying (attempt {attempt})");
|
||||
|
||||
retries += 1;
|
||||
last_error = Some(status);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the backoff duration for the given retry attempt, or None for no backoff. The first
|
||||
/// attempt and first retry never backs off, so this returns None for 0 and 1 retries.
|
||||
fn backoff_duration(&self, retries: usize) -> Option<Duration> {
|
||||
let backoff = exponential_backoff_duration(
|
||||
(retries as u32).saturating_sub(1), // first retry does not back off
|
||||
self.base_backoff.as_secs_f64(),
|
||||
self.max_backoff.as_secs_f64(),
|
||||
);
|
||||
(!backoff.is_zero()).then_some(backoff)
|
||||
}
|
||||
|
||||
/// Returns true if the given status code should be retries.
|
||||
fn should_retry(code: tonic::Code) -> bool {
|
||||
match code {
|
||||
tonic::Code::Ok => panic!("unexpected Ok status code"),
|
||||
|
||||
// These codes are transient, so retry them.
|
||||
tonic::Code::Aborted => true,
|
||||
tonic::Code::Cancelled => true,
|
||||
tonic::Code::DeadlineExceeded => true, // maybe transient slowness
|
||||
tonic::Code::ResourceExhausted => true,
|
||||
tonic::Code::Unavailable => true,
|
||||
|
||||
// The following codes will like continue to fail, so don't retry.
|
||||
tonic::Code::AlreadyExists => false,
|
||||
tonic::Code::DataLoss => false,
|
||||
tonic::Code::FailedPrecondition => false,
|
||||
// NB: don't retry Internal. It is intended for serious errors such as invariant
|
||||
// violations, and is also used for client-side invariant checks that would otherwise
|
||||
// result in retry loops.
|
||||
tonic::Code::Internal => false,
|
||||
tonic::Code::InvalidArgument => false,
|
||||
tonic::Code::NotFound => false,
|
||||
tonic::Code::OutOfRange => false,
|
||||
tonic::Code::PermissionDenied => false,
|
||||
tonic::Code::Unauthenticated => false,
|
||||
tonic::Code::Unimplemented => false,
|
||||
tonic::Code::Unknown => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
209
pageserver/client_grpc/src/split.rs
Normal file
209
pageserver/client_grpc/src/split.rs
Normal file
@@ -0,0 +1,209 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use bytes::Bytes;
|
||||
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::shard::{ShardStripeSize, key_to_shard_number};
|
||||
use pageserver_page_api as page_api;
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
|
||||
/// Splits GetPageRequests that straddle shard boundaries and assembles the responses.
|
||||
/// TODO: add tests for this.
|
||||
pub struct GetPageSplitter {
|
||||
/// Split requests by shard index.
|
||||
requests: HashMap<ShardIndex, page_api::GetPageRequest>,
|
||||
/// The response being assembled. Preallocated with empty pages, to be filled in.
|
||||
response: page_api::GetPageResponse,
|
||||
/// Maps the offset in `request.block_numbers` and `response.pages` to the owning shard. Used
|
||||
/// to assemble the response pages in the same order as the original request.
|
||||
block_shards: Vec<ShardIndex>,
|
||||
}
|
||||
|
||||
impl GetPageSplitter {
|
||||
/// Checks if the given request only touches a single shard, and returns the shard ID. This is
|
||||
/// the common case, so we check first in order to avoid unnecessary allocations and overhead.
|
||||
pub fn for_single_shard(
|
||||
req: &page_api::GetPageRequest,
|
||||
count: ShardCount,
|
||||
stripe_size: ShardStripeSize,
|
||||
) -> Option<ShardIndex> {
|
||||
// Fast path: unsharded tenant.
|
||||
if count.is_unsharded() {
|
||||
return Some(ShardIndex::unsharded());
|
||||
}
|
||||
|
||||
// Find the first page's shard, for comparison. If there are no pages, just return the first
|
||||
// shard (caller likely checked already, otherwise the server will reject it).
|
||||
let Some(&first_page) = req.block_numbers.first() else {
|
||||
return Some(ShardIndex::new(ShardNumber(0), count));
|
||||
};
|
||||
let key = rel_block_to_key(req.rel, first_page);
|
||||
let shard_number = key_to_shard_number(count, stripe_size, &key);
|
||||
|
||||
req.block_numbers
|
||||
.iter()
|
||||
.skip(1) // computed above
|
||||
.all(|&blkno| {
|
||||
let key = rel_block_to_key(req.rel, blkno);
|
||||
key_to_shard_number(count, stripe_size, &key) == shard_number
|
||||
})
|
||||
.then_some(ShardIndex::new(shard_number, count))
|
||||
}
|
||||
|
||||
/// Splits the given request.
|
||||
pub fn split(
|
||||
req: page_api::GetPageRequest,
|
||||
count: ShardCount,
|
||||
stripe_size: ShardStripeSize,
|
||||
) -> Self {
|
||||
// The caller should make sure we don't split requests unnecessarily.
|
||||
debug_assert!(
|
||||
Self::for_single_shard(&req, count, stripe_size).is_none(),
|
||||
"unnecessary request split"
|
||||
);
|
||||
|
||||
// Split the requests by shard index.
|
||||
let mut requests = HashMap::with_capacity(2); // common case
|
||||
let mut block_shards = Vec::with_capacity(req.block_numbers.len());
|
||||
for &blkno in &req.block_numbers {
|
||||
let key = rel_block_to_key(req.rel, blkno);
|
||||
let shard_number = key_to_shard_number(count, stripe_size, &key);
|
||||
let shard_id = ShardIndex::new(shard_number, count);
|
||||
|
||||
requests
|
||||
.entry(shard_id)
|
||||
.or_insert_with(|| page_api::GetPageRequest {
|
||||
request_id: req.request_id,
|
||||
request_class: req.request_class,
|
||||
rel: req.rel,
|
||||
read_lsn: req.read_lsn,
|
||||
block_numbers: Vec::new(),
|
||||
})
|
||||
.block_numbers
|
||||
.push(blkno);
|
||||
block_shards.push(shard_id);
|
||||
}
|
||||
|
||||
// Construct a response to be populated by shard responses. Preallocate empty page slots
|
||||
// with the expected block numbers.
|
||||
let response = page_api::GetPageResponse {
|
||||
request_id: req.request_id,
|
||||
status_code: page_api::GetPageStatusCode::Ok,
|
||||
reason: None,
|
||||
rel: req.rel,
|
||||
pages: req
|
||||
.block_numbers
|
||||
.into_iter()
|
||||
.map(|block_number| {
|
||||
page_api::Page {
|
||||
block_number,
|
||||
image: Bytes::new(), // empty page slot to be filled in
|
||||
}
|
||||
})
|
||||
.collect(),
|
||||
};
|
||||
|
||||
Self {
|
||||
requests,
|
||||
response,
|
||||
block_shards,
|
||||
}
|
||||
}
|
||||
|
||||
/// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations.
|
||||
pub fn drain_requests(
|
||||
&mut self,
|
||||
) -> impl Iterator<Item = (ShardIndex, page_api::GetPageRequest)> {
|
||||
self.requests.drain()
|
||||
}
|
||||
|
||||
/// Adds a response from the given shard. The response must match the request ID and have an OK
|
||||
/// status code. A response must not already exist for the given shard ID.
|
||||
#[allow(clippy::result_large_err)]
|
||||
pub fn add_response(
|
||||
&mut self,
|
||||
shard_id: ShardIndex,
|
||||
response: page_api::GetPageResponse,
|
||||
) -> tonic::Result<()> {
|
||||
// The caller should already have converted status codes into tonic::Status.
|
||||
if response.status_code != page_api::GetPageStatusCode::Ok {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"unexpected non-OK response for shard {shard_id}: {} {}",
|
||||
response.status_code,
|
||||
response.reason.unwrap_or_default()
|
||||
)));
|
||||
}
|
||||
|
||||
if response.request_id != self.response.request_id {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"response ID mismatch for shard {shard_id}: expected {}, got {}",
|
||||
self.response.request_id, response.request_id
|
||||
)));
|
||||
}
|
||||
|
||||
// Place the shard response pages into the assembled response, in request order.
|
||||
let mut pages = response.pages.into_iter();
|
||||
|
||||
for (i, &s) in self.block_shards.iter().enumerate() {
|
||||
if shard_id != s {
|
||||
continue;
|
||||
}
|
||||
|
||||
let Some(slot) = self.response.pages.get_mut(i) else {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"no block_shards slot {i} for shard {shard_id}"
|
||||
)));
|
||||
};
|
||||
let Some(page) = pages.next() else {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"missing page {} in shard {shard_id} response",
|
||||
slot.block_number
|
||||
)));
|
||||
};
|
||||
if page.block_number != slot.block_number {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"shard {shard_id} returned wrong page at index {i}, expected {} got {}",
|
||||
slot.block_number, page.block_number
|
||||
)));
|
||||
}
|
||||
if !slot.image.is_empty() {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"shard {shard_id} returned duplicate page {} at index {i}",
|
||||
slot.block_number
|
||||
)));
|
||||
}
|
||||
|
||||
*slot = page;
|
||||
}
|
||||
|
||||
// Make sure we've consumed all pages from the shard response.
|
||||
if let Some(extra_page) = pages.next() {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"shard {shard_id} returned extra page: {}",
|
||||
extra_page.block_number
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Fetches the final, assembled response.
|
||||
#[allow(clippy::result_large_err)]
|
||||
pub fn get_response(self) -> tonic::Result<page_api::GetPageResponse> {
|
||||
// Check that the response is complete.
|
||||
for (i, page) in self.response.pages.iter().enumerate() {
|
||||
if page.image.is_empty() {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"missing page {} for shard {}",
|
||||
page.block_number,
|
||||
self.block_shards
|
||||
.get(i)
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| "?".to_string())
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(self.response)
|
||||
}
|
||||
}
|
||||
@@ -17,6 +17,7 @@ pageserver = { path = ".." }
|
||||
pageserver_api.workspace = true
|
||||
remote_storage = { path = "../../libs/remote_storage" }
|
||||
postgres_ffi.workspace = true
|
||||
serde.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
|
||||
85
pageserver/ctl/src/download_remote_object.rs
Normal file
85
pageserver/ctl/src/download_remote_object.rs
Normal file
@@ -0,0 +1,85 @@
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
/// Download a specific object from remote storage to a local file.
|
||||
///
|
||||
/// The remote storage configuration is supplied via the `REMOTE_STORAGE_CONFIG` environment
|
||||
/// variable, in the same TOML format that the pageserver itself understands. This allows the
|
||||
/// command to work with any cloud supported by the `remote_storage` crate (currently AWS S3,
|
||||
/// Azure Blob Storage and local files), as long as the credentials are available via the
|
||||
/// standard environment variables expected by the underlying SDKs.
|
||||
///
|
||||
/// Examples for setting the environment variable:
|
||||
///
|
||||
/// ```bash
|
||||
/// # AWS S3 (region can also be provided via AWS_REGION)
|
||||
/// export REMOTE_STORAGE_CONFIG='remote_storage = { bucket_name = "my-bucket", bucket_region = "us-east-2" }'
|
||||
///
|
||||
/// # Azure Blob Storage (account key picked up from AZURE_STORAGE_ACCOUNT_KEY)
|
||||
/// export REMOTE_STORAGE_CONFIG='remote_storage = { container = "my-container", account = "my-account" }'
|
||||
/// ```
|
||||
#[derive(Parser)]
|
||||
pub(crate) struct DownloadRemoteObjectCmd {
|
||||
/// Key / path of the object to download (relative to the remote storage prefix).
|
||||
///
|
||||
/// Examples:
|
||||
/// "wal/3aa8f.../00000001000000000000000A"
|
||||
/// "pageserver/v1/tenants/<tenant_id>/timelines/<timeline_id>/layer_12345"
|
||||
pub remote_path: String,
|
||||
|
||||
/// Path of the local file to create. Existing file will be overwritten.
|
||||
///
|
||||
/// Examples:
|
||||
/// "./segment"
|
||||
/// "/tmp/layer_12345.parquet"
|
||||
pub output_file: Utf8PathBuf,
|
||||
}
|
||||
|
||||
pub(crate) async fn main(cmd: &DownloadRemoteObjectCmd) -> anyhow::Result<()> {
|
||||
use remote_storage::{DownloadOpts, GenericRemoteStorage, RemotePath, RemoteStorageConfig};
|
||||
|
||||
// Fetch remote storage configuration from the environment
|
||||
let config_str = std::env::var("REMOTE_STORAGE_CONFIG").map_err(|_| {
|
||||
anyhow::anyhow!(
|
||||
"'REMOTE_STORAGE_CONFIG' environment variable must be set to a valid remote storage TOML config"
|
||||
)
|
||||
})?;
|
||||
|
||||
let config = RemoteStorageConfig::from_toml_str(&config_str)?;
|
||||
|
||||
// Initialise remote storage client
|
||||
let storage = GenericRemoteStorage::from_config(&config).await?;
|
||||
|
||||
// RemotePath must be relative – leading slashes confuse the parser.
|
||||
let remote_path_str = cmd.remote_path.trim_start_matches('/');
|
||||
let remote_path = RemotePath::from_string(remote_path_str)?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
println!(
|
||||
"Downloading '{remote_path}' from remote storage bucket {:?} ...",
|
||||
config.storage.bucket_name()
|
||||
);
|
||||
|
||||
// Start the actual download
|
||||
let download = storage
|
||||
.download(&remote_path, &DownloadOpts::default(), &cancel)
|
||||
.await?;
|
||||
|
||||
// Stream to file
|
||||
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let tmp_path = cmd.output_file.with_extension("tmp");
|
||||
let mut file = tokio::fs::File::create(&tmp_path).await?;
|
||||
tokio::io::copy(&mut reader, &mut file).await?;
|
||||
file.sync_all().await?;
|
||||
// Atomically move into place
|
||||
tokio::fs::rename(&tmp_path, &cmd.output_file).await?;
|
||||
|
||||
println!(
|
||||
"Downloaded to '{}'. Last modified: {:?}, etag: {}",
|
||||
cmd.output_file, download.last_modified, download.etag
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,14 +1,16 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::{Context, Ok};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver::tenant::{
|
||||
IndexPart,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
remote_timeline_client::remote_layer_path,
|
||||
storage_layer::{PersistentLayerDesc, ReadableLayerWeak},
|
||||
remote_timeline_client::{index::LayerFileMetadata, remote_layer_path},
|
||||
storage_layer::{LayerName, LayerVisibilityHint, PersistentLayerDesc, ReadableLayerWeak},
|
||||
};
|
||||
use pageserver_api::key::Key;
|
||||
use serde::Serialize;
|
||||
use std::collections::BTreeMap;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
@@ -33,6 +35,31 @@ pub(crate) enum IndexPartCmd {
|
||||
#[arg(long)]
|
||||
lsn: String,
|
||||
},
|
||||
/// List all visible delta and image layers at the latest LSN.
|
||||
ListVisibleLayers {
|
||||
#[arg(long)]
|
||||
path: Utf8PathBuf,
|
||||
},
|
||||
}
|
||||
|
||||
fn create_layer_map_from_index_part(
|
||||
index_part: &IndexPart,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> LayerMap {
|
||||
let mut layer_map = LayerMap::default();
|
||||
{
|
||||
let mut updates = layer_map.batch_update();
|
||||
for (key, value) in index_part.layer_metadata.iter() {
|
||||
updates.insert_historic(PersistentLayerDesc::from_filename(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
key.clone(),
|
||||
value.file_size,
|
||||
));
|
||||
}
|
||||
}
|
||||
layer_map
|
||||
}
|
||||
|
||||
async fn search_layers(
|
||||
@@ -49,18 +76,7 @@ async fn search_layers(
|
||||
let bytes = tokio::fs::read(path).await?;
|
||||
IndexPart::from_json_bytes(&bytes).unwrap()
|
||||
};
|
||||
let mut layer_map = LayerMap::default();
|
||||
{
|
||||
let mut updates = layer_map.batch_update();
|
||||
for (key, value) in index_json.layer_metadata.iter() {
|
||||
updates.insert_historic(PersistentLayerDesc::from_filename(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
key.clone(),
|
||||
value.file_size,
|
||||
));
|
||||
}
|
||||
}
|
||||
let layer_map = create_layer_map_from_index_part(&index_json, tenant_shard_id, timeline_id);
|
||||
let key = Key::from_hex(key)?;
|
||||
|
||||
let lsn = Lsn::from_str(lsn).unwrap();
|
||||
@@ -98,6 +114,69 @@ async fn search_layers(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
struct VisibleLayers {
|
||||
pub total_images: u64,
|
||||
pub total_image_bytes: u64,
|
||||
pub total_deltas: u64,
|
||||
pub total_delta_bytes: u64,
|
||||
pub layer_metadata: BTreeMap<LayerName, LayerFileMetadata>,
|
||||
}
|
||||
|
||||
impl VisibleLayers {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
layer_metadata: BTreeMap::new(),
|
||||
total_images: 0,
|
||||
total_image_bytes: 0,
|
||||
total_deltas: 0,
|
||||
total_delta_bytes: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_layer(&mut self, name: LayerName, layer: LayerFileMetadata) {
|
||||
match name {
|
||||
LayerName::Image(_) => {
|
||||
self.total_images += 1;
|
||||
self.total_image_bytes += layer.file_size;
|
||||
}
|
||||
LayerName::Delta(_) => {
|
||||
self.total_deltas += 1;
|
||||
self.total_delta_bytes += layer.file_size;
|
||||
}
|
||||
}
|
||||
self.layer_metadata.insert(name, layer);
|
||||
}
|
||||
}
|
||||
|
||||
async fn list_visible_layers(path: &Utf8PathBuf) -> anyhow::Result<()> {
|
||||
let tenant_id = TenantId::generate();
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let bytes = tokio::fs::read(path).await.context("read file")?;
|
||||
let index_part = IndexPart::from_json_bytes(&bytes).context("deserialize")?;
|
||||
let layer_map = create_layer_map_from_index_part(&index_part, tenant_shard_id, timeline_id);
|
||||
let mut visible_layers = VisibleLayers::new();
|
||||
let (layers, _key_space) = layer_map.get_visibility(Vec::new());
|
||||
for (layer, visibility) in layers {
|
||||
if visibility == LayerVisibilityHint::Visible {
|
||||
visible_layers.add_layer(
|
||||
layer.layer_name(),
|
||||
index_part
|
||||
.layer_metadata
|
||||
.get(&layer.layer_name())
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
}
|
||||
let output = serde_json::to_string_pretty(&visible_layers).context("serialize output")?;
|
||||
println!("{output}");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
|
||||
match cmd {
|
||||
IndexPartCmd::Dump { path } => {
|
||||
@@ -114,5 +193,6 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
|
||||
key,
|
||||
lsn,
|
||||
} => search_layers(tenant_id, timeline_id, path, key, lsn).await,
|
||||
IndexPartCmd::ListVisibleLayers { path } => list_visible_layers(path).await,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
//!
|
||||
//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
|
||||
|
||||
mod download_remote_object;
|
||||
mod draw_timeline_dir;
|
||||
mod index_part;
|
||||
mod key;
|
||||
@@ -16,6 +17,7 @@ use std::time::{Duration, SystemTime};
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::{Parser, Subcommand};
|
||||
use download_remote_object::DownloadRemoteObjectCmd;
|
||||
use index_part::IndexPartCmd;
|
||||
use layers::LayerCmd;
|
||||
use page_trace::PageTraceCmd;
|
||||
@@ -63,6 +65,7 @@ enum Commands {
|
||||
/// Debug print a hex key found from logs
|
||||
Key(key::DescribeKeyCommand),
|
||||
PageTrace(PageTraceCmd),
|
||||
DownloadRemoteObject(DownloadRemoteObjectCmd),
|
||||
}
|
||||
|
||||
/// Read and update pageserver metadata file
|
||||
@@ -185,6 +188,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
Commands::Key(dkc) => dkc.execute(),
|
||||
Commands::PageTrace(cmd) => page_trace::main(&cmd)?,
|
||||
Commands::DownloadRemoteObject(cmd) => {
|
||||
download_remote_object::main(&cmd).await?;
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -153,7 +153,7 @@ message GetDbSizeResponse {
|
||||
message GetPageRequest {
|
||||
// A request ID. Will be included in the response. Should be unique for
|
||||
// in-flight requests on the stream.
|
||||
uint64 request_id = 1;
|
||||
RequestID request_id = 1;
|
||||
// The request class.
|
||||
GetPageClass request_class = 2;
|
||||
// The LSN to read at.
|
||||
@@ -177,6 +177,14 @@ message GetPageRequest {
|
||||
repeated uint32 block_number = 5;
|
||||
}
|
||||
|
||||
// A Request ID. Should be unique for in-flight requests on a stream. Included in the response.
|
||||
message RequestID {
|
||||
// The base request ID.
|
||||
uint64 id = 1;
|
||||
// The request attempt. Starts at 0, incremented on each retry.
|
||||
uint32 attempt = 2;
|
||||
}
|
||||
|
||||
// A GetPageRequest class. Primarily intended for observability, but may also be
|
||||
// used for prioritization in the future.
|
||||
enum GetPageClass {
|
||||
@@ -199,13 +207,26 @@ enum GetPageClass {
|
||||
// the entire batch is ready, so no one can make use of the individual pages.
|
||||
message GetPageResponse {
|
||||
// The original request's ID.
|
||||
uint64 request_id = 1;
|
||||
// The response status code.
|
||||
RequestID request_id = 1;
|
||||
// The response status code. If not OK, the rel and page fields will be empty.
|
||||
GetPageStatusCode status_code = 2;
|
||||
// A string describing the status, if any.
|
||||
string reason = 3;
|
||||
// The 8KB page images, in the same order as the request. Empty if status_code != OK.
|
||||
repeated bytes page_image = 4;
|
||||
// The relation that the pages belong to.
|
||||
RelTag rel = 4;
|
||||
// The page(s), in the same order as the request.
|
||||
repeated Page page = 5;
|
||||
}
|
||||
|
||||
// A page.
|
||||
//
|
||||
// TODO: it would be slightly more efficient (but less convenient) to have separate arrays of block
|
||||
// numbers and images, but given the 8KB page size it's probably negligible. Benchmark it anyway.
|
||||
message Page {
|
||||
// The page number.
|
||||
uint32 block_number = 1;
|
||||
// The materialized page image, as an 8KB byte vector.
|
||||
bytes image = 2;
|
||||
}
|
||||
|
||||
// A GetPageResponse status code.
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use anyhow::Context as _;
|
||||
use futures::future::ready;
|
||||
use futures::{Stream, StreamExt as _, TryStreamExt as _};
|
||||
use tokio::io::AsyncRead;
|
||||
use tokio_util::io::StreamReader;
|
||||
@@ -110,7 +111,7 @@ impl Client {
|
||||
) -> tonic::Result<impl Stream<Item = tonic::Result<GetPageResponse>> + Send + 'static> {
|
||||
let reqs = reqs.map(proto::GetPageRequest::from);
|
||||
let resps = self.inner.get_pages(reqs).await?.into_inner();
|
||||
Ok(resps.map_ok(GetPageResponse::from))
|
||||
Ok(resps.and_then(|resp| ready(GetPageResponse::try_from(resp).map_err(|err| err.into()))))
|
||||
}
|
||||
|
||||
/// Returns the size of a relation, as # of blocks.
|
||||
|
||||
@@ -49,7 +49,7 @@ impl From<ProtocolError> for tonic::Status {
|
||||
}
|
||||
|
||||
/// The LSN a request should read at.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct ReadLsn {
|
||||
/// The request's read LSN.
|
||||
pub request_lsn: Lsn,
|
||||
@@ -329,7 +329,7 @@ impl From<GetDbSizeResponse> for proto::GetDbSizeResponse {
|
||||
}
|
||||
|
||||
/// Requests one or more pages.
|
||||
#[derive(Clone, Debug)]
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct GetPageRequest {
|
||||
/// A request ID. Will be included in the response. Should be unique for in-flight requests on
|
||||
/// the stream.
|
||||
@@ -356,7 +356,10 @@ impl TryFrom<proto::GetPageRequest> for GetPageRequest {
|
||||
return Err(ProtocolError::Missing("block_number"));
|
||||
}
|
||||
Ok(Self {
|
||||
request_id: pb.request_id,
|
||||
request_id: pb
|
||||
.request_id
|
||||
.ok_or(ProtocolError::Missing("request_id"))?
|
||||
.into(),
|
||||
request_class: pb.request_class.into(),
|
||||
read_lsn: pb
|
||||
.read_lsn
|
||||
@@ -371,7 +374,7 @@ impl TryFrom<proto::GetPageRequest> for GetPageRequest {
|
||||
impl From<GetPageRequest> for proto::GetPageRequest {
|
||||
fn from(request: GetPageRequest) -> Self {
|
||||
Self {
|
||||
request_id: request.request_id,
|
||||
request_id: Some(request.request_id.into()),
|
||||
request_class: request.request_class.into(),
|
||||
read_lsn: Some(request.read_lsn.into()),
|
||||
rel: Some(request.rel.into()),
|
||||
@@ -380,16 +383,60 @@ impl From<GetPageRequest> for proto::GetPageRequest {
|
||||
}
|
||||
}
|
||||
|
||||
/// A GetPage request ID.
|
||||
pub type RequestID = u64;
|
||||
/// A GetPage request ID and retry attempt. Should be unique for in-flight requests on a stream.
|
||||
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord)]
|
||||
pub struct RequestID {
|
||||
/// The base request ID.
|
||||
pub id: u64,
|
||||
// The request attempt. Starts at 0, incremented on each retry.
|
||||
pub attempt: u32,
|
||||
}
|
||||
|
||||
impl RequestID {
|
||||
/// Creates a new RequestID with the given ID and an initial attempt of 0.
|
||||
pub fn new(id: u64) -> Self {
|
||||
Self { id, attempt: 0 }
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for RequestID {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}.{}", self.id, self.attempt)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<proto::RequestId> for RequestID {
|
||||
fn from(pb: proto::RequestId) -> Self {
|
||||
Self {
|
||||
id: pb.id,
|
||||
attempt: pb.attempt,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u64> for RequestID {
|
||||
fn from(id: u64) -> Self {
|
||||
Self::new(id)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<RequestID> for proto::RequestId {
|
||||
fn from(request_id: RequestID) -> Self {
|
||||
Self {
|
||||
id: request_id.id,
|
||||
attempt: request_id.attempt,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A GetPage request class.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
#[derive(Clone, Copy, Debug, Default, strum_macros::Display)]
|
||||
pub enum GetPageClass {
|
||||
/// Unknown class. For backwards compatibility: used when an older client version sends a class
|
||||
/// that a newer server version has removed.
|
||||
Unknown,
|
||||
/// A normal request. This is the default.
|
||||
#[default]
|
||||
Normal,
|
||||
/// A prefetch request. NB: can only be classified on pg < 18.
|
||||
Prefetch,
|
||||
@@ -443,32 +490,41 @@ impl From<GetPageClass> for i32 {
|
||||
pub struct GetPageResponse {
|
||||
/// The original request's ID.
|
||||
pub request_id: RequestID,
|
||||
/// The response status code.
|
||||
/// The response status code. If not OK, the `rel` and `pages` fields will be empty.
|
||||
pub status_code: GetPageStatusCode,
|
||||
/// A string describing the status, if any.
|
||||
pub reason: Option<String>,
|
||||
/// The 8KB page images, in the same order as the request. Empty if status != OK.
|
||||
pub page_images: Vec<Bytes>,
|
||||
/// The relation that the pages belong to.
|
||||
pub rel: RelTag,
|
||||
// The page(s), in the same order as the request.
|
||||
pub pages: Vec<Page>,
|
||||
}
|
||||
|
||||
impl From<proto::GetPageResponse> for GetPageResponse {
|
||||
fn from(pb: proto::GetPageResponse) -> Self {
|
||||
Self {
|
||||
request_id: pb.request_id,
|
||||
impl TryFrom<proto::GetPageResponse> for GetPageResponse {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(pb: proto::GetPageResponse) -> Result<Self, ProtocolError> {
|
||||
Ok(Self {
|
||||
request_id: pb
|
||||
.request_id
|
||||
.ok_or(ProtocolError::Missing("request_id"))?
|
||||
.into(),
|
||||
status_code: pb.status_code.into(),
|
||||
reason: Some(pb.reason).filter(|r| !r.is_empty()),
|
||||
page_images: pb.page_image,
|
||||
}
|
||||
rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?,
|
||||
pages: pb.page.into_iter().map(Page::from).collect(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetPageResponse> for proto::GetPageResponse {
|
||||
fn from(response: GetPageResponse) -> Self {
|
||||
Self {
|
||||
request_id: response.request_id,
|
||||
request_id: Some(response.request_id.into()),
|
||||
status_code: response.status_code.into(),
|
||||
reason: response.reason.unwrap_or_default(),
|
||||
page_image: response.page_images,
|
||||
rel: Some(response.rel.into()),
|
||||
page: response.pages.into_iter().map(proto::Page::from).collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -501,11 +557,39 @@ impl GetPageResponse {
|
||||
request_id,
|
||||
status_code,
|
||||
reason: Some(status.message().to_string()),
|
||||
page_images: Vec::new(),
|
||||
rel: RelTag::default(),
|
||||
pages: Vec::new(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// A page.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Page {
|
||||
/// The page number.
|
||||
pub block_number: u32,
|
||||
/// The materialized page image, as an 8KB byte vector.
|
||||
pub image: Bytes,
|
||||
}
|
||||
|
||||
impl From<proto::Page> for Page {
|
||||
fn from(pb: proto::Page) -> Self {
|
||||
Self {
|
||||
block_number: pb.block_number,
|
||||
image: pb.image,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Page> for proto::Page {
|
||||
fn from(page: Page) -> Self {
|
||||
Self {
|
||||
block_number: page.block_number,
|
||||
image: page.image,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A GetPage response status code.
|
||||
///
|
||||
/// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream
|
||||
@@ -602,6 +686,21 @@ impl TryFrom<tonic::Code> for GetPageStatusCode {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetPageStatusCode> for tonic::Code {
|
||||
fn from(status_code: GetPageStatusCode) -> Self {
|
||||
use tonic::Code;
|
||||
|
||||
match status_code {
|
||||
GetPageStatusCode::Unknown => Code::Unknown,
|
||||
GetPageStatusCode::Ok => Code::Ok,
|
||||
GetPageStatusCode::NotFound => Code::NotFound,
|
||||
GetPageStatusCode::InvalidRequest => Code::InvalidArgument,
|
||||
GetPageStatusCode::InternalError => Code::Internal,
|
||||
GetPageStatusCode::SlowDown => Code::ResourceExhausted,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other
|
||||
// shards will error.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
|
||||
@@ -16,6 +16,7 @@ futures.workspace = true
|
||||
hdrhistogram.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
pprof.workspace = true
|
||||
rand.workspace = true
|
||||
reqwest.workspace = true
|
||||
serde.workspace = true
|
||||
@@ -27,8 +28,9 @@ tokio-util.workspace = true
|
||||
tonic.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
pageserver_client.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
pageserver_client_grpc.workspace = true
|
||||
pageserver_page_api.workspace = true
|
||||
utils = { path = "../../libs/utils/" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -10,12 +10,14 @@ use anyhow::Context;
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::{Stream, StreamExt as _};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest};
|
||||
use pageserver_api::reltag::RelTag;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client_grpc::{self as client_grpc, ShardSpec};
|
||||
use pageserver_page_api as page_api;
|
||||
use rand::prelude::*;
|
||||
use tokio::task::JoinSet;
|
||||
@@ -37,6 +39,10 @@ pub(crate) struct Args {
|
||||
/// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
page_service_connstring: String,
|
||||
/// Use the rich gRPC Pageserver client `client_grpc::PageserverClient`, rather than the basic
|
||||
/// no-frills `page_api::Client`. Only valid with grpc:// connstrings.
|
||||
#[clap(long)]
|
||||
rich_client: bool,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(long, default_value = "1")]
|
||||
@@ -332,6 +338,7 @@ async fn main_impl(
|
||||
let client: Box<dyn Client> = match scheme.as_str() {
|
||||
"postgresql" | "postgres" => {
|
||||
assert!(!args.compression, "libpq does not support compression");
|
||||
assert!(!args.rich_client, "rich client requires grpc://");
|
||||
Box::new(
|
||||
LibpqClient::new(&args.page_service_connstring, worker_id.timeline)
|
||||
.await
|
||||
@@ -339,6 +346,16 @@ async fn main_impl(
|
||||
)
|
||||
}
|
||||
|
||||
"grpc" if args.rich_client => Box::new(
|
||||
RichGrpcClient::new(
|
||||
&args.page_service_connstring,
|
||||
worker_id.timeline,
|
||||
args.compression,
|
||||
)
|
||||
.await
|
||||
.unwrap(),
|
||||
),
|
||||
|
||||
"grpc" => Box::new(
|
||||
GrpcClient::new(
|
||||
&args.page_service_connstring,
|
||||
@@ -657,7 +674,7 @@ impl Client for GrpcClient {
|
||||
blks: Vec<u32>,
|
||||
) -> anyhow::Result<()> {
|
||||
let req = page_api::GetPageRequest {
|
||||
request_id: req_id,
|
||||
request_id: req_id.into(),
|
||||
request_class: page_api::GetPageClass::Normal,
|
||||
read_lsn: page_api::ReadLsn {
|
||||
request_lsn: req_lsn,
|
||||
@@ -677,6 +694,79 @@ impl Client for GrpcClient {
|
||||
"unexpected status code: {}",
|
||||
resp.status_code,
|
||||
);
|
||||
Ok((resp.request_id, resp.page_images))
|
||||
Ok((
|
||||
resp.request_id.id,
|
||||
resp.pages.into_iter().map(|p| p.image).collect(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// A rich gRPC Pageserver client.
|
||||
struct RichGrpcClient {
|
||||
inner: Arc<client_grpc::PageserverClient>,
|
||||
requests: FuturesUnordered<
|
||||
Pin<Box<dyn Future<Output = anyhow::Result<page_api::GetPageResponse>> + Send>>,
|
||||
>,
|
||||
}
|
||||
|
||||
impl RichGrpcClient {
|
||||
async fn new(
|
||||
connstring: &str,
|
||||
ttid: TenantTimelineId,
|
||||
compression: bool,
|
||||
) -> anyhow::Result<Self> {
|
||||
let inner = Arc::new(client_grpc::PageserverClient::new(
|
||||
ttid.tenant_id,
|
||||
ttid.timeline_id,
|
||||
ShardSpec::new(
|
||||
[(ShardIndex::unsharded(), connstring.to_string())].into(),
|
||||
None,
|
||||
)?,
|
||||
None,
|
||||
compression.then_some(tonic::codec::CompressionEncoding::Zstd),
|
||||
)?);
|
||||
Ok(Self {
|
||||
inner,
|
||||
requests: FuturesUnordered::new(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Client for RichGrpcClient {
|
||||
async fn send_get_page(
|
||||
&mut self,
|
||||
req_id: u64,
|
||||
req_lsn: Lsn,
|
||||
mod_lsn: Lsn,
|
||||
rel: RelTag,
|
||||
blks: Vec<u32>,
|
||||
) -> anyhow::Result<()> {
|
||||
let req = page_api::GetPageRequest {
|
||||
request_id: req_id.into(),
|
||||
request_class: page_api::GetPageClass::Normal,
|
||||
read_lsn: page_api::ReadLsn {
|
||||
request_lsn: req_lsn,
|
||||
not_modified_since_lsn: Some(mod_lsn),
|
||||
},
|
||||
rel,
|
||||
block_numbers: blks,
|
||||
};
|
||||
let inner = self.inner.clone();
|
||||
self.requests.push(Box::pin(async move {
|
||||
inner
|
||||
.get_page(req)
|
||||
.await
|
||||
.map_err(|err| anyhow::anyhow!("{err}"))
|
||||
}));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
|
||||
let resp = self.requests.next().await.unwrap()?;
|
||||
Ok((
|
||||
resp.request_id.id,
|
||||
resp.pages.into_iter().map(|p| p.image).collect(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
127
pageserver/pagebench/src/cmd/idle_streams.rs
Normal file
127
pageserver/pagebench/src/cmd/idle_streams.rs
Normal file
@@ -0,0 +1,127 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::anyhow;
|
||||
use futures::StreamExt;
|
||||
use tonic::transport::Endpoint;
|
||||
use tracing::info;
|
||||
|
||||
use pageserver_page_api::{GetPageClass, GetPageRequest, GetPageStatusCode, ReadLsn, RelTag};
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
/// Starts a large number of idle gRPC GetPage streams.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
/// The Pageserver to connect to. Must use grpc://.
|
||||
#[clap(long, default_value = "grpc://localhost:51051")]
|
||||
server: String,
|
||||
/// The Pageserver HTTP API.
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
http_server: String,
|
||||
/// The number of streams to open.
|
||||
#[clap(long, default_value = "100000")]
|
||||
count: usize,
|
||||
/// Number of streams per connection.
|
||||
#[clap(long, default_value = "100")]
|
||||
per_connection: usize,
|
||||
/// Send a single GetPage request on each stream.
|
||||
#[clap(long, default_value_t = false)]
|
||||
send_request: bool,
|
||||
}
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
rt.block_on(main_impl(args))
|
||||
}
|
||||
|
||||
async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
// Discover a tenant and timeline to use.
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
reqwest::Client::new(),
|
||||
args.http_server.clone(),
|
||||
None,
|
||||
));
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: Some(1),
|
||||
targets: None,
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let ttid = timelines
|
||||
.first()
|
||||
.ok_or_else(|| anyhow!("no timelines found"))?;
|
||||
|
||||
// Set up the initial client.
|
||||
let endpoint = Endpoint::from_shared(args.server.clone())?;
|
||||
|
||||
let connect = async || {
|
||||
pageserver_page_api::Client::new(
|
||||
endpoint.connect().await?,
|
||||
ttid.tenant_id,
|
||||
ttid.timeline_id,
|
||||
ShardIndex::unsharded(),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
};
|
||||
|
||||
let mut client = connect().await?;
|
||||
let mut streams = Vec::with_capacity(args.count);
|
||||
|
||||
// Create streams.
|
||||
for i in 0..args.count {
|
||||
if i % 100 == 0 {
|
||||
info!("opened {}/{} streams", i, args.count);
|
||||
}
|
||||
if i % args.per_connection == 0 && i > 0 {
|
||||
client = connect().await?;
|
||||
}
|
||||
|
||||
let (req_tx, req_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx);
|
||||
let mut resp_stream = client.get_pages(req_stream).await?;
|
||||
|
||||
// Send request if specified.
|
||||
if args.send_request {
|
||||
req_tx.send(GetPageRequest {
|
||||
request_id: 1.into(),
|
||||
request_class: GetPageClass::Normal,
|
||||
read_lsn: ReadLsn {
|
||||
request_lsn: Lsn::MAX,
|
||||
not_modified_since_lsn: Some(Lsn(1)),
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: 1664, // pg_global
|
||||
dbnode: 0, // shared database
|
||||
relnode: 1262, // pg_authid
|
||||
forknum: 0, // init
|
||||
},
|
||||
block_numbers: vec![0],
|
||||
})?;
|
||||
let resp = resp_stream
|
||||
.next()
|
||||
.await
|
||||
.transpose()?
|
||||
.ok_or_else(|| anyhow!("no response"))?;
|
||||
if resp.status_code != GetPageStatusCode::Ok {
|
||||
return Err(anyhow!("{} response", resp.status_code));
|
||||
}
|
||||
}
|
||||
|
||||
// Hold onto streams to avoid closing them.
|
||||
streams.push((req_tx, resp_stream));
|
||||
}
|
||||
|
||||
info!("opened {} streams, sleeping", args.count);
|
||||
|
||||
// Block forever, to hold the idle streams open for inspection.
|
||||
futures::future::pending::<()>().await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,4 +1,7 @@
|
||||
use std::fs::File;
|
||||
|
||||
use clap::Parser;
|
||||
use tracing::info;
|
||||
use utils::logging;
|
||||
|
||||
/// Re-usable pieces of code that aren't CLI-specific.
|
||||
@@ -17,38 +20,73 @@ mod cmd {
|
||||
pub(super) mod aux_files;
|
||||
pub(super) mod basebackup;
|
||||
pub(super) mod getpage_latest_lsn;
|
||||
pub(super) mod idle_streams;
|
||||
pub(super) mod ondemand_download_churn;
|
||||
pub(super) mod trigger_initial_size_calculation;
|
||||
}
|
||||
|
||||
/// Component-level performance test for pageserver.
|
||||
#[derive(clap::Parser)]
|
||||
enum Args {
|
||||
struct Args {
|
||||
/// Takes a client CPU profile into profile.svg. The benchmark must exit cleanly before it's
|
||||
/// written, e.g. via --runtime.
|
||||
#[arg(long)]
|
||||
profile: bool,
|
||||
|
||||
#[command(subcommand)]
|
||||
subcommand: Subcommand,
|
||||
}
|
||||
|
||||
#[derive(clap::Subcommand)]
|
||||
enum Subcommand {
|
||||
Basebackup(cmd::basebackup::Args),
|
||||
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
|
||||
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
|
||||
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
|
||||
AuxFiles(cmd::aux_files::Args),
|
||||
IdleStreams(cmd::idle_streams::Args),
|
||||
}
|
||||
|
||||
fn main() {
|
||||
fn main() -> anyhow::Result<()> {
|
||||
logging::init(
|
||||
logging::LogFormat::Plain,
|
||||
logging::TracingErrorLayerEnablement::Disabled,
|
||||
logging::Output::Stderr,
|
||||
)
|
||||
.unwrap();
|
||||
)?;
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
|
||||
let args = Args::parse();
|
||||
match args {
|
||||
Args::Basebackup(args) => cmd::basebackup::main(args),
|
||||
Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
|
||||
Args::TriggerInitialSizeCalculation(args) => {
|
||||
|
||||
// Start a CPU profile if requested.
|
||||
let mut profiler = None;
|
||||
if args.profile {
|
||||
profiler = Some(
|
||||
pprof::ProfilerGuardBuilder::default()
|
||||
.frequency(1000)
|
||||
.blocklist(&["libc", "libgcc", "pthread", "vdso"])
|
||||
.build()?,
|
||||
);
|
||||
}
|
||||
|
||||
match args.subcommand {
|
||||
Subcommand::Basebackup(args) => cmd::basebackup::main(args),
|
||||
Subcommand::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
|
||||
Subcommand::TriggerInitialSizeCalculation(args) => {
|
||||
cmd::trigger_initial_size_calculation::main(args)
|
||||
}
|
||||
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
|
||||
Args::AuxFiles(args) => cmd::aux_files::main(args),
|
||||
Subcommand::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
|
||||
Subcommand::AuxFiles(args) => cmd::aux_files::main(args),
|
||||
Subcommand::IdleStreams(args) => cmd::idle_streams::main(args),
|
||||
}?;
|
||||
|
||||
// Generate a CPU flamegraph if requested.
|
||||
if let Some(profiler) = profiler {
|
||||
let report = profiler.report().build()?;
|
||||
drop(profiler); // stop profiling
|
||||
let file = File::create("profile.svg")?;
|
||||
report.flamegraph(file)?;
|
||||
info!("wrote CPU profile flamegraph to profile.svg")
|
||||
}
|
||||
.unwrap()
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -114,7 +114,7 @@ where
|
||||
// Compute postgres doesn't have any previous WAL files, but the first
|
||||
// record that it's going to write needs to include the LSN of the
|
||||
// previous record (xl_prev). We include prev_record_lsn in the
|
||||
// "zenith.signal" file, so that postgres can read it during startup.
|
||||
// "neon.signal" file, so that postgres can read it during startup.
|
||||
//
|
||||
// We don't keep full history of record boundaries in the page server,
|
||||
// however, only the predecessor of the latest record on each
|
||||
@@ -751,34 +751,39 @@ where
|
||||
|
||||
//
|
||||
// Add generated pg_control file and bootstrap WAL segment.
|
||||
// Also send zenith.signal file with extra bootstrap data.
|
||||
// Also send neon.signal and zenith.signal file with extra bootstrap data.
|
||||
//
|
||||
async fn add_pgcontrol_file(
|
||||
&mut self,
|
||||
pg_control_bytes: Bytes,
|
||||
system_identifier: u64,
|
||||
) -> Result<(), BasebackupError> {
|
||||
// add zenith.signal file
|
||||
let mut zenith_signal = String::new();
|
||||
// add neon.signal file
|
||||
let mut neon_signal = String::new();
|
||||
if self.prev_record_lsn == Lsn(0) {
|
||||
if self.timeline.is_ancestor_lsn(self.lsn) {
|
||||
write!(zenith_signal, "PREV LSN: none")
|
||||
write!(neon_signal, "PREV LSN: none")
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
} else {
|
||||
write!(zenith_signal, "PREV LSN: invalid")
|
||||
write!(neon_signal, "PREV LSN: invalid")
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
}
|
||||
} else {
|
||||
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
|
||||
write!(neon_signal, "PREV LSN: {}", self.prev_record_lsn)
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
}
|
||||
self.ar
|
||||
.append(
|
||||
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
|
||||
zenith_signal.as_bytes(),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?;
|
||||
|
||||
// TODO: Remove zenith.signal once all historical computes have been replaced
|
||||
// ... and thus support the neon.signal file.
|
||||
for signalfilename in ["neon.signal", "zenith.signal"] {
|
||||
self.ar
|
||||
.append(
|
||||
&new_tar_header(signalfilename, neon_signal.len() as u64)?,
|
||||
neon_signal.as_bytes(),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,neon.signal"))?;
|
||||
}
|
||||
|
||||
//send pg_control
|
||||
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
|
||||
|
||||
@@ -29,8 +29,8 @@ use pageserver::task_mgr::{
|
||||
};
|
||||
use pageserver::tenant::{TenantSharedResources, mgr, secondary};
|
||||
use pageserver::{
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
|
||||
page_cache, page_service, task_mgr, virtual_file,
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener,
|
||||
MetricsCollectionTask, http, page_cache, page_service, task_mgr, virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
@@ -41,6 +41,7 @@ use tracing_utils::OtelGuard;
|
||||
use utils::auth::{JwtAuth, SwappableJwtAuth};
|
||||
use utils::crashsafe::syncfs;
|
||||
use utils::logging::TracingErrorLayerEnablement;
|
||||
use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR};
|
||||
use utils::sentry_init::init_sentry;
|
||||
use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener};
|
||||
|
||||
@@ -763,6 +764,41 @@ fn start_pageserver(
|
||||
(http_task, https_task)
|
||||
};
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
let metrics_collection_task = {
|
||||
let cancel = shutdown_pageserver.child_token();
|
||||
let task = crate::BACKGROUND_RUNTIME.spawn({
|
||||
let cancel = cancel.clone();
|
||||
let background_jobs_barrier = background_jobs_barrier.clone();
|
||||
async move {
|
||||
if conf.force_metric_collection_on_scrape {
|
||||
return;
|
||||
}
|
||||
|
||||
// first wait until background jobs are cleared to launch.
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return; },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
let mut interval = tokio::time::interval(METRICS_COLLECTION_INTERVAL);
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
tracing::info!("cancelled metrics collection task, exiting...");
|
||||
break;
|
||||
},
|
||||
_ = interval.tick() => {}
|
||||
}
|
||||
tokio::task::spawn_blocking(|| {
|
||||
METRICS_COLLECTOR.run_once(true);
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
MetricsCollectionTask(CancellableTask { task, cancel })
|
||||
};
|
||||
/* END_HADRON */
|
||||
|
||||
let consumption_metrics_tasks = {
|
||||
let cancel = shutdown_pageserver.child_token();
|
||||
let task = crate::BACKGROUND_RUNTIME.spawn({
|
||||
@@ -844,6 +880,7 @@ fn start_pageserver(
|
||||
https_endpoint_listener,
|
||||
page_service,
|
||||
page_service_grpc,
|
||||
metrics_collection_task,
|
||||
consumption_metrics_tasks,
|
||||
disk_usage_eviction_task,
|
||||
&tenant_manager,
|
||||
@@ -880,17 +917,15 @@ async fn create_remote_storage_client(
|
||||
// If `test_remote_failures` is non-zero, wrap the client with a
|
||||
// wrapper that simulates failures.
|
||||
if conf.test_remote_failures > 0 {
|
||||
if !cfg!(feature = "testing") {
|
||||
anyhow::bail!(
|
||||
"test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"
|
||||
);
|
||||
}
|
||||
info!(
|
||||
"Simulating remote failures for first {} attempts of each op",
|
||||
conf.test_remote_failures
|
||||
);
|
||||
remote_storage =
|
||||
GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
|
||||
remote_storage = GenericRemoteStorage::unreliable_wrapper(
|
||||
remote_storage,
|
||||
conf.test_remote_failures,
|
||||
conf.test_remote_failures_probability,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(remote_storage)
|
||||
|
||||
@@ -147,7 +147,11 @@ pub struct PageServerConf {
|
||||
|
||||
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
|
||||
|
||||
// The number of allowed failures in remote storage operations.
|
||||
pub test_remote_failures: u64,
|
||||
// The probability of failure in remote storage operations. Only works when test_remote_failures > 1.
|
||||
// Use 100 for 100% failure, 0 for no failure.
|
||||
pub test_remote_failures_probability: u64,
|
||||
|
||||
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
||||
|
||||
@@ -248,6 +252,14 @@ pub struct PageServerConf {
|
||||
pub timeline_import_config: pageserver_api::config::TimelineImportConfig,
|
||||
|
||||
pub basebackup_cache_config: Option<pageserver_api::config::BasebackupCacheConfig>,
|
||||
|
||||
/// Defines what is a big tenant for the purpose of image layer generation.
|
||||
/// See Timeline::should_check_if_image_layers_required
|
||||
pub image_layer_generation_large_timeline_threshold: Option<u64>,
|
||||
|
||||
/// Controls whether to collect all metrics on each scrape or to return potentially stale
|
||||
/// results.
|
||||
pub force_metric_collection_on_scrape: bool,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -392,6 +404,7 @@ impl PageServerConf {
|
||||
synthetic_size_calculation_interval,
|
||||
disk_usage_based_eviction,
|
||||
test_remote_failures,
|
||||
test_remote_failures_probability,
|
||||
ondemand_download_behavior_treat_error_as_warn,
|
||||
background_task_maximum_delay,
|
||||
control_plane_api,
|
||||
@@ -427,6 +440,8 @@ impl PageServerConf {
|
||||
posthog_config,
|
||||
timeline_import_config,
|
||||
basebackup_cache_config,
|
||||
image_layer_generation_large_timeline_threshold,
|
||||
force_metric_collection_on_scrape,
|
||||
} = config_toml;
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
@@ -461,6 +476,7 @@ impl PageServerConf {
|
||||
synthetic_size_calculation_interval,
|
||||
disk_usage_based_eviction,
|
||||
test_remote_failures,
|
||||
test_remote_failures_probability,
|
||||
ondemand_download_behavior_treat_error_as_warn,
|
||||
background_task_maximum_delay,
|
||||
control_plane_api: control_plane_api
|
||||
@@ -484,6 +500,8 @@ impl PageServerConf {
|
||||
dev_mode,
|
||||
timeline_import_config,
|
||||
basebackup_cache_config,
|
||||
image_layer_generation_large_timeline_threshold,
|
||||
force_metric_collection_on_scrape,
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// fields that require additional validation or custom handling
|
||||
|
||||
@@ -194,6 +194,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
|
||||
listen_http_port: m.http_port,
|
||||
listen_https_port: m.https_port,
|
||||
availability_zone_id: az_id.expect("Checked above"),
|
||||
node_ip_addr: None,
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
//! The validator is responsible for validating DeletionLists for execution,
|
||||
//! based on whethe the generation in the DeletionList is still the latest
|
||||
//! based on whether the generation in the DeletionList is still the latest
|
||||
//! generation for a tenant.
|
||||
//!
|
||||
//! The purpose of validation is to ensure split-brain safety in the cluster
|
||||
|
||||
@@ -409,11 +409,12 @@ impl TenantFeatureResolver {
|
||||
|
||||
/// Refresh the cached properties and flags on the critical path.
|
||||
pub fn refresh_properties_and_flags(&self, tenant_shard: &TenantShard) {
|
||||
let mut remote_size_mb = None;
|
||||
let mut remote_size_mb = Some(0.0);
|
||||
for timeline in tenant_shard.list_timelines() {
|
||||
let size = timeline.metrics.resident_physical_size_get();
|
||||
if size == 0 {
|
||||
remote_size_mb = None;
|
||||
break;
|
||||
}
|
||||
if let Some(ref mut remote_size_mb) = remote_size_mb {
|
||||
*remote_size_mb += size as f64 / 1024.0 / 1024.0;
|
||||
|
||||
@@ -116,26 +116,6 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
get:
|
||||
description: Get timelines for tenant
|
||||
responses:
|
||||
"200":
|
||||
description: TimelineInfo
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -618,7 +598,7 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SecondaryProgress"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/:
|
||||
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
@@ -685,6 +665,17 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
get:
|
||||
description: Get timelines for tenant
|
||||
responses:
|
||||
"200":
|
||||
description: TimelineInfo
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
|
||||
parameters:
|
||||
@@ -767,7 +758,7 @@ paths:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/:
|
||||
/v1/tenant:
|
||||
get:
|
||||
description: Get tenants list
|
||||
responses:
|
||||
@@ -847,7 +838,7 @@ paths:
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
|
||||
/v1/tenant/{tenant_id}/config/:
|
||||
/v1/tenant/{tenant_id}/config:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
|
||||
@@ -2,12 +2,15 @@
|
||||
//! Management HTTP API
|
||||
//!
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result, anyhow};
|
||||
use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
use futures::future::join_all;
|
||||
use futures::{StreamExt, TryFutureExt};
|
||||
@@ -44,6 +47,7 @@ use pageserver_api::shard::{ShardCount, TenantShardId};
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
|
||||
use scopeguard::defer;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::json;
|
||||
use tenant_size_model::svg::SvgBranchKind;
|
||||
use tenant_size_model::{SizeResult, StorageModel};
|
||||
@@ -55,6 +59,7 @@ use utils::auth::SwappableJwtAuth;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context;
|
||||
@@ -75,12 +80,13 @@ use crate::tenant::remote_timeline_client::{
|
||||
};
|
||||
use crate::tenant::secondary::SecondaryController;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::ValuesReconstructState;
|
||||
use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
|
||||
use crate::tenant::timeline::{
|
||||
CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline,
|
||||
WaitLsnTimeout, WaitLsnWaiter, import_pgdata,
|
||||
CompactFlags, CompactOptions, CompactRequest, MarkInvisibleRequest, Timeline, WaitLsnTimeout,
|
||||
WaitLsnWaiter, import_pgdata,
|
||||
};
|
||||
use crate::tenant::{
|
||||
GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError,
|
||||
@@ -395,6 +401,7 @@ async fn build_timeline_info(
|
||||
timeline: &Arc<Timeline>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
force_await_initial_logical_size: bool,
|
||||
include_image_consistent_lsn: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
@@ -419,6 +426,10 @@ async fn build_timeline_info(
|
||||
.await?,
|
||||
);
|
||||
}
|
||||
// HADRON
|
||||
if include_image_consistent_lsn {
|
||||
info.image_consistent_lsn = Some(timeline.compute_image_consistent_lsn().await?);
|
||||
}
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
@@ -508,6 +519,8 @@ async fn build_timeline_info_common(
|
||||
is_invisible: Some(is_invisible),
|
||||
|
||||
walreceiver_status,
|
||||
// HADRON
|
||||
image_consistent_lsn: None,
|
||||
};
|
||||
Ok(info)
|
||||
}
|
||||
@@ -710,6 +723,8 @@ async fn timeline_list_handler(
|
||||
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
||||
let force_await_initial_logical_size: Option<bool> =
|
||||
parse_query_param(&request, "force-await-initial-logical-size")?;
|
||||
let include_image_consistent_lsn: Option<bool> =
|
||||
parse_query_param(&request, "include-image-consistent-lsn")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
@@ -730,6 +745,7 @@ async fn timeline_list_handler(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size.unwrap_or(false),
|
||||
force_await_initial_logical_size.unwrap_or(false),
|
||||
include_image_consistent_lsn.unwrap_or(false),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
|
||||
@@ -758,6 +774,9 @@ async fn timeline_and_offloaded_list_handler(
|
||||
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
||||
let force_await_initial_logical_size: Option<bool> =
|
||||
parse_query_param(&request, "force-await-initial-logical-size")?;
|
||||
let include_image_consistent_lsn: Option<bool> =
|
||||
parse_query_param(&request, "include-image-consistent-lsn")?;
|
||||
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
@@ -778,6 +797,7 @@ async fn timeline_and_offloaded_list_handler(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size.unwrap_or(false),
|
||||
force_await_initial_logical_size.unwrap_or(false),
|
||||
include_image_consistent_lsn.unwrap_or(false),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
|
||||
@@ -962,6 +982,9 @@ async fn timeline_detail_handler(
|
||||
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
||||
let force_await_initial_logical_size: Option<bool> =
|
||||
parse_query_param(&request, "force-await-initial-logical-size")?;
|
||||
// HADRON
|
||||
let include_image_consistent_lsn: Option<bool> =
|
||||
parse_query_param(&request, "include-image-consistent-lsn")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
// Logical size calculation needs downloading.
|
||||
@@ -982,6 +1005,7 @@ async fn timeline_detail_handler(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size.unwrap_or(false),
|
||||
force_await_initial_logical_size.unwrap_or(false),
|
||||
include_image_consistent_lsn.unwrap_or(false),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -2500,9 +2524,10 @@ async fn timeline_checkpoint_handler(
|
||||
.compact(&cancel, flags, &ctx)
|
||||
.await
|
||||
.map_err(|e|
|
||||
match e {
|
||||
CompactionError::ShuttingDown => ApiError::ShuttingDown,
|
||||
CompactionError::Other(e) => ApiError::InternalServerError(e),
|
||||
if e.is_cancel() {
|
||||
ApiError::ShuttingDown
|
||||
} else {
|
||||
ApiError::InternalServerError(e.into_anyhow())
|
||||
}
|
||||
)?;
|
||||
}
|
||||
@@ -2687,6 +2712,16 @@ async fn deletion_queue_flush(
|
||||
}
|
||||
}
|
||||
|
||||
/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
struct GetPageResponse {
|
||||
pub page: Bytes,
|
||||
pub layers_visited: u32,
|
||||
pub delta_layers_visited: u32,
|
||||
pub records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pub img: Option<(Lsn, Bytes)>,
|
||||
}
|
||||
|
||||
async fn getpage_at_lsn_handler(
|
||||
request: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
@@ -2737,21 +2772,24 @@ async fn getpage_at_lsn_handler_inner(
|
||||
|
||||
// Use last_record_lsn if no lsn is provided
|
||||
let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
|
||||
let page = timeline.get(key.0, lsn, &ctx).await?;
|
||||
|
||||
if touch {
|
||||
json_response(StatusCode::OK, ())
|
||||
} else {
|
||||
Result::<_, ApiError>::Ok(
|
||||
Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(header::CONTENT_TYPE, "application/octet-stream")
|
||||
.body(hyper::Body::from(page))
|
||||
.unwrap(),
|
||||
)
|
||||
let mut reconstruct_state = ValuesReconstructState::new_with_debug(IoConcurrency::sequential());
|
||||
let page = timeline.debug_get(key.0, lsn, &ctx, &mut reconstruct_state).await?;
|
||||
let response = GetPageResponse {
|
||||
page,
|
||||
layers_visited: reconstruct_state.get_layers_visited(),
|
||||
delta_layers_visited: reconstruct_state.get_delta_layers_visited(),
|
||||
records: reconstruct_state.debug_state.records.clone(),
|
||||
img: reconstruct_state.debug_state.img.clone(),
|
||||
};
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
.instrument(info_span!("timeline_debug_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -3213,6 +3251,30 @@ async fn get_utilization(
|
||||
.map_err(ApiError::InternalServerError)
|
||||
}
|
||||
|
||||
/// HADRON
|
||||
async fn list_tenant_visible_size_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let mut map = BTreeMap::new();
|
||||
for (tenant_shard_id, slot) in state.tenant_manager.list() {
|
||||
match slot {
|
||||
TenantSlot::Attached(tenant) => {
|
||||
let visible_size = tenant.get_visible_size();
|
||||
map.insert(tenant_shard_id, visible_size);
|
||||
}
|
||||
TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, map)
|
||||
}
|
||||
|
||||
async fn list_aux_files(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -3616,6 +3678,7 @@ async fn activate_post_import_handler(
|
||||
let timeline_info = build_timeline_info(
|
||||
&timeline, false, // include_non_incremental_logical_size,
|
||||
false, // force_await_initial_logical_size
|
||||
false, // include_image_consistent_lsn
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -3691,6 +3754,23 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn force_refresh_feature_flag(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant
|
||||
.feature_resolver
|
||||
.refresh_properties_and_flags(&tenant);
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_evaluate_feature_flag(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -3920,9 +4000,14 @@ pub fn make_router(
|
||||
.expect("construct launch timestamp header middleware"),
|
||||
);
|
||||
|
||||
let force_metric_collection_on_scrape = state.conf.force_metric_collection_on_scrape;
|
||||
|
||||
let prometheus_metrics_handler_wrapper =
|
||||
move |req| prometheus_metrics_handler(req, force_metric_collection_on_scrape);
|
||||
|
||||
Ok(router
|
||||
.data(state)
|
||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||
.get("/metrics", move |r| request_span(r, prometheus_metrics_handler_wrapper))
|
||||
.get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
|
||||
.get("/profile/heap", |r| request_span(r, profile_heap_handler))
|
||||
.get("/v1/status", |r| api_handler(r, status_handler))
|
||||
@@ -4115,7 +4200,7 @@ pub fn make_router(
|
||||
})
|
||||
.get(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
|
||||
|r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
|
||||
|r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage",
|
||||
@@ -4128,6 +4213,7 @@ pub fn make_router(
|
||||
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
||||
.put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler))
|
||||
.get("/v1/utilization", |r| api_handler(r, get_utilization))
|
||||
.get("/v1/list_tenant_visible_size", |r| api_handler(r, list_tenant_visible_size_handler))
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
|
||||
|r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
|
||||
@@ -4156,6 +4242,9 @@ pub fn make_router(
|
||||
.get("/v1/tenant/:tenant_shard_id/feature_flag/:flag_key", |r| {
|
||||
api_handler(r, tenant_evaluate_feature_flag)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_shard_id/force_refresh_feature_flag", |r| {
|
||||
api_handler(r, force_refresh_feature_flag)
|
||||
})
|
||||
.put("/v1/feature_flag/:flag_key", |r| {
|
||||
testing_api_handler("force override feature flag - put", r, force_override_feature_flag_for_testing_put)
|
||||
})
|
||||
|
||||
@@ -610,13 +610,13 @@ async fn import_file(
|
||||
debug!("imported twophase file");
|
||||
} else if file_path.starts_with("pg_wal") {
|
||||
debug!("found wal file in base section. ignore it");
|
||||
} else if file_path.starts_with("zenith.signal") {
|
||||
} else if file_path.starts_with("zenith.signal") || file_path.starts_with("neon.signal") {
|
||||
// Parse zenith signal file to set correct previous LSN
|
||||
let bytes = read_all_bytes(reader).await?;
|
||||
// zenith.signal format is "PREV LSN: prev_lsn"
|
||||
// neon.signal format is "PREV LSN: prev_lsn"
|
||||
// TODO write serialization and deserialization in the same place.
|
||||
let zenith_signal = std::str::from_utf8(&bytes)?.trim();
|
||||
let prev_lsn = match zenith_signal {
|
||||
let neon_signal = std::str::from_utf8(&bytes)?.trim();
|
||||
let prev_lsn = match neon_signal {
|
||||
"PREV LSN: none" => Lsn(0),
|
||||
"PREV LSN: invalid" => Lsn(0),
|
||||
other => {
|
||||
@@ -624,17 +624,17 @@ async fn import_file(
|
||||
split[1]
|
||||
.trim()
|
||||
.parse::<Lsn>()
|
||||
.context("can't parse zenith.signal")?
|
||||
.context("can't parse neon.signal")?
|
||||
}
|
||||
};
|
||||
|
||||
// zenith.signal is not necessarily the last file, that we handle
|
||||
// neon.signal is not necessarily the last file, that we handle
|
||||
// but it is ok to call `finish_write()`, because final `modification.commit()`
|
||||
// will update lsn once more to the final one.
|
||||
let writer = modification.tline.writer().await;
|
||||
writer.finish_write(prev_lsn);
|
||||
|
||||
debug!("imported zenith signal {}", prev_lsn);
|
||||
debug!("imported neon signal {}", prev_lsn);
|
||||
} else if file_path.starts_with("pg_tblspc") {
|
||||
// TODO Backups exported from neon won't have pg_tblspc, but we will need
|
||||
// this to import arbitrary postgres databases.
|
||||
|
||||
@@ -73,6 +73,9 @@ pub struct HttpEndpointListener(pub CancellableTask);
|
||||
pub struct HttpsEndpointListener(pub CancellableTask);
|
||||
pub struct ConsumptionMetricsTasks(pub CancellableTask);
|
||||
pub struct DiskUsageEvictionTask(pub CancellableTask);
|
||||
// HADRON
|
||||
pub struct MetricsCollectionTask(pub CancellableTask);
|
||||
|
||||
impl CancellableTask {
|
||||
pub async fn shutdown(self) {
|
||||
self.cancel.cancel();
|
||||
@@ -87,6 +90,7 @@ pub async fn shutdown_pageserver(
|
||||
https_listener: Option<HttpsEndpointListener>,
|
||||
page_service: page_service::Listener,
|
||||
grpc_task: Option<CancellableTask>,
|
||||
metrics_collection_task: MetricsCollectionTask,
|
||||
consumption_metrics_worker: ConsumptionMetricsTasks,
|
||||
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
|
||||
tenant_manager: &TenantManager,
|
||||
@@ -211,6 +215,14 @@ pub async fn shutdown_pageserver(
|
||||
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
||||
deletion_queue.shutdown(Duration::from_secs(5)).await;
|
||||
|
||||
// HADRON
|
||||
timed(
|
||||
metrics_collection_task.0.shutdown(),
|
||||
"shutdown metrics collections metrics",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
timed(
|
||||
consumption_metrics_worker.0.shutdown(),
|
||||
"shutdown consumption metrics",
|
||||
|
||||
@@ -2847,6 +2847,24 @@ pub(crate) static MISROUTED_PAGESTREAM_REQUESTS: Lazy<IntCounter> = Lazy::new(||
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Global counter for PageStream request results by outcome. Outcomes are divided into 3 categories:
|
||||
// - success
|
||||
// - internal_error: errors that indicate bugs in the storage cluster (e.g. page reconstruction errors, misrouted requests, LSN timeout errors)
|
||||
// - other_error: transient error conditions that are expected in normal operation or indicate bugs with other parts of the system (e.g. error due to pageserver shutdown, malformed requests etc.)
|
||||
pub(crate) static PAGESTREAM_HANDLER_RESULTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_pagestream_handler_results_total",
|
||||
"Number of pageserver pagestream handler results by outcome (success, internal_error, other_error)",
|
||||
&["outcome"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Constants for pageserver_pagestream_handler_results_total's outcome labels
|
||||
pub(crate) const PAGESTREAM_HANDLER_OUTCOME_SUCCESS: &str = "success";
|
||||
pub(crate) const PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR: &str = "internal_error";
|
||||
pub(crate) const PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR: &str = "other_error";
|
||||
|
||||
// Metrics collected on WAL redo operations
|
||||
//
|
||||
// We collect the time spent in actual WAL redo ('redo'), and time waiting
|
||||
|
||||
@@ -70,7 +70,7 @@ use crate::context::{
|
||||
};
|
||||
use crate::metrics::{
|
||||
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
|
||||
MISROUTED_PAGESTREAM_REQUESTS, SmgrOpTimer, TimelineMetrics,
|
||||
MISROUTED_PAGESTREAM_REQUESTS, PAGESTREAM_HANDLER_RESULTS_TOTAL, SmgrOpTimer, TimelineMetrics,
|
||||
};
|
||||
use crate::pgdatadir_mapping::{LsnRange, Version};
|
||||
use crate::span::{
|
||||
@@ -1441,20 +1441,57 @@ impl PageServerHandler {
|
||||
let (response_msg, ctx) = match handler_result {
|
||||
Err(e) => match &e.err {
|
||||
PageStreamError::Shutdown => {
|
||||
// BEGIN HADRON
|
||||
PAGESTREAM_HANDLER_RESULTS_TOTAL
|
||||
.with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR])
|
||||
.inc();
|
||||
// END HADRON
|
||||
|
||||
// If we fail to fulfil a request during shutdown, which may be _because_ of
|
||||
// shutdown, then do not send the error to the client. Instead just drop the
|
||||
// connection.
|
||||
span.in_scope(|| info!("dropping connection due to shutdown"));
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
PageStreamError::Reconnect(reason) => {
|
||||
span.in_scope(|| info!("handler requested reconnect: {reason}"));
|
||||
PageStreamError::Reconnect(_reason) => {
|
||||
span.in_scope(|| {
|
||||
// BEGIN HADRON
|
||||
// We can get here because the compute node is pointing at the wrong PS. We
|
||||
// already have a metric to keep track of this so suppressing this log to
|
||||
// reduce log spam. The information in this log message is not going to be that
|
||||
// helpful given the volume of logs that can be generated.
|
||||
// info!("handler requested reconnect: {reason}")
|
||||
// END HADRON
|
||||
});
|
||||
// BEGIN HADRON
|
||||
PAGESTREAM_HANDLER_RESULTS_TOTAL
|
||||
.with_label_values(&[
|
||||
metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR,
|
||||
])
|
||||
.inc();
|
||||
// END HADRON
|
||||
return Err(QueryError::Reconnect);
|
||||
}
|
||||
PageStreamError::Read(_)
|
||||
| PageStreamError::LsnTimeout(_)
|
||||
| PageStreamError::NotFound(_)
|
||||
| PageStreamError::BadRequest(_) => {
|
||||
// BEGIN HADRON
|
||||
if let PageStreamError::Read(_) | PageStreamError::LsnTimeout(_) = &e.err {
|
||||
PAGESTREAM_HANDLER_RESULTS_TOTAL
|
||||
.with_label_values(&[
|
||||
metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR,
|
||||
])
|
||||
.inc();
|
||||
} else {
|
||||
PAGESTREAM_HANDLER_RESULTS_TOTAL
|
||||
.with_label_values(&[
|
||||
metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR,
|
||||
])
|
||||
.inc();
|
||||
}
|
||||
// END HADRON
|
||||
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||
// here includes cancellation which is not an error.
|
||||
@@ -1472,7 +1509,15 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
},
|
||||
Ok((response_msg, _op_timer_already_observed, ctx)) => (response_msg, Some(ctx)),
|
||||
Ok((response_msg, _op_timer_already_observed, ctx)) => {
|
||||
// BEGIN HADRON
|
||||
PAGESTREAM_HANDLER_RESULTS_TOTAL
|
||||
.with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_SUCCESS])
|
||||
.inc();
|
||||
// END HADRON
|
||||
|
||||
(response_msg, Some(ctx))
|
||||
}
|
||||
};
|
||||
|
||||
let ctx = ctx.map(|req_ctx| {
|
||||
@@ -3293,9 +3338,12 @@ impl GrpcPageServiceHandler {
|
||||
}
|
||||
|
||||
/// Generates a PagestreamRequest header from a ReadLsn and request ID.
|
||||
fn make_hdr(read_lsn: page_api::ReadLsn, req_id: u64) -> PagestreamRequest {
|
||||
fn make_hdr(
|
||||
read_lsn: page_api::ReadLsn,
|
||||
req_id: Option<page_api::RequestID>,
|
||||
) -> PagestreamRequest {
|
||||
PagestreamRequest {
|
||||
reqid: req_id,
|
||||
reqid: req_id.map(|r| r.id).unwrap_or_default(),
|
||||
request_lsn: read_lsn.request_lsn,
|
||||
not_modified_since: read_lsn
|
||||
.not_modified_since_lsn
|
||||
@@ -3353,6 +3401,8 @@ impl GrpcPageServiceHandler {
|
||||
/// NB: errors returned from here are intercepted in get_pages(), and may be converted to a
|
||||
/// GetPageResponse with an appropriate status code to avoid terminating the stream.
|
||||
///
|
||||
/// TODO: verify that the requested pages belong to this shard.
|
||||
///
|
||||
/// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
|
||||
/// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
|
||||
/// split them up in the client or server.
|
||||
@@ -3403,7 +3453,7 @@ impl GrpcPageServiceHandler {
|
||||
|
||||
batch.push(BatchedGetPageRequest {
|
||||
req: PagestreamGetPageRequest {
|
||||
hdr: Self::make_hdr(req.read_lsn, req.request_id),
|
||||
hdr: Self::make_hdr(req.read_lsn, Some(req.request_id)),
|
||||
rel: req.rel,
|
||||
blkno,
|
||||
},
|
||||
@@ -3433,12 +3483,16 @@ impl GrpcPageServiceHandler {
|
||||
request_id: req.request_id,
|
||||
status_code: page_api::GetPageStatusCode::Ok,
|
||||
reason: None,
|
||||
page_images: Vec::with_capacity(results.len()),
|
||||
rel: req.rel,
|
||||
pages: Vec::with_capacity(results.len()),
|
||||
};
|
||||
|
||||
for result in results {
|
||||
match result {
|
||||
Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.page_images.push(r.page),
|
||||
Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.pages.push(page_api::Page {
|
||||
block_number: r.req.blkno,
|
||||
image: r.page,
|
||||
}),
|
||||
Ok((resp, _, _)) => {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"unexpected response: {resp:?}"
|
||||
@@ -3481,7 +3535,7 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
span_record!(rel=%req.rel, lsn=%req.read_lsn);
|
||||
|
||||
let req = PagestreamExistsRequest {
|
||||
hdr: Self::make_hdr(req.read_lsn, 0),
|
||||
hdr: Self::make_hdr(req.read_lsn, None),
|
||||
rel: req.rel,
|
||||
};
|
||||
|
||||
@@ -3631,7 +3685,7 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
span_record!(db_oid=%req.db_oid, lsn=%req.read_lsn);
|
||||
|
||||
let req = PagestreamDbSizeRequest {
|
||||
hdr: Self::make_hdr(req.read_lsn, 0),
|
||||
hdr: Self::make_hdr(req.read_lsn, None),
|
||||
dbnode: req.db_oid,
|
||||
};
|
||||
|
||||
@@ -3681,7 +3735,7 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
.await?
|
||||
.downgrade();
|
||||
while let Some(req) = reqs.message().await? {
|
||||
let req_id = req.request_id;
|
||||
let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default();
|
||||
let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
|
||||
.instrument(span.clone()) // propagate request span
|
||||
.await;
|
||||
@@ -3720,7 +3774,7 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
span_record!(rel=%req.rel, lsn=%req.read_lsn);
|
||||
|
||||
let req = PagestreamNblocksRequest {
|
||||
hdr: Self::make_hdr(req.read_lsn, 0),
|
||||
hdr: Self::make_hdr(req.read_lsn, None),
|
||||
rel: req.rel,
|
||||
};
|
||||
|
||||
@@ -3753,7 +3807,7 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
span_record!(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn);
|
||||
|
||||
let req = PagestreamGetSlruSegmentRequest {
|
||||
hdr: Self::make_hdr(req.read_lsn, 0),
|
||||
hdr: Self::make_hdr(req.read_lsn, None),
|
||||
kind: req.kind as u8,
|
||||
segno: req.segno,
|
||||
};
|
||||
|
||||
@@ -25,9 +25,9 @@ use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
|
||||
use pageserver_api::models::RelSizeMigration;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::{BLCKSZ, PgMajorVersion, TimestampTz, TransactionId};
|
||||
use postgres_ffi::{BLCKSZ, PgMajorVersion, TransactionId};
|
||||
use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi_types::{Oid, RepOriginId};
|
||||
use postgres_ffi_types::{Oid, RepOriginId, TimestampTz};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
@@ -3291,7 +3291,7 @@ impl TenantShard {
|
||||
// Ignore this, we likely raced with unarchival.
|
||||
OffloadError::NotArchived => Ok(()),
|
||||
OffloadError::AlreadyInProgress => Ok(()),
|
||||
OffloadError::Cancelled => Err(CompactionError::ShuttingDown),
|
||||
OffloadError::Cancelled => Err(CompactionError::new_cancelled()),
|
||||
// don't break the anyhow chain
|
||||
OffloadError::Other(err) => Err(CompactionError::Other(err)),
|
||||
})?;
|
||||
@@ -3321,16 +3321,13 @@ impl TenantShard {
|
||||
|
||||
/// Trips the compaction circuit breaker if appropriate.
|
||||
pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
|
||||
match err {
|
||||
err if err.is_cancel() => {}
|
||||
CompactionError::ShuttingDown => (),
|
||||
CompactionError::Other(err) => {
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
|
||||
}
|
||||
if err.is_cancel() {
|
||||
return;
|
||||
}
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
|
||||
}
|
||||
|
||||
/// Cancel scheduled compaction tasks
|
||||
@@ -3396,7 +3393,13 @@ impl TenantShard {
|
||||
.collect_vec();
|
||||
|
||||
for timeline in timelines {
|
||||
timeline.maybe_freeze_ephemeral_layer().await;
|
||||
// Include a span with the timeline ID. The parent span already has the tenant ID.
|
||||
let span =
|
||||
info_span!("maybe_freeze_ephemeral_layer", timeline_id = %timeline.timeline_id);
|
||||
timeline
|
||||
.maybe_freeze_ephemeral_layer()
|
||||
.instrument(span)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4174,6 +4177,15 @@ impl TenantShard {
|
||||
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
||||
}
|
||||
|
||||
// HADRON
|
||||
pub fn get_image_creation_timeout(&self) -> Option<Duration> {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf.image_layer_force_creation_period.or(self
|
||||
.conf
|
||||
.default_tenant_conf
|
||||
.image_layer_force_creation_period)
|
||||
}
|
||||
|
||||
pub fn get_pitr_interval(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
@@ -5713,6 +5725,16 @@ impl TenantShard {
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
/// HADRON
|
||||
/// Return the visible size of all timelines in this tenant.
|
||||
pub(crate) fn get_visible_size(&self) -> u64 {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
timelines
|
||||
.values()
|
||||
.map(|t| t.metrics.visible_physical_size_gauge.get())
|
||||
.sum()
|
||||
}
|
||||
|
||||
/// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
|
||||
/// manifest in `Self::remote_tenant_manifest`.
|
||||
///
|
||||
@@ -12800,6 +12822,40 @@ mod tests {
|
||||
},
|
||||
]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_force_image_creation_lsn() -> anyhow::Result<()> {
|
||||
let tenant_conf = pageserver_api::models::TenantConfig {
|
||||
pitr_interval: Some(Duration::from_secs(7 * 3600)),
|
||||
image_layer_force_creation_period: Some(Duration::from_secs(3600)),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let tenant_id = TenantId::generate();
|
||||
|
||||
let harness = TenantHarness::create_custom(
|
||||
"test_get_force_image_creation_lsn",
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
ShardIdentity::unsharded(),
|
||||
Generation::new(1),
|
||||
)
|
||||
.await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
timeline.gc_info.write().unwrap().cutoffs.time = Some(Lsn(100));
|
||||
{
|
||||
let writer = timeline.writer().await;
|
||||
writer.finish_write(Lsn(5000));
|
||||
}
|
||||
|
||||
let image_creation_lsn = timeline.get_force_image_creation_lsn().unwrap();
|
||||
assert_eq!(image_creation_lsn, Lsn(4300));
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user