mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-19 22:20:37 +00:00
Compare commits
77 Commits
problame/r
...
conrad/com
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e0f70a97cc | ||
|
|
dfa055f4be | ||
|
|
a4c76740c0 | ||
|
|
f2e96b2323 | ||
|
|
dee73f0cb4 | ||
|
|
edf51688bc | ||
|
|
4a8f3508f9 | ||
|
|
48052477b4 | ||
|
|
d81353b2d1 | ||
|
|
143500dc4f | ||
|
|
1a5f7ce6ad | ||
|
|
01ccb34118 | ||
|
|
f669e18477 | ||
|
|
632cde7f13 | ||
|
|
118e13438d | ||
|
|
fc136eec8f | ||
|
|
818e5130f1 | ||
|
|
c243521ae5 | ||
|
|
5303c71589 | ||
|
|
d146897415 | ||
|
|
d63815fa40 | ||
|
|
385324ee8a | ||
|
|
8a68d463f6 | ||
|
|
3046c307da | ||
|
|
e83f1d8ba5 | ||
|
|
8917676e86 | ||
|
|
43acabd4c2 | ||
|
|
db24ba95d1 | ||
|
|
1dce65308d | ||
|
|
ad88ec9257 | ||
|
|
60dfdf39c7 | ||
|
|
3d5e2bf685 | ||
|
|
54fdcfdfa8 | ||
|
|
28e882a80f | ||
|
|
24038033bf | ||
|
|
1b935b1958 | ||
|
|
3f16ca2c18 | ||
|
|
67b94c5992 | ||
|
|
e38193c530 | ||
|
|
21949137ed | ||
|
|
02f94edb60 | ||
|
|
58327ef74d | ||
|
|
73be6bb736 | ||
|
|
40d7583906 | ||
|
|
7a68699abb | ||
|
|
f42d44342d | ||
|
|
d759fcb8bd | ||
|
|
76f95f06d8 | ||
|
|
7efd4554ab | ||
|
|
3c7235669a | ||
|
|
6dd84041a1 | ||
|
|
df7e301a54 | ||
|
|
470c7d5e0e | ||
|
|
4d99b6ff4d | ||
|
|
590301df08 | ||
|
|
c511786548 | ||
|
|
fe31baf985 | ||
|
|
b23e75ebfe | ||
|
|
24d7c37e6e | ||
|
|
f64eb0cbaf | ||
|
|
6ae4b89000 | ||
|
|
f7ec7668a2 | ||
|
|
038e967daf | ||
|
|
6a43f23eca | ||
|
|
868f194a3b | ||
|
|
9c6c780201 | ||
|
|
6123fe2d5e | ||
|
|
1577665c20 | ||
|
|
d8ebd1d771 | ||
|
|
c8a96cf722 | ||
|
|
56d505bce6 | ||
|
|
dae203ef69 | ||
|
|
1fb1315aed | ||
|
|
838622c594 | ||
|
|
3fd5a94a85 | ||
|
|
e7d6f525b3 | ||
|
|
e4ca3ac745 |
11
.github/workflows/_build-and-test-locally.yml
vendored
11
.github/workflows/_build-and-test-locally.yml
vendored
@@ -38,6 +38,11 @@ on:
|
||||
required: false
|
||||
default: 1
|
||||
type: number
|
||||
rerun-failed:
|
||||
description: 'rerun failed tests to ignore flaky tests'
|
||||
required: false
|
||||
default: true
|
||||
type: boolean
|
||||
|
||||
defaults:
|
||||
run:
|
||||
@@ -379,7 +384,7 @@ jobs:
|
||||
- name: Pytest regression tests
|
||||
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 75 || 180 }}
|
||||
timeout-minutes: ${{ (inputs.build-type == 'release' && inputs.sanitizers != 'enabled') && 75 || 180 }}
|
||||
with:
|
||||
build_type: ${{ inputs.build-type }}
|
||||
test_selection: regress
|
||||
@@ -387,14 +392,14 @@ jobs:
|
||||
run_with_real_s3: true
|
||||
real_s3_bucket: neon-github-ci-tests
|
||||
real_s3_region: eu-central-1
|
||||
rerun_failed: ${{ inputs.test-run-count == 1 }}
|
||||
rerun_failed: ${{ inputs.rerun-failed }}
|
||||
pg_version: ${{ matrix.pg_version }}
|
||||
sanitizers: ${{ inputs.sanitizers }}
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
# `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
|
||||
# Attempt to stop tests gracefully to generate test reports
|
||||
# until they are forcibly stopped by the stricter `timeout-minutes` limit.
|
||||
extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }} --count=${{ inputs.test-run-count }}
|
||||
extra_params: --session-timeout=${{ (inputs.build-type == 'release' && inputs.sanitizers != 'enabled') && 3000 || 10200 }} --count=${{ inputs.test-run-count }}
|
||||
${{ inputs.test-selection != '' && format('-k "{0}"', inputs.test-selection) || '' }}
|
||||
env:
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
@@ -58,6 +58,7 @@ jobs:
|
||||
test-cfg: ${{ inputs.pg-versions }}
|
||||
test-selection: ${{ inputs.test-selection }}
|
||||
test-run-count: ${{ fromJson(inputs.run-count) }}
|
||||
rerun-failed: false
|
||||
secrets: inherit
|
||||
|
||||
create-test-report:
|
||||
|
||||
22
.github/workflows/build_and_test.yml
vendored
22
.github/workflows/build_and_test.yml
vendored
@@ -199,6 +199,28 @@ jobs:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
secrets: inherit
|
||||
|
||||
validate-compute-manifest:
|
||||
runs-on: ubuntu-22.04
|
||||
needs: [ meta, check-permissions ]
|
||||
# We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Set up Node.js
|
||||
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
|
||||
with:
|
||||
node-version: '24'
|
||||
|
||||
- name: Validate manifest against schema
|
||||
run: |
|
||||
make -C compute manifest-schema-validation
|
||||
|
||||
build-and-test-locally:
|
||||
needs: [ meta, build-build-tools-image ]
|
||||
# We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
|
||||
151
.github/workflows/build_and_test_fully.yml
vendored
Normal file
151
.github/workflows/build_and_test_fully.yml
vendored
Normal file
@@ -0,0 +1,151 @@
|
||||
name: Build and Test Fully
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 3 * * *' # run once a day, timezone is utc
|
||||
workflow_dispatch:
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
# Allow only one workflow per any non-`main` branch.
|
||||
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
RUST_BACKTRACE: 1
|
||||
COPT: '-Werror'
|
||||
|
||||
jobs:
|
||||
tag:
|
||||
runs-on: [ self-hosted, small ]
|
||||
container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned
|
||||
outputs:
|
||||
build-tag: ${{steps.build-tag.outputs.tag}}
|
||||
|
||||
steps:
|
||||
# Need `fetch-depth: 0` to count the number of commits in the branch
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get build tag
|
||||
run: |
|
||||
echo run:$GITHUB_RUN_ID
|
||||
echo ref:$GITHUB_REF_NAME
|
||||
echo rev:$(git rev-list --count HEAD)
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
|
||||
echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'"
|
||||
echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
shell: bash
|
||||
id: build-tag
|
||||
|
||||
build-build-tools-image:
|
||||
uses: ./.github/workflows/build-build-tools-image.yml
|
||||
secrets: inherit
|
||||
|
||||
build-and-test-locally:
|
||||
needs: [ tag, build-build-tools-image ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
build-type: [ debug, release ]
|
||||
uses: ./.github/workflows/_build-and-test-locally.yml
|
||||
with:
|
||||
arch: ${{ matrix.arch }}
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
build-tag: ${{ needs.tag.outputs.build-tag }}
|
||||
build-type: ${{ matrix.build-type }}
|
||||
rerun-failed: false
|
||||
test-cfg: '[{"pg_version":"v14", "lfc_state": "with-lfc"},
|
||||
{"pg_version":"v15", "lfc_state": "with-lfc"},
|
||||
{"pg_version":"v16", "lfc_state": "with-lfc"},
|
||||
{"pg_version":"v17", "lfc_state": "with-lfc"},
|
||||
{"pg_version":"v14", "lfc_state": "without-lfc"},
|
||||
{"pg_version":"v15", "lfc_state": "without-lfc"},
|
||||
{"pg_version":"v16", "lfc_state": "without-lfc"},
|
||||
{"pg_version":"v17", "lfc_state": "withouts-lfc"}]'
|
||||
secrets: inherit
|
||||
|
||||
|
||||
create-test-report:
|
||||
needs: [ build-and-test-locally, build-build-tools-image ]
|
||||
if: ${{ !cancelled() }}
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
outputs:
|
||||
report-url: ${{ steps.create-allure-report.outputs.report-url }}
|
||||
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
credentials:
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
id: create-allure-report
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
|
||||
if: ${{ !cancelled() }}
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
retries: 5
|
||||
script: |
|
||||
const report = {
|
||||
reportUrl: "${{ steps.create-allure-report.outputs.report-url }}",
|
||||
reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
|
||||
}
|
||||
|
||||
const coverage = {}
|
||||
|
||||
const script = require("./scripts/comment-test-report.js")
|
||||
await script({
|
||||
github,
|
||||
context,
|
||||
fetch,
|
||||
report,
|
||||
coverage,
|
||||
})
|
||||
@@ -79,6 +79,7 @@ jobs:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
build-tag: ${{ needs.tag.outputs.build-tag }}
|
||||
build-type: ${{ matrix.build-type }}
|
||||
rerun-failed: false
|
||||
test-cfg: '[{"pg_version":"v17"}]'
|
||||
sanitizers: enabled
|
||||
secrets: inherit
|
||||
|
||||
46
Cargo.lock
generated
46
Cargo.lock
generated
@@ -753,6 +753,7 @@ dependencies = [
|
||||
"axum",
|
||||
"axum-core",
|
||||
"bytes",
|
||||
"form_urlencoded",
|
||||
"futures-util",
|
||||
"headers",
|
||||
"http 1.1.0",
|
||||
@@ -761,6 +762,8 @@ dependencies = [
|
||||
"mime",
|
||||
"pin-project-lite",
|
||||
"serde",
|
||||
"serde_html_form",
|
||||
"serde_path_to_error",
|
||||
"tower 0.5.2",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
@@ -900,12 +903,6 @@ version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5"
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.21.7"
|
||||
@@ -1297,7 +1294,7 @@ dependencies = [
|
||||
"aws-smithy-types",
|
||||
"axum",
|
||||
"axum-extra",
|
||||
"base64 0.13.1",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"camino",
|
||||
"cfg-if",
|
||||
@@ -1423,7 +1420,7 @@ name = "control_plane"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"base64 0.13.1",
|
||||
"base64 0.22.1",
|
||||
"camino",
|
||||
"clap",
|
||||
"comfy-table",
|
||||
@@ -1445,6 +1442,7 @@ dependencies = [
|
||||
"regex",
|
||||
"reqwest",
|
||||
"safekeeper_api",
|
||||
"safekeeper_client",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -2054,6 +2052,7 @@ dependencies = [
|
||||
"axum-extra",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
"clap",
|
||||
"futures",
|
||||
"http-body-util",
|
||||
"itertools 0.10.5",
|
||||
@@ -4237,6 +4236,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"camino",
|
||||
"clap",
|
||||
"futures",
|
||||
@@ -4812,7 +4812,7 @@ dependencies = [
|
||||
name = "postgres-protocol2"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"base64 0.22.1",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -5184,7 +5184,7 @@ dependencies = [
|
||||
"aws-config",
|
||||
"aws-sdk-iam",
|
||||
"aws-sigv4",
|
||||
"base64 0.13.1",
|
||||
"base64 0.22.1",
|
||||
"bstr",
|
||||
"bytes",
|
||||
"camino",
|
||||
@@ -6419,6 +6419,19 @@ dependencies = [
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_html_form"
|
||||
version = "0.2.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d2de91cf02bbc07cde38891769ccd5d4f073d22a40683aa4bc7a95781aaa2c4"
|
||||
dependencies = [
|
||||
"form_urlencoded",
|
||||
"indexmap 2.9.0",
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.125"
|
||||
@@ -6475,15 +6488,17 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde_with"
|
||||
version = "2.3.3"
|
||||
version = "3.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe"
|
||||
checksum = "d6b6f7f2fcb69f747921f79f3926bd1e203fce4fef62c268dd3abfb6d86029aa"
|
||||
dependencies = [
|
||||
"base64 0.13.1",
|
||||
"base64 0.22.1",
|
||||
"chrono",
|
||||
"hex",
|
||||
"indexmap 1.9.3",
|
||||
"indexmap 2.9.0",
|
||||
"serde",
|
||||
"serde_derive",
|
||||
"serde_json",
|
||||
"serde_with_macros",
|
||||
"time",
|
||||
@@ -6491,9 +6506,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde_with_macros"
|
||||
version = "2.3.3"
|
||||
version = "3.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "881b6f881b17d13214e5d494c939ebab463d01264ce1811e9d4ac3a882e7695f"
|
||||
checksum = "8d00caa5193a3c8362ac2b73be6b9e768aa5a4b2f721d8f4b339600c3cb51f8e"
|
||||
dependencies = [
|
||||
"darling",
|
||||
"proc-macro2",
|
||||
@@ -8564,7 +8579,6 @@ dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
"axum-core",
|
||||
"base64 0.13.1",
|
||||
"base64 0.21.7",
|
||||
"base64ct",
|
||||
"bytes",
|
||||
|
||||
@@ -71,8 +71,8 @@ aws-credential-types = "1.2.0"
|
||||
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
|
||||
aws-types = "1.3"
|
||||
axum = { version = "0.8.1", features = ["ws"] }
|
||||
axum-extra = { version = "0.10.0", features = ["typed-header"] }
|
||||
base64 = "0.13.0"
|
||||
axum-extra = { version = "0.10.0", features = ["typed-header", "query"] }
|
||||
base64 = "0.22"
|
||||
bincode = "1.3"
|
||||
bindgen = "0.71"
|
||||
bit_field = "0.10.2"
|
||||
@@ -171,7 +171,7 @@ sentry = { version = "0.37", default-features = false, features = ["backtrace",
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_path_to_error = "0.1"
|
||||
serde_with = { version = "2.0", features = [ "base64" ] }
|
||||
serde_with = { version = "3", features = [ "base64" ] }
|
||||
serde_assert = "0.5.0"
|
||||
sha2 = "0.10.2"
|
||||
signal-hook = "0.3"
|
||||
|
||||
13
Dockerfile
13
Dockerfile
@@ -110,6 +110,19 @@ RUN set -e \
|
||||
# System postgres for use with client libraries (e.g. in storage controller)
|
||||
postgresql-15 \
|
||||
openssl \
|
||||
unzip \
|
||||
curl \
|
||||
&& ARCH=$(uname -m) \
|
||||
&& if [ "$ARCH" = "x86_64" ]; then \
|
||||
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"; \
|
||||
elif [ "$ARCH" = "aarch64" ]; then \
|
||||
curl "https://awscli.amazonaws.com/awscli-exe-linux-aarch64.zip" -o "awscliv2.zip"; \
|
||||
else \
|
||||
echo "Unsupported architecture: $ARCH" && exit 1; \
|
||||
fi \
|
||||
&& unzip awscliv2.zip \
|
||||
&& ./aws/install \
|
||||
&& rm -rf aws awscliv2.zip \
|
||||
&& rm -f /etc/apt/apt.conf.d/80-retries \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
|
||||
&& useradd -d /data neon \
|
||||
|
||||
3
compute/.gitignore
vendored
3
compute/.gitignore
vendored
@@ -3,3 +3,6 @@ etc/neon_collector.yml
|
||||
etc/neon_collector_autoscaling.yml
|
||||
etc/sql_exporter.yml
|
||||
etc/sql_exporter_autoscaling.yml
|
||||
|
||||
# Node.js dependencies
|
||||
node_modules/
|
||||
|
||||
@@ -48,3 +48,11 @@ jsonnetfmt-test:
|
||||
.PHONY: jsonnetfmt-format
|
||||
jsonnetfmt-format:
|
||||
jsonnetfmt --in-place $(jsonnet_files)
|
||||
|
||||
.PHONY: manifest-schema-validation
|
||||
manifest-schema-validation: node_modules
|
||||
node_modules/.bin/jsonschema validate -d https://json-schema.org/draft/2020-12/schema manifest.schema.json manifest.yaml
|
||||
|
||||
node_modules: package.json
|
||||
npm install
|
||||
touch node_modules
|
||||
|
||||
@@ -603,7 +603,7 @@ RUN case "${PG_VERSION:?}" in \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \
|
||||
echo "059b7d9e5a90013a58bdd22e9505b88406ce05790675eb2d8434e5b215652d54 online_advisor.tar.gz" | sha256sum --check && \
|
||||
echo "37dcadf8f7cc8d6cc1f8831276ee245b44f1b0274f09e511e47a67738ba9ed0f online_advisor.tar.gz" | sha256sum --check && \
|
||||
mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C .
|
||||
|
||||
FROM pg-build AS online_advisor-build
|
||||
@@ -1722,11 +1722,29 @@ FROM extensions-${EXTENSIONS} AS neon-pg-ext-build
|
||||
# Compile the Neon-specific `compute_ctl`, `fast_import`, and `local_proxy` binaries
|
||||
#
|
||||
#########################################################################################
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools-plan
|
||||
ARG BUILD_TAG
|
||||
ENV BUILD_TAG=$BUILD_TAG
|
||||
|
||||
WORKDIR /home/nonroot
|
||||
USER nonroot
|
||||
|
||||
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
COPY --chown=nonroot . .
|
||||
RUN cargo chef prepare --recipe-path recipe.json
|
||||
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
|
||||
ARG BUILD_TAG
|
||||
ENV BUILD_TAG=$BUILD_TAG
|
||||
|
||||
USER nonroot
|
||||
|
||||
COPY --from=compute-tools-plan /home/nonroot/recipe.json recipe.json
|
||||
RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \
|
||||
--mount=type=cache,uid=1000,target=/home/nonroot/.cargo/git \
|
||||
--mount=type=cache,uid=1000,target=/home/nonroot/target \
|
||||
mold -run cargo chef cook --locked --profile release-line-debug-size-lto --recipe-path recipe.json
|
||||
|
||||
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
COPY --chown=nonroot . .
|
||||
RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \
|
||||
@@ -1902,6 +1920,7 @@ COPY compute/patches/pg_repack.patch /ext-src
|
||||
RUN cd /ext-src/pg_repack-src && patch -p1 </ext-src/pg_repack.patch && rm -f /ext-src/pg_repack.patch
|
||||
|
||||
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
|
||||
RUN echo /usr/local/pgsql/lib > /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig
|
||||
RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq \
|
||||
&& apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/*
|
||||
ENV PATH=/usr/local/pgsql/bin:$PATH
|
||||
|
||||
209
compute/manifest.schema.json
Normal file
209
compute/manifest.schema.json
Normal file
@@ -0,0 +1,209 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"title": "Neon Compute Manifest Schema",
|
||||
"description": "Schema for Neon compute node configuration manifest",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"pg_settings": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"common": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"client_connection_check_interval": {
|
||||
"type": "string",
|
||||
"description": "Check for client disconnection interval in milliseconds"
|
||||
},
|
||||
"effective_io_concurrency": {
|
||||
"type": "string",
|
||||
"description": "Effective IO concurrency setting"
|
||||
},
|
||||
"fsync": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether to force fsync to disk"
|
||||
},
|
||||
"hot_standby": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether hot standby is enabled"
|
||||
},
|
||||
"idle_in_transaction_session_timeout": {
|
||||
"type": "string",
|
||||
"description": "Timeout for idle transactions in milliseconds"
|
||||
},
|
||||
"listen_addresses": {
|
||||
"type": "string",
|
||||
"description": "Addresses to listen on"
|
||||
},
|
||||
"log_connections": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether to log connections"
|
||||
},
|
||||
"log_disconnections": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether to log disconnections"
|
||||
},
|
||||
"log_temp_files": {
|
||||
"type": "string",
|
||||
"description": "Size threshold for logging temporary files in KB"
|
||||
},
|
||||
"log_error_verbosity": {
|
||||
"type": "string",
|
||||
"enum": ["terse", "verbose", "default"],
|
||||
"description": "Error logging verbosity level"
|
||||
},
|
||||
"log_min_error_statement": {
|
||||
"type": "string",
|
||||
"description": "Minimum error level for statement logging"
|
||||
},
|
||||
"maintenance_io_concurrency": {
|
||||
"type": "string",
|
||||
"description": "Maintenance IO concurrency setting"
|
||||
},
|
||||
"max_connections": {
|
||||
"type": "string",
|
||||
"description": "Maximum number of connections"
|
||||
},
|
||||
"max_replication_flush_lag": {
|
||||
"type": "string",
|
||||
"description": "Maximum replication flush lag"
|
||||
},
|
||||
"max_replication_slots": {
|
||||
"type": "string",
|
||||
"description": "Maximum number of replication slots"
|
||||
},
|
||||
"max_replication_write_lag": {
|
||||
"type": "string",
|
||||
"description": "Maximum replication write lag"
|
||||
},
|
||||
"max_wal_senders": {
|
||||
"type": "string",
|
||||
"description": "Maximum number of WAL senders"
|
||||
},
|
||||
"max_wal_size": {
|
||||
"type": "string",
|
||||
"description": "Maximum WAL size"
|
||||
},
|
||||
"neon.unstable_extensions": {
|
||||
"type": "string",
|
||||
"description": "List of unstable extensions"
|
||||
},
|
||||
"neon.protocol_version": {
|
||||
"type": "string",
|
||||
"description": "Neon protocol version"
|
||||
},
|
||||
"password_encryption": {
|
||||
"type": "string",
|
||||
"description": "Password encryption method"
|
||||
},
|
||||
"restart_after_crash": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether to restart after crash"
|
||||
},
|
||||
"superuser_reserved_connections": {
|
||||
"type": "string",
|
||||
"description": "Number of reserved connections for superuser"
|
||||
},
|
||||
"synchronous_standby_names": {
|
||||
"type": "string",
|
||||
"description": "Names of synchronous standby servers"
|
||||
},
|
||||
"wal_keep_size": {
|
||||
"type": "string",
|
||||
"description": "WAL keep size"
|
||||
},
|
||||
"wal_level": {
|
||||
"type": "string",
|
||||
"description": "WAL level"
|
||||
},
|
||||
"wal_log_hints": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether to log hints in WAL"
|
||||
},
|
||||
"wal_sender_timeout": {
|
||||
"type": "string",
|
||||
"description": "WAL sender timeout in milliseconds"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"client_connection_check_interval",
|
||||
"effective_io_concurrency",
|
||||
"fsync",
|
||||
"hot_standby",
|
||||
"idle_in_transaction_session_timeout",
|
||||
"listen_addresses",
|
||||
"log_connections",
|
||||
"log_disconnections",
|
||||
"log_temp_files",
|
||||
"log_error_verbosity",
|
||||
"log_min_error_statement",
|
||||
"maintenance_io_concurrency",
|
||||
"max_connections",
|
||||
"max_replication_flush_lag",
|
||||
"max_replication_slots",
|
||||
"max_replication_write_lag",
|
||||
"max_wal_senders",
|
||||
"max_wal_size",
|
||||
"neon.unstable_extensions",
|
||||
"neon.protocol_version",
|
||||
"password_encryption",
|
||||
"restart_after_crash",
|
||||
"superuser_reserved_connections",
|
||||
"synchronous_standby_names",
|
||||
"wal_keep_size",
|
||||
"wal_level",
|
||||
"wal_log_hints",
|
||||
"wal_sender_timeout"
|
||||
]
|
||||
},
|
||||
"replica": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"hot_standby": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether hot standby is enabled for replicas"
|
||||
}
|
||||
},
|
||||
"required": ["hot_standby"]
|
||||
},
|
||||
"per_version": {
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
"^1[4-7]$": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"common": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"io_combine_limit": {
|
||||
"type": "string",
|
||||
"description": "IO combine limit"
|
||||
}
|
||||
}
|
||||
},
|
||||
"replica": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"recovery_prefetch": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether to enable recovery prefetch for PostgreSQL replicas"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["common", "replica", "per_version"]
|
||||
}
|
||||
},
|
||||
"required": ["pg_settings"]
|
||||
}
|
||||
121
compute/manifest.yaml
Normal file
121
compute/manifest.yaml
Normal file
@@ -0,0 +1,121 @@
|
||||
pg_settings:
|
||||
# Common settings for primaries and replicas of all versions.
|
||||
common:
|
||||
# Check for client disconnection every 1 minute. By default, Postgres will detect the
|
||||
# loss of the connection only at the next interaction with the socket, when it waits
|
||||
# for, receives or sends data, so it will likely waste resources till the end of the
|
||||
# query execution. There should be no drawbacks in setting this for everyone, so enable
|
||||
# it by default. If anyone will complain, we can allow editing it.
|
||||
# https://www.postgresql.org/docs/16/runtime-config-connection.html#GUC-CLIENT-CONNECTION-CHECK-INTERVAL
|
||||
client_connection_check_interval: "60000" # 1 minute
|
||||
# ---- IO ----
|
||||
effective_io_concurrency: "20"
|
||||
maintenance_io_concurrency: "100"
|
||||
fsync: "off"
|
||||
hot_standby: "off"
|
||||
# We allow users to change this if needed, but by default we
|
||||
# just don't want to see long-lasting idle transactions, as they
|
||||
# prevent activity monitor from suspending projects.
|
||||
idle_in_transaction_session_timeout: "300000" # 5 minutes
|
||||
listen_addresses: "*"
|
||||
# --- LOGGING ---- helps investigations
|
||||
log_connections: "on"
|
||||
log_disconnections: "on"
|
||||
# 1GB, unit is KB
|
||||
log_temp_files: "1048576"
|
||||
# Disable dumping customer data to logs, both to increase data privacy
|
||||
# and to reduce the amount the logs.
|
||||
log_error_verbosity: "terse"
|
||||
log_min_error_statement: "panic"
|
||||
max_connections: "100"
|
||||
# --- WAL ---
|
||||
# - flush lag is the max amount of WAL that has been generated but not yet stored
|
||||
# to disk in the page server. A smaller value means less delay after a pageserver
|
||||
# restart, but if you set it too small you might again need to slow down writes if the
|
||||
# pageserver cannot flush incoming WAL to disk fast enough. This must be larger
|
||||
# than the pageserver's checkpoint interval, currently 1 GB! Otherwise you get a
|
||||
# a deadlock where the compute node refuses to generate more WAL before the
|
||||
# old WAL has been uploaded to S3, but the pageserver is waiting for more WAL
|
||||
# to be generated before it is uploaded to S3.
|
||||
max_replication_flush_lag: "10GB"
|
||||
max_replication_slots: "10"
|
||||
# Backpressure configuration:
|
||||
# - write lag is the max amount of WAL that has been generated by Postgres but not yet
|
||||
# processed by the page server. Making this smaller reduces the worst case latency
|
||||
# of a GetPage request, if you request a page that was recently modified. On the other
|
||||
# hand, if this is too small, the compute node might need to wait on a write if there is a
|
||||
# hiccup in the network or page server so that the page server has temporarily fallen
|
||||
# behind.
|
||||
#
|
||||
# Previously it was set to 500 MB, but it caused compute being unresponsive under load
|
||||
# https://github.com/neondatabase/neon/issues/2028
|
||||
max_replication_write_lag: "500MB"
|
||||
max_wal_senders: "10"
|
||||
# A Postgres checkpoint is cheap in storage, as doesn't involve any significant amount
|
||||
# of real I/O. Only the SLRU buffers and some other small files are flushed to disk.
|
||||
# However, as long as we have full_page_writes=on, page updates after a checkpoint
|
||||
# include full-page images which bloats the WAL. So may want to bump max_wal_size to
|
||||
# reduce the WAL bloating, but at the same it will increase pg_wal directory size on
|
||||
# compute and can lead to out of disk error on k8s nodes.
|
||||
max_wal_size: "1024"
|
||||
wal_keep_size: "0"
|
||||
wal_level: "replica"
|
||||
# Reduce amount of WAL generated by default.
|
||||
wal_log_hints: "off"
|
||||
# - without wal_sender_timeout set we don't get feedback messages,
|
||||
# required for backpressure.
|
||||
wal_sender_timeout: "10000"
|
||||
# We have some experimental extensions, which we don't want users to install unconsciously.
|
||||
# To install them, users would need to set the `neon.allow_unstable_extensions` setting.
|
||||
# There are two of them currently:
|
||||
# - `pgrag` - https://github.com/neondatabase-labs/pgrag - extension is actually called just `rag`,
|
||||
# and two dependencies:
|
||||
# - `rag_bge_small_en_v15`
|
||||
# - `rag_jina_reranker_v1_tiny_en`
|
||||
# - `pg_mooncake` - https://github.com/Mooncake-Labs/pg_mooncake/
|
||||
neon.unstable_extensions: "rag,rag_bge_small_en_v15,rag_jina_reranker_v1_tiny_en,pg_mooncake,anon"
|
||||
neon.protocol_version: "3"
|
||||
password_encryption: "scram-sha-256"
|
||||
# This is important to prevent Postgres from trying to perform
|
||||
# a local WAL redo after backend crash. It should exit and let
|
||||
# the systemd or k8s to do a fresh startup with compute_ctl.
|
||||
restart_after_crash: "off"
|
||||
# By default 3. We have the following persistent connections in the VM:
|
||||
# * compute_activity_monitor (from compute_ctl)
|
||||
# * postgres-exporter (metrics collector; it has 2 connections)
|
||||
# * sql_exporter (metrics collector; we have 2 instances [1 for us & users; 1 for autoscaling])
|
||||
# * vm-monitor (to query & change file cache size)
|
||||
# i.e. total of 6. Let's reserve 7, so there's still at least one left over.
|
||||
superuser_reserved_connections: "7"
|
||||
synchronous_standby_names: "walproposer"
|
||||
|
||||
replica:
|
||||
hot_standby: "on"
|
||||
|
||||
per_version:
|
||||
17:
|
||||
common:
|
||||
# PostgreSQL 17 has a new IO system called "read stream", which can combine IOs up to some
|
||||
# size. It still has some issues with readahead, though, so we default to disabled/
|
||||
# "no combining of IOs" to make sure we get the maximum prefetch depth.
|
||||
# See also: https://github.com/neondatabase/neon/pull/9860
|
||||
io_combine_limit: "1"
|
||||
replica:
|
||||
# prefetching of blocks referenced in WAL doesn't make sense for us
|
||||
# Neon hot standby ignores pages that are not in the shared_buffers
|
||||
recovery_prefetch: "off"
|
||||
16:
|
||||
common: {}
|
||||
replica:
|
||||
# prefetching of blocks referenced in WAL doesn't make sense for us
|
||||
# Neon hot standby ignores pages that are not in the shared_buffers
|
||||
recovery_prefetch: "off"
|
||||
15:
|
||||
common: {}
|
||||
replica:
|
||||
# prefetching of blocks referenced in WAL doesn't make sense for us
|
||||
# Neon hot standby ignores pages that are not in the shared_buffers
|
||||
recovery_prefetch: "off"
|
||||
14:
|
||||
common: {}
|
||||
replica: {}
|
||||
37
compute/package-lock.json
generated
Normal file
37
compute/package-lock.json
generated
Normal file
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"name": "neon-compute",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "neon-compute",
|
||||
"dependencies": {
|
||||
"@sourcemeta/jsonschema": "9.3.4"
|
||||
}
|
||||
},
|
||||
"node_modules/@sourcemeta/jsonschema": {
|
||||
"version": "9.3.4",
|
||||
"resolved": "https://registry.npmjs.org/@sourcemeta/jsonschema/-/jsonschema-9.3.4.tgz",
|
||||
"integrity": "sha512-hkujfkZAIGXUs4U//We9faZW8LZ4/H9LqagRYsFSulH/VLcKPNhZyCTGg7AhORuzm27zqENvKpnX4g2FzudYFw==",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
],
|
||||
"license": "AGPL-3.0",
|
||||
"os": [
|
||||
"darwin",
|
||||
"linux",
|
||||
"win32"
|
||||
],
|
||||
"bin": {
|
||||
"jsonschema": "cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=16"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/sourcemeta"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
7
compute/package.json
Normal file
7
compute/package.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"name": "neon-compute",
|
||||
"private": true,
|
||||
"dependencies": {
|
||||
"@sourcemeta/jsonschema": "9.3.4"
|
||||
}
|
||||
}
|
||||
@@ -40,7 +40,7 @@ use std::sync::mpsc;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::{Context, Result, bail};
|
||||
use clap::Parser;
|
||||
use compute_api::responses::ComputeConfig;
|
||||
use compute_tools::compute::{
|
||||
@@ -57,14 +57,14 @@ use tracing::{error, info};
|
||||
use url::Url;
|
||||
use utils::failpoint_support;
|
||||
|
||||
#[derive(Parser)]
|
||||
#[derive(Debug, Parser)]
|
||||
#[command(rename_all = "kebab-case")]
|
||||
struct Cli {
|
||||
#[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
|
||||
pub pgbin: String,
|
||||
|
||||
/// The base URL for the remote extension storage proxy gateway.
|
||||
#[arg(short = 'r', long)]
|
||||
#[arg(short = 'r', long, value_parser = Self::parse_remote_ext_base_url)]
|
||||
pub remote_ext_base_url: Option<Url>,
|
||||
|
||||
/// The port to bind the external listening HTTP server to. Clients running
|
||||
@@ -126,6 +126,25 @@ struct Cli {
|
||||
pub installed_extensions_collection_interval: u64,
|
||||
}
|
||||
|
||||
impl Cli {
|
||||
/// Parse a URL from an argument. By default, this isn't necessary, but we
|
||||
/// want to do some sanity checking.
|
||||
fn parse_remote_ext_base_url(value: &str) -> Result<Url> {
|
||||
// Remove extra trailing slashes, and add one. We use Url::join() later
|
||||
// when downloading remote extensions. If the base URL is something like
|
||||
// http://example.com/pg-ext-s3-gateway, and join() is called with
|
||||
// something like "xyz", the resulting URL is http://example.com/xyz.
|
||||
let value = value.trim_end_matches('/').to_owned() + "/";
|
||||
let url = Url::parse(&value)?;
|
||||
|
||||
if url.query_pairs().count() != 0 {
|
||||
bail!("parameters detected in remote extensions base URL")
|
||||
}
|
||||
|
||||
Ok(url)
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
@@ -252,7 +271,8 @@ fn handle_exit_signal(sig: i32) {
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use clap::CommandFactory;
|
||||
use clap::{CommandFactory, Parser};
|
||||
use url::Url;
|
||||
|
||||
use super::Cli;
|
||||
|
||||
@@ -260,4 +280,43 @@ mod test {
|
||||
fn verify_cli() {
|
||||
Cli::command().debug_assert()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn verify_remote_ext_base_url() {
|
||||
let cli = Cli::parse_from([
|
||||
"compute_ctl",
|
||||
"--pgdata=test",
|
||||
"--connstr=test",
|
||||
"--compute-id=test",
|
||||
"--remote-ext-base-url",
|
||||
"https://example.com/subpath",
|
||||
]);
|
||||
assert_eq!(
|
||||
cli.remote_ext_base_url.unwrap(),
|
||||
Url::parse("https://example.com/subpath/").unwrap()
|
||||
);
|
||||
|
||||
let cli = Cli::parse_from([
|
||||
"compute_ctl",
|
||||
"--pgdata=test",
|
||||
"--connstr=test",
|
||||
"--compute-id=test",
|
||||
"--remote-ext-base-url",
|
||||
"https://example.com//",
|
||||
]);
|
||||
assert_eq!(
|
||||
cli.remote_ext_base_url.unwrap(),
|
||||
Url::parse("https://example.com").unwrap()
|
||||
);
|
||||
|
||||
Cli::try_parse_from([
|
||||
"compute_ctl",
|
||||
"--pgdata=test",
|
||||
"--connstr=test",
|
||||
"--compute-id=test",
|
||||
"--remote-ext-base-url",
|
||||
"https://example.com?hello=world",
|
||||
])
|
||||
.expect_err("URL parameters are not allowed");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use chrono::{DateTime, Utc};
|
||||
use compute_api::privilege::Privilege;
|
||||
use compute_api::responses::{
|
||||
ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
|
||||
LfcPrewarmState,
|
||||
LfcPrewarmState, TlsConfig,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
|
||||
@@ -354,11 +354,6 @@ impl ComputeNode {
|
||||
// that can affect `compute_ctl` and prevent it from properly configuring the database schema.
|
||||
// Unset them via connection string options before connecting to the database.
|
||||
// N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`.
|
||||
//
|
||||
// TODO(ololobus): we currently pass `-c default_transaction_read_only=off` from control plane
|
||||
// as well. After rolling out this code, we can remove this parameter from control plane.
|
||||
// In the meantime, double-passing is fine, the last value is applied.
|
||||
// See: <https://github.com/neondatabase/cloud/blob/133dd8c4dbbba40edfbad475bf6a45073ca63faf/goapp/controlplane/internal/pkg/compute/provisioner/provisioner_common.go#L70>
|
||||
const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0";
|
||||
let options = match conn_conf.get_options() {
|
||||
Some(options) => format!("{} {}", options, EXTRA_OPTIONS),
|
||||
@@ -603,6 +598,8 @@ impl ComputeNode {
|
||||
});
|
||||
}
|
||||
|
||||
let tls_config = self.tls_config(&pspec.spec);
|
||||
|
||||
// If there are any remote extensions in shared_preload_libraries, start downloading them
|
||||
if pspec.spec.remote_extensions.is_some() {
|
||||
let (this, spec) = (self.clone(), pspec.spec.clone());
|
||||
@@ -659,7 +656,7 @@ impl ComputeNode {
|
||||
info!("tuning pgbouncer");
|
||||
|
||||
let pgbouncer_settings = pgbouncer_settings.clone();
|
||||
let tls_config = self.compute_ctl_config.tls.clone();
|
||||
let tls_config = tls_config.clone();
|
||||
|
||||
// Spawn a background task to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
@@ -678,7 +675,10 @@ impl ComputeNode {
|
||||
|
||||
// Spawn a background task to do the configuration,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let local_proxy = local_proxy.clone();
|
||||
|
||||
let mut local_proxy = local_proxy.clone();
|
||||
local_proxy.tls = tls_config.clone();
|
||||
|
||||
let _handle = tokio::spawn(async move {
|
||||
if let Err(err) = local_proxy::configure(&local_proxy) {
|
||||
error!("error while configuring local_proxy: {err:?}");
|
||||
@@ -780,7 +780,7 @@ impl ComputeNode {
|
||||
self.spawn_extension_stats_task();
|
||||
|
||||
if pspec.spec.autoprewarm {
|
||||
self.prewarm_lfc();
|
||||
self.prewarm_lfc(None);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1205,13 +1205,15 @@ impl ComputeNode {
|
||||
let spec = &pspec.spec;
|
||||
let pgdata_path = Path::new(&self.params.pgdata);
|
||||
|
||||
let tls_config = self.tls_config(&pspec.spec);
|
||||
|
||||
// Remove/create an empty pgdata directory and put configuration there.
|
||||
self.create_pgdata()?;
|
||||
config::write_postgres_conf(
|
||||
pgdata_path,
|
||||
&pspec.spec,
|
||||
self.params.internal_http_port,
|
||||
&self.compute_ctl_config.tls,
|
||||
tls_config,
|
||||
)?;
|
||||
|
||||
// Syncing safekeepers is only safe with primary nodes: if a primary
|
||||
@@ -1536,14 +1538,22 @@ impl ComputeNode {
|
||||
.clone(),
|
||||
);
|
||||
|
||||
let mut tls_config = None::<TlsConfig>;
|
||||
if spec.features.contains(&ComputeFeature::TlsExperimental) {
|
||||
tls_config = self.compute_ctl_config.tls.clone();
|
||||
}
|
||||
|
||||
let max_concurrent_connections = self.max_service_connections(compute_state, &spec);
|
||||
|
||||
// Merge-apply spec & changes to PostgreSQL state.
|
||||
self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?;
|
||||
|
||||
if let Some(local_proxy) = &spec.clone().local_proxy_config {
|
||||
let mut local_proxy = local_proxy.clone();
|
||||
local_proxy.tls = tls_config.clone();
|
||||
|
||||
info!("configuring local_proxy");
|
||||
local_proxy::configure(local_proxy).context("apply_config local_proxy")?;
|
||||
local_proxy::configure(&local_proxy).context("apply_config local_proxy")?;
|
||||
}
|
||||
|
||||
// Run migrations separately to not hold up cold starts
|
||||
@@ -1595,11 +1605,13 @@ impl ComputeNode {
|
||||
pub fn reconfigure(&self) -> Result<()> {
|
||||
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
|
||||
|
||||
let tls_config = self.tls_config(&spec);
|
||||
|
||||
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
|
||||
info!("tuning pgbouncer");
|
||||
|
||||
let pgbouncer_settings = pgbouncer_settings.clone();
|
||||
let tls_config = self.compute_ctl_config.tls.clone();
|
||||
let tls_config = tls_config.clone();
|
||||
|
||||
// Spawn a background task to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
@@ -1617,7 +1629,7 @@ impl ComputeNode {
|
||||
// Spawn a background task to do the configuration,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let mut local_proxy = local_proxy.clone();
|
||||
local_proxy.tls = self.compute_ctl_config.tls.clone();
|
||||
local_proxy.tls = tls_config.clone();
|
||||
tokio::spawn(async move {
|
||||
if let Err(err) = local_proxy::configure(&local_proxy) {
|
||||
error!("error while configuring local_proxy: {err:?}");
|
||||
@@ -1635,7 +1647,7 @@ impl ComputeNode {
|
||||
pgdata_path,
|
||||
&spec,
|
||||
self.params.internal_http_port,
|
||||
&self.compute_ctl_config.tls,
|
||||
tls_config,
|
||||
)?;
|
||||
|
||||
if !spec.skip_pg_catalog_updates {
|
||||
@@ -1755,6 +1767,14 @@ impl ComputeNode {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn tls_config(&self, spec: &ComputeSpec) -> &Option<TlsConfig> {
|
||||
if spec.features.contains(&ComputeFeature::TlsExperimental) {
|
||||
&self.compute_ctl_config.tls
|
||||
} else {
|
||||
&None::<TlsConfig>
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the `last_active` in the shared state, but ensure that it's a more recent one.
|
||||
pub fn update_last_active(&self, last_active: Option<DateTime<Utc>>) {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
|
||||
@@ -25,11 +25,16 @@ struct EndpointStoragePair {
|
||||
}
|
||||
|
||||
const KEY: &str = "lfc_state";
|
||||
impl TryFrom<&crate::compute::ParsedSpec> for EndpointStoragePair {
|
||||
type Error = anyhow::Error;
|
||||
fn try_from(pspec: &crate::compute::ParsedSpec) -> Result<Self, Self::Error> {
|
||||
let Some(ref endpoint_id) = pspec.spec.endpoint_id else {
|
||||
bail!("pspec.endpoint_id missing")
|
||||
impl EndpointStoragePair {
|
||||
/// endpoint_id is set to None while prewarming from other endpoint, see replica promotion
|
||||
/// If not None, takes precedence over pspec.spec.endpoint_id
|
||||
fn from_spec_and_endpoint(
|
||||
pspec: &crate::compute::ParsedSpec,
|
||||
endpoint_id: Option<String>,
|
||||
) -> Result<Self> {
|
||||
let endpoint_id = endpoint_id.as_ref().or(pspec.spec.endpoint_id.as_ref());
|
||||
let Some(ref endpoint_id) = endpoint_id else {
|
||||
bail!("pspec.endpoint_id missing, other endpoint_id not provided")
|
||||
};
|
||||
let Some(ref base_uri) = pspec.endpoint_storage_addr else {
|
||||
bail!("pspec.endpoint_storage_addr missing")
|
||||
@@ -84,7 +89,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
/// Returns false if there is a prewarm request ongoing, true otherwise
|
||||
pub fn prewarm_lfc(self: &Arc<Self>) -> bool {
|
||||
pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
|
||||
crate::metrics::LFC_PREWARM_REQUESTS.inc();
|
||||
{
|
||||
let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
|
||||
@@ -97,7 +102,7 @@ impl ComputeNode {
|
||||
|
||||
let cloned = self.clone();
|
||||
spawn(async move {
|
||||
let Err(err) = cloned.prewarm_impl().await else {
|
||||
let Err(err) = cloned.prewarm_impl(from_endpoint).await else {
|
||||
cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed;
|
||||
return;
|
||||
};
|
||||
@@ -109,13 +114,14 @@ impl ComputeNode {
|
||||
true
|
||||
}
|
||||
|
||||
fn endpoint_storage_pair(&self) -> Result<EndpointStoragePair> {
|
||||
/// from_endpoint: None for endpoint managed by this compute_ctl
|
||||
fn endpoint_storage_pair(&self, from_endpoint: Option<String>) -> Result<EndpointStoragePair> {
|
||||
let state = self.state.lock().unwrap();
|
||||
state.pspec.as_ref().unwrap().try_into()
|
||||
EndpointStoragePair::from_spec_and_endpoint(state.pspec.as_ref().unwrap(), from_endpoint)
|
||||
}
|
||||
|
||||
async fn prewarm_impl(&self) -> Result<()> {
|
||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?;
|
||||
async fn prewarm_impl(&self, from_endpoint: Option<String>) -> Result<()> {
|
||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?;
|
||||
info!(%url, "requesting LFC state from endpoint storage");
|
||||
|
||||
let request = Client::new().get(&url).bearer_auth(token);
|
||||
@@ -173,7 +179,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
async fn offload_lfc_impl(&self) -> Result<()> {
|
||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?;
|
||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
|
||||
info!(%url, "requesting LFC state from postgres");
|
||||
|
||||
let mut compressed = Vec::new();
|
||||
|
||||
@@ -166,7 +166,7 @@ pub async fn download_extension(
|
||||
|
||||
// TODO add retry logic
|
||||
let download_buffer =
|
||||
match download_extension_tar(remote_ext_base_url.as_str(), &ext_path.to_string()).await {
|
||||
match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await {
|
||||
Ok(buffer) => buffer,
|
||||
Err(error_message) => {
|
||||
return Err(anyhow::anyhow!(
|
||||
@@ -271,10 +271,14 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
|
||||
}
|
||||
|
||||
// Do request to extension storage proxy, e.g.,
|
||||
// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
|
||||
// curl http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/latest/v15/extensions/anon.tar.zst
|
||||
// using HTTP GET and return the response body as bytes.
|
||||
async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result<Bytes> {
|
||||
let uri = format!("{}/{}", remote_ext_base_url, ext_path);
|
||||
async fn download_extension_tar(remote_ext_base_url: &Url, ext_path: &str) -> Result<Bytes> {
|
||||
let uri = remote_ext_base_url.join(ext_path).with_context(|| {
|
||||
format!(
|
||||
"failed to create the remote extension URI for {ext_path} using {remote_ext_base_url}"
|
||||
)
|
||||
})?;
|
||||
let filename = Path::new(ext_path)
|
||||
.file_name()
|
||||
.unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
|
||||
@@ -284,7 +288,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re
|
||||
|
||||
info!("Downloading extension file '{}' from uri {}", filename, uri);
|
||||
|
||||
match do_extension_server_request(&uri).await {
|
||||
match do_extension_server_request(uri).await {
|
||||
Ok(resp) => {
|
||||
info!("Successfully downloaded remote extension data {}", ext_path);
|
||||
REMOTE_EXT_REQUESTS_TOTAL
|
||||
@@ -303,7 +307,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re
|
||||
|
||||
// Do a single remote extensions server request.
|
||||
// Return result or (error message + stringified status code) in case of any failures.
|
||||
async fn do_extension_server_request(uri: &str) -> Result<Bytes, (String, String)> {
|
||||
async fn do_extension_server_request(uri: Url) -> Result<Bytes, (String, String)> {
|
||||
let resp = reqwest::get(uri).await.map_err(|e| {
|
||||
(
|
||||
format!(
|
||||
|
||||
@@ -22,7 +22,7 @@ pub(in crate::http) async fn configure(
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
request: Json<ConfigurationRequest>,
|
||||
) -> Response {
|
||||
let pspec = match ParsedSpec::try_from(request.spec.clone()) {
|
||||
let pspec = match ParsedSpec::try_from(request.0.spec) {
|
||||
Ok(p) => p,
|
||||
Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
|
||||
};
|
||||
|
||||
@@ -2,6 +2,7 @@ use crate::compute_prewarm::LfcPrewarmStateWithProgress;
|
||||
use crate::http::JsonResponse;
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum::{Json, http::StatusCode};
|
||||
use axum_extra::extract::OptionalQuery;
|
||||
use compute_api::responses::LfcOffloadState;
|
||||
type Compute = axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>;
|
||||
|
||||
@@ -16,8 +17,16 @@ pub(in crate::http) async fn offload_state(compute: Compute) -> Json<LfcOffloadS
|
||||
Json(compute.lfc_offload_state())
|
||||
}
|
||||
|
||||
pub(in crate::http) async fn prewarm(compute: Compute) -> Response {
|
||||
if compute.prewarm_lfc() {
|
||||
#[derive(serde::Deserialize)]
|
||||
pub struct PrewarmQuery {
|
||||
pub from_endpoint: String,
|
||||
}
|
||||
|
||||
pub(in crate::http) async fn prewarm(
|
||||
compute: Compute,
|
||||
OptionalQuery(query): OptionalQuery<PrewarmQuery>,
|
||||
) -> Response {
|
||||
if compute.prewarm_lfc(query.map(|q| q.from_endpoint)) {
|
||||
StatusCode::ACCEPTED.into_response()
|
||||
} else {
|
||||
JsonResponse::error(
|
||||
|
||||
@@ -13,6 +13,12 @@ use crate::metrics::{PG_CURR_DOWNTIME_MS, PG_TOTAL_DOWNTIME_MS};
|
||||
|
||||
const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
|
||||
|
||||
/// Struct to store runtime state of the compute monitor thread.
|
||||
/// In theory, this could be a part of `Compute`, but i)
|
||||
/// this state is expected to be accessed only by single thread,
|
||||
/// so we don't need to care about locking; ii) `Compute` is
|
||||
/// already quite big. Thus, it seems to be a good idea to keep
|
||||
/// all the activity/health monitoring parts here.
|
||||
struct ComputeMonitor {
|
||||
compute: Arc<ComputeNode>,
|
||||
|
||||
@@ -70,12 +76,36 @@ impl ComputeMonitor {
|
||||
)
|
||||
}
|
||||
|
||||
/// Check if compute is in some terminal or soon-to-be-terminal
|
||||
/// state, then return `true`, signalling the caller that it
|
||||
/// should exit gracefully. Otherwise, return `false`.
|
||||
fn check_interrupts(&mut self) -> bool {
|
||||
let compute_status = self.compute.get_status();
|
||||
if matches!(
|
||||
compute_status,
|
||||
ComputeStatus::Terminated | ComputeStatus::TerminationPending | ComputeStatus::Failed
|
||||
) {
|
||||
info!(
|
||||
"compute is in {} status, stopping compute monitor",
|
||||
compute_status
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Spin in a loop and figure out the last activity time in the Postgres.
|
||||
/// Then update it in the shared state. This function never errors out.
|
||||
/// Then update it in the shared state. This function currently never
|
||||
/// errors out explicitly, but there is a graceful termination path.
|
||||
/// Every time we receive an error trying to check Postgres, we use
|
||||
/// [`ComputeMonitor::check_interrupts()`] because it could be that
|
||||
/// compute is being terminated already, then we can exit gracefully
|
||||
/// to not produce errors' noise in the log.
|
||||
/// NB: the only expected panic is at `Mutex` unwrap(), all other errors
|
||||
/// should be handled gracefully.
|
||||
#[instrument(skip_all)]
|
||||
pub fn run(&mut self) {
|
||||
pub fn run(&mut self) -> anyhow::Result<()> {
|
||||
// Suppose that `connstr` doesn't change
|
||||
let connstr = self.compute.params.connstr.clone();
|
||||
let conf = self
|
||||
@@ -93,6 +123,10 @@ impl ComputeMonitor {
|
||||
info!("starting compute monitor for {}", connstr);
|
||||
|
||||
loop {
|
||||
if self.check_interrupts() {
|
||||
break;
|
||||
}
|
||||
|
||||
match &mut client {
|
||||
Ok(cli) => {
|
||||
if cli.is_closed() {
|
||||
@@ -100,6 +134,10 @@ impl ComputeMonitor {
|
||||
downtime_info = self.downtime_info(),
|
||||
"connection to Postgres is closed, trying to reconnect"
|
||||
);
|
||||
if self.check_interrupts() {
|
||||
break;
|
||||
}
|
||||
|
||||
self.report_down();
|
||||
|
||||
// Connection is closed, reconnect and try again.
|
||||
@@ -111,15 +149,19 @@ impl ComputeMonitor {
|
||||
self.compute.update_last_active(self.last_active);
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
downtime_info = self.downtime_info(),
|
||||
"could not check Postgres: {}", e
|
||||
);
|
||||
if self.check_interrupts() {
|
||||
break;
|
||||
}
|
||||
|
||||
// Although we have many places where we can return errors in `check()`,
|
||||
// normally it shouldn't happen. I.e., we will likely return error if
|
||||
// connection got broken, query timed out, Postgres returned invalid data, etc.
|
||||
// In all such cases it's suspicious, so let's report this as downtime.
|
||||
self.report_down();
|
||||
error!(
|
||||
downtime_info = self.downtime_info(),
|
||||
"could not check Postgres: {}", e
|
||||
);
|
||||
|
||||
// Reconnect to Postgres just in case. During tests, I noticed
|
||||
// that queries in `check()` can fail with `connection closed`,
|
||||
@@ -136,6 +178,10 @@ impl ComputeMonitor {
|
||||
downtime_info = self.downtime_info(),
|
||||
"could not connect to Postgres: {}, retrying", e
|
||||
);
|
||||
if self.check_interrupts() {
|
||||
break;
|
||||
}
|
||||
|
||||
self.report_down();
|
||||
|
||||
// Establish a new connection and try again.
|
||||
@@ -147,6 +193,9 @@ impl ComputeMonitor {
|
||||
self.last_checked = Utc::now();
|
||||
thread::sleep(MONITOR_CHECK_INTERVAL);
|
||||
}
|
||||
|
||||
// Graceful termination path
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
@@ -429,7 +478,10 @@ pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
|
||||
.spawn(move || {
|
||||
let span = span!(Level::INFO, "compute_monitor");
|
||||
let _enter = span.enter();
|
||||
monitor.run();
|
||||
match monitor.run() {
|
||||
Ok(_) => info!("compute monitor thread terminated gracefully"),
|
||||
Err(err) => error!("compute monitor thread terminated abnormally {:?}", err),
|
||||
}
|
||||
})
|
||||
.expect("cannot launch compute monitor thread")
|
||||
}
|
||||
|
||||
@@ -36,6 +36,7 @@ pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
safekeeper_api.workspace = true
|
||||
safekeeper_client.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
storage_broker.workspace = true
|
||||
http-utils.workspace = true
|
||||
|
||||
@@ -18,7 +18,7 @@ use clap::Parser;
|
||||
use compute_api::requests::ComputeClaimsScope;
|
||||
use compute_api::spec::ComputeMode;
|
||||
use control_plane::broker::StorageBroker;
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
use control_plane::endpoint::{ComputeControlPlane, PageserverProtocol};
|
||||
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
|
||||
use control_plane::local_env;
|
||||
use control_plane::local_env::{
|
||||
@@ -45,7 +45,7 @@ use pageserver_api::models::{
|
||||
use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::parse_host_port;
|
||||
use safekeeper_api::membership::SafekeeperGeneration;
|
||||
use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId};
|
||||
use safekeeper_api::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
|
||||
@@ -605,6 +605,14 @@ struct EndpointCreateCmdArgs {
|
||||
#[clap(long, help = "Postgres version")]
|
||||
pg_version: u32,
|
||||
|
||||
/// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings.
|
||||
///
|
||||
/// Specified on creation such that it's retained across reconfiguration and restarts.
|
||||
///
|
||||
/// NB: not yet supported by computes.
|
||||
#[clap(long)]
|
||||
grpc: bool,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "If set, the node will be a hot replica on the specified timeline",
|
||||
@@ -1255,6 +1263,45 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
|
||||
pageserver
|
||||
.timeline_import(tenant_id, timeline_id, base, pg_wal, args.pg_version)
|
||||
.await?;
|
||||
if env.storage_controller.timelines_onto_safekeepers {
|
||||
println!("Creating timeline on safekeeper ...");
|
||||
let timeline_info = pageserver
|
||||
.timeline_info(
|
||||
TenantShardId::unsharded(tenant_id),
|
||||
timeline_id,
|
||||
pageserver_client::mgmt_api::ForceAwaitLogicalSize::No,
|
||||
)
|
||||
.await?;
|
||||
let default_sk = SafekeeperNode::from_env(env, env.safekeepers.first().unwrap());
|
||||
let default_host = default_sk
|
||||
.conf
|
||||
.listen_addr
|
||||
.clone()
|
||||
.unwrap_or_else(|| "localhost".to_string());
|
||||
let mconf = safekeeper_api::membership::Configuration {
|
||||
generation: SafekeeperGeneration::new(1),
|
||||
members: safekeeper_api::membership::MemberSet {
|
||||
m: vec![SafekeeperId {
|
||||
host: default_host,
|
||||
id: default_sk.conf.id,
|
||||
pg_port: default_sk.conf.pg_port,
|
||||
}],
|
||||
},
|
||||
new_members: None,
|
||||
};
|
||||
let pg_version = args.pg_version * 10000;
|
||||
let req = safekeeper_api::models::TimelineCreateRequest {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
mconf,
|
||||
pg_version,
|
||||
system_id: None,
|
||||
wal_seg_size: None,
|
||||
start_lsn: timeline_info.last_record_lsn,
|
||||
commit_lsn: None,
|
||||
};
|
||||
default_sk.create_timeline(&req).await?;
|
||||
}
|
||||
env.register_branch_mapping(branch_name.to_string(), tenant_id, timeline_id)?;
|
||||
println!("Done");
|
||||
}
|
||||
@@ -1412,6 +1459,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
args.internal_http_port,
|
||||
args.pg_version,
|
||||
mode,
|
||||
args.grpc,
|
||||
!args.update_catalog,
|
||||
false,
|
||||
)?;
|
||||
@@ -1452,13 +1500,20 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
|
||||
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
|
||||
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
|
||||
let parsed = parse_host_port(&conf.listen_pg_addr).expect("Bad config");
|
||||
(
|
||||
vec![(parsed.0, parsed.1.unwrap_or(5432))],
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// full managed by storage controller, therefore not sharded.
|
||||
DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
// Use gRPC if requested.
|
||||
let pageserver = if endpoint.grpc {
|
||||
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
|
||||
let (host, port) = parse_host_port(grpc_addr)?;
|
||||
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
|
||||
(PageserverProtocol::Grpc, host, port)
|
||||
} else {
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
|
||||
let port = port.unwrap_or(5432);
|
||||
(PageserverProtocol::Libpq, host, port)
|
||||
};
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// fully managed by storage controller, therefore not sharded.
|
||||
(vec![pageserver], DEFAULT_STRIPE_SIZE)
|
||||
} else {
|
||||
// Look up the currently attached location of the tenant, and its striping metadata,
|
||||
// to pass these on to postgres.
|
||||
@@ -1477,11 +1532,20 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.await?;
|
||||
}
|
||||
|
||||
anyhow::Ok((
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Storage controller reported bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
))
|
||||
let pageserver = if endpoint.grpc {
|
||||
(
|
||||
PageserverProtocol::Grpc,
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))?,
|
||||
shard.listen_grpc_port.expect("no gRPC port"),
|
||||
)
|
||||
} else {
|
||||
(
|
||||
PageserverProtocol::Libpq,
|
||||
Host::parse(&shard.listen_pg_addr)?,
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
};
|
||||
anyhow::Ok(pageserver)
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
@@ -1536,11 +1600,19 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.get(endpoint_id.as_str())
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
|
||||
let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?);
|
||||
vec![(
|
||||
pageserver.pg_connection_config.host().clone(),
|
||||
pageserver.pg_connection_config.port(),
|
||||
)]
|
||||
let conf = env.get_pageserver_conf(ps_id)?;
|
||||
// Use gRPC if requested.
|
||||
let pageserver = if endpoint.grpc {
|
||||
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
|
||||
let (host, port) = parse_host_port(grpc_addr)?;
|
||||
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
|
||||
(PageserverProtocol::Grpc, host, port)
|
||||
} else {
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
|
||||
let port = port.unwrap_or(5432);
|
||||
(PageserverProtocol::Libpq, host, port)
|
||||
};
|
||||
vec![pageserver]
|
||||
} else {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
@@ -1549,11 +1621,21 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Storage controller reported malformed host"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
// Use gRPC if requested.
|
||||
if endpoint.grpc {
|
||||
(
|
||||
PageserverProtocol::Grpc,
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))
|
||||
.expect("bad hostname"),
|
||||
shard.listen_grpc_port.expect("no gRPC port"),
|
||||
)
|
||||
} else {
|
||||
(
|
||||
PageserverProtocol::Libpq,
|
||||
Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -37,6 +37,7 @@
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt::Display;
|
||||
use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream};
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
@@ -45,6 +46,8 @@ use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{Context, Result, anyhow, bail};
|
||||
use base64::Engine;
|
||||
use base64::prelude::BASE64_URL_SAFE_NO_PAD;
|
||||
use compute_api::requests::{
|
||||
COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope, ConfigurationRequest,
|
||||
};
|
||||
@@ -74,7 +77,6 @@ use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
use crate::storage_controller::StorageController;
|
||||
|
||||
// contents of a endpoint.json file
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
@@ -87,6 +89,7 @@ pub struct EndpointConf {
|
||||
external_http_port: u16,
|
||||
internal_http_port: u16,
|
||||
pg_version: u32,
|
||||
grpc: bool,
|
||||
skip_pg_catalog_updates: bool,
|
||||
reconfigure_concurrency: usize,
|
||||
drop_subscriptions_before_start: bool,
|
||||
@@ -164,7 +167,7 @@ impl ComputeControlPlane {
|
||||
public_key_use: Some(PublicKeyUse::Signature),
|
||||
key_operations: Some(vec![KeyOperations::Verify]),
|
||||
key_algorithm: Some(KeyAlgorithm::EdDSA),
|
||||
key_id: Some(base64::encode_config(key_hash, base64::URL_SAFE_NO_PAD)),
|
||||
key_id: Some(BASE64_URL_SAFE_NO_PAD.encode(key_hash)),
|
||||
x509_url: None::<String>,
|
||||
x509_chain: None::<Vec<String>>,
|
||||
x509_sha1_fingerprint: None::<String>,
|
||||
@@ -173,7 +176,7 @@ impl ComputeControlPlane {
|
||||
algorithm: AlgorithmParameters::OctetKeyPair(OctetKeyPairParameters {
|
||||
key_type: OctetKeyPairType::OctetKeyPair,
|
||||
curve: EllipticCurve::Ed25519,
|
||||
x: base64::encode_config(public_key, base64::URL_SAFE_NO_PAD),
|
||||
x: BASE64_URL_SAFE_NO_PAD.encode(public_key),
|
||||
}),
|
||||
}],
|
||||
})
|
||||
@@ -190,6 +193,7 @@ impl ComputeControlPlane {
|
||||
internal_http_port: Option<u16>,
|
||||
pg_version: u32,
|
||||
mode: ComputeMode,
|
||||
grpc: bool,
|
||||
skip_pg_catalog_updates: bool,
|
||||
drop_subscriptions_before_start: bool,
|
||||
) -> Result<Arc<Endpoint>> {
|
||||
@@ -224,6 +228,7 @@ impl ComputeControlPlane {
|
||||
// we also skip catalog updates in the cloud.
|
||||
skip_pg_catalog_updates,
|
||||
drop_subscriptions_before_start,
|
||||
grpc,
|
||||
reconfigure_concurrency: 1,
|
||||
features: vec![],
|
||||
cluster: None,
|
||||
@@ -242,6 +247,7 @@ impl ComputeControlPlane {
|
||||
internal_http_port,
|
||||
pg_port,
|
||||
pg_version,
|
||||
grpc,
|
||||
skip_pg_catalog_updates,
|
||||
drop_subscriptions_before_start,
|
||||
reconfigure_concurrency: 1,
|
||||
@@ -296,6 +302,8 @@ pub struct Endpoint {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub mode: ComputeMode,
|
||||
/// If true, the endpoint should use gRPC to communicate with Pageservers.
|
||||
pub grpc: bool,
|
||||
|
||||
// port and address of the Postgres server and `compute_ctl`'s HTTP APIs
|
||||
pub pg_address: SocketAddr,
|
||||
@@ -331,7 +339,7 @@ pub enum EndpointStatus {
|
||||
RunningNoPidfile,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for EndpointStatus {
|
||||
impl Display for EndpointStatus {
|
||||
fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
let s = match self {
|
||||
Self::Running => "running",
|
||||
@@ -343,6 +351,29 @@ impl std::fmt::Display for EndpointStatus {
|
||||
}
|
||||
}
|
||||
|
||||
/// Protocol used to connect to a Pageserver.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum PageserverProtocol {
|
||||
Libpq,
|
||||
Grpc,
|
||||
}
|
||||
|
||||
impl PageserverProtocol {
|
||||
/// Returns the URL scheme for the protocol, used in connstrings.
|
||||
pub fn scheme(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Libpq => "postgresql",
|
||||
Self::Grpc => "grpc",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for PageserverProtocol {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.scheme())
|
||||
}
|
||||
}
|
||||
|
||||
impl Endpoint {
|
||||
fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
|
||||
if !entry.file_type()?.is_dir() {
|
||||
@@ -378,6 +409,7 @@ impl Endpoint {
|
||||
mode: conf.mode,
|
||||
tenant_id: conf.tenant_id,
|
||||
pg_version: conf.pg_version,
|
||||
grpc: conf.grpc,
|
||||
skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
|
||||
reconfigure_concurrency: conf.reconfigure_concurrency,
|
||||
drop_subscriptions_before_start: conf.drop_subscriptions_before_start,
|
||||
@@ -606,10 +638,10 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
fn build_pageserver_connstr(pageservers: &[(Host, u16)]) -> String {
|
||||
fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
|
||||
pageservers
|
||||
.iter()
|
||||
.map(|(host, port)| format!("postgresql://no_user@{host}:{port}"))
|
||||
.map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}"))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
}
|
||||
@@ -654,7 +686,7 @@ impl Endpoint {
|
||||
endpoint_storage_addr: String,
|
||||
safekeepers_generation: Option<SafekeeperGeneration>,
|
||||
safekeepers: Vec<NodeId>,
|
||||
pageservers: Vec<(Host, u16)>,
|
||||
pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
remote_ext_base_url: Option<&String>,
|
||||
shard_stripe_size: usize,
|
||||
create_test_user: bool,
|
||||
@@ -939,10 +971,12 @@ impl Endpoint {
|
||||
|
||||
pub async fn reconfigure(
|
||||
&self,
|
||||
mut pageservers: Vec<(Host, u16)>,
|
||||
pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
safekeepers: Option<Vec<NodeId>>,
|
||||
) -> Result<()> {
|
||||
anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
|
||||
|
||||
let (mut spec, compute_ctl_config) = {
|
||||
let config_path = self.endpoint_path().join("config.json");
|
||||
let file = std::fs::File::open(config_path)?;
|
||||
@@ -954,25 +988,7 @@ impl Endpoint {
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
|
||||
// If we weren't given explicit pageservers, query the storage controller
|
||||
if pageservers.is_empty() {
|
||||
let storage_controller = StorageController::from_env(&self.env);
|
||||
let locate_result = storage_controller.tenant_locate(self.tenant_id).await?;
|
||||
pageservers = locate_result
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Storage controller reported bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
}
|
||||
|
||||
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
|
||||
assert!(!pageserver_connstr.is_empty());
|
||||
spec.pageserver_connstring = Some(pageserver_connstr);
|
||||
if stripe_size.is_some() {
|
||||
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
|
||||
|
||||
@@ -16,6 +16,7 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, bail};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::config::{DEFAULT_GRPC_LISTEN_PORT, DEFAULT_HTTP_LISTEN_PORT};
|
||||
use pageserver_api::models::{self, TenantInfo, TimelineInfo};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
@@ -252,9 +253,10 @@ impl PageServerNode {
|
||||
// the storage controller
|
||||
let metadata_path = datadir.join("metadata.json");
|
||||
|
||||
let (_http_host, http_port) =
|
||||
let http_host = "localhost".to_string();
|
||||
let (_, http_port) =
|
||||
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
|
||||
let http_port = http_port.unwrap_or(9898);
|
||||
let http_port = http_port.unwrap_or(DEFAULT_HTTP_LISTEN_PORT);
|
||||
|
||||
let https_port = match self.conf.listen_https_addr.as_ref() {
|
||||
Some(https_addr) => {
|
||||
@@ -265,6 +267,13 @@ impl PageServerNode {
|
||||
None => None,
|
||||
};
|
||||
|
||||
let (mut grpc_host, mut grpc_port) = (None, None);
|
||||
if let Some(grpc_addr) = &self.conf.listen_grpc_addr {
|
||||
let (_, port) = parse_host_port(grpc_addr).expect("Unable to parse listen_grpc_addr");
|
||||
grpc_host = Some("localhost".to_string());
|
||||
grpc_port = Some(port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT));
|
||||
}
|
||||
|
||||
// Intentionally hand-craft JSON: this acts as an implicit format compat test
|
||||
// in case the pageserver-side structure is edited, and reflects the real life
|
||||
// situation: the metadata is written by some other script.
|
||||
@@ -273,7 +282,9 @@ impl PageServerNode {
|
||||
serde_json::to_vec(&pageserver_api::config::NodeMetadata {
|
||||
postgres_host: "localhost".to_string(),
|
||||
postgres_port: self.pg_connection_config.port(),
|
||||
http_host: "localhost".to_string(),
|
||||
grpc_host,
|
||||
grpc_port,
|
||||
http_host,
|
||||
http_port,
|
||||
https_port,
|
||||
other: HashMap::from([(
|
||||
@@ -513,11 +524,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'timeline_offloading' as bool")?,
|
||||
wal_receiver_protocol_override: settings
|
||||
.remove("wal_receiver_protocol_override")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `wal_receiver_protocol_override` from json")?,
|
||||
rel_size_v2_enabled: settings
|
||||
.remove("rel_size_v2_enabled")
|
||||
.map(|x| x.parse::<bool>())
|
||||
@@ -640,4 +646,16 @@ impl PageServerNode {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
pub async fn timeline_info(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
force_await_logical_size: mgmt_api::ForceAwaitLogicalSize,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
let timeline_info = self
|
||||
.http_client
|
||||
.timeline_info(tenant_shard_id, timeline_id, force_await_logical_size)
|
||||
.await?;
|
||||
Ok(timeline_info)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//! ```
|
||||
use std::error::Error as _;
|
||||
use std::future::Future;
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
@@ -14,9 +13,9 @@ use std::{io, result};
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use http_utils::error::HttpErrorBody;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use safekeeper_api::models::TimelineCreateRequest;
|
||||
use safekeeper_client::mgmt_api;
|
||||
use thiserror::Error;
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::id::NodeId;
|
||||
@@ -35,25 +34,14 @@ pub enum SafekeeperHttpError {
|
||||
|
||||
type Result<T> = result::Result<T, SafekeeperHttpError>;
|
||||
|
||||
pub(crate) trait ResponseErrorMessageExt: Sized {
|
||||
fn error_from_body(self) -> impl Future<Output = Result<Self>> + Send;
|
||||
}
|
||||
|
||||
impl ResponseErrorMessageExt for reqwest::Response {
|
||||
async fn error_from_body(self) -> Result<Self> {
|
||||
let status = self.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
return Ok(self);
|
||||
}
|
||||
|
||||
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
|
||||
let url = self.url().to_owned();
|
||||
Err(SafekeeperHttpError::Response(
|
||||
match self.json::<HttpErrorBody>().await {
|
||||
Ok(err_body) => format!("Error: {}", err_body.msg),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
},
|
||||
))
|
||||
fn err_from_client_err(err: mgmt_api::Error) -> SafekeeperHttpError {
|
||||
use mgmt_api::Error::*;
|
||||
match err {
|
||||
ApiError(_, str) => SafekeeperHttpError::Response(str),
|
||||
Cancelled => SafekeeperHttpError::Response("Cancelled".to_owned()),
|
||||
ReceiveBody(err) => SafekeeperHttpError::Transport(err),
|
||||
ReceiveErrorBody(err) => SafekeeperHttpError::Response(err),
|
||||
Timeout(str) => SafekeeperHttpError::Response(format!("timeout: {str}")),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,9 +58,8 @@ pub struct SafekeeperNode {
|
||||
|
||||
pub pg_connection_config: PgConnectionConfig,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: reqwest::Client,
|
||||
pub http_client: mgmt_api::Client,
|
||||
pub listen_addr: String,
|
||||
pub http_base_url: String,
|
||||
}
|
||||
|
||||
impl SafekeeperNode {
|
||||
@@ -82,13 +69,14 @@ impl SafekeeperNode {
|
||||
} else {
|
||||
"127.0.0.1".to_string()
|
||||
};
|
||||
let jwt = None;
|
||||
let http_base_url = format!("http://{}:{}", listen_addr, conf.http_port);
|
||||
SafekeeperNode {
|
||||
id: conf.id,
|
||||
conf: conf.clone(),
|
||||
pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
|
||||
env: env.clone(),
|
||||
http_client: env.create_http_client(),
|
||||
http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
|
||||
http_client: mgmt_api::Client::new(env.create_http_client(), http_base_url, jwt),
|
||||
listen_addr,
|
||||
}
|
||||
}
|
||||
@@ -278,20 +266,19 @@ impl SafekeeperNode {
|
||||
)
|
||||
}
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> reqwest::RequestBuilder {
|
||||
// TODO: authentication
|
||||
//if self.env.auth_type == AuthType::NeonJWT {
|
||||
// builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
|
||||
//}
|
||||
self.http_client.request(method, url)
|
||||
pub async fn check_status(&self) -> Result<()> {
|
||||
self.http_client
|
||||
.status()
|
||||
.await
|
||||
.map_err(err_from_client_err)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn check_status(&self) -> Result<()> {
|
||||
self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
|
||||
.send()
|
||||
.await?
|
||||
.error_from_body()
|
||||
.await?;
|
||||
pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result<()> {
|
||||
self.http_client
|
||||
.create_timeline(req)
|
||||
.await
|
||||
.map_err(err_from_client_err)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,6 +36,10 @@ enum Command {
|
||||
listen_pg_addr: String,
|
||||
#[arg(long)]
|
||||
listen_pg_port: u16,
|
||||
#[arg(long)]
|
||||
listen_grpc_addr: Option<String>,
|
||||
#[arg(long)]
|
||||
listen_grpc_port: Option<u16>,
|
||||
|
||||
#[arg(long)]
|
||||
listen_http_addr: String,
|
||||
@@ -61,10 +65,16 @@ enum Command {
|
||||
#[arg(long)]
|
||||
scheduling: Option<NodeSchedulingPolicy>,
|
||||
},
|
||||
// Set a node status as deleted.
|
||||
NodeDelete {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Delete a tombstone of node from the storage controller.
|
||||
NodeDeleteTombstone {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Modify a tenant's policies in the storage controller
|
||||
TenantPolicy {
|
||||
#[arg(long)]
|
||||
@@ -82,6 +92,8 @@ enum Command {
|
||||
},
|
||||
/// List nodes known to the storage controller
|
||||
Nodes {},
|
||||
/// List soft deleted nodes known to the storage controller
|
||||
NodeTombstones {},
|
||||
/// List tenants known to the storage controller
|
||||
Tenants {
|
||||
/// If this field is set, it will list the tenants on a specific node
|
||||
@@ -410,6 +422,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
node_id,
|
||||
listen_pg_addr,
|
||||
listen_pg_port,
|
||||
listen_grpc_addr,
|
||||
listen_grpc_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
listen_https_port,
|
||||
@@ -423,6 +437,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
node_id,
|
||||
listen_pg_addr,
|
||||
listen_pg_port,
|
||||
listen_grpc_addr,
|
||||
listen_grpc_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
listen_https_port,
|
||||
@@ -900,6 +916,39 @@ async fn main() -> anyhow::Result<()> {
|
||||
.dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
|
||||
.await?;
|
||||
}
|
||||
Command::NodeDeleteTombstone { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::DELETE,
|
||||
format!("debug/v1/tombstone/{node_id}"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::NodeTombstones {} => {
|
||||
let mut resp = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"debug/v1/tombstone".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
|
||||
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]);
|
||||
for node in resp {
|
||||
table.add_row([
|
||||
format!("{}", node.id),
|
||||
node.listen_http_addr,
|
||||
node.availability_zone_id,
|
||||
format!("{:?}", node.scheduling),
|
||||
format!("{:?}", node.availability),
|
||||
]);
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
Command::TenantSetTimeBasedEviction {
|
||||
tenant_id,
|
||||
period,
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
set -eux
|
||||
|
||||
# Generate a random tenant or timeline ID
|
||||
#
|
||||
# Takes a variable name as argument. The result is stored in that variable.
|
||||
generate_id() {
|
||||
local -n resvar=$1
|
||||
printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
|
||||
local -n resvar=${1}
|
||||
printf -v resvar '%08x%08x%08x%08x' ${SRANDOM} ${SRANDOM} ${SRANDOM} ${SRANDOM}
|
||||
}
|
||||
|
||||
PG_VERSION=${PG_VERSION:-14}
|
||||
|
||||
CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
|
||||
CONFIG_FILE=/tmp/config.json
|
||||
readonly CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
|
||||
readonly CONFIG_FILE=/tmp/config.json
|
||||
|
||||
# Test that the first library path that the dynamic loader looks in is the path
|
||||
# that we use for custom compiled software
|
||||
@@ -20,17 +20,17 @@ first_path="$(ldconfig --verbose 2>/dev/null \
|
||||
| grep --invert-match ^$'\t' \
|
||||
| cut --delimiter=: --fields=1 \
|
||||
| head --lines=1)"
|
||||
test "$first_path" == '/usr/local/lib'
|
||||
test "${first_path}" = '/usr/local/lib'
|
||||
|
||||
echo "Waiting pageserver become ready."
|
||||
while ! nc -z pageserver 6400; do
|
||||
sleep 1;
|
||||
sleep 1
|
||||
done
|
||||
echo "Page server is ready."
|
||||
|
||||
cp ${CONFIG_FILE_ORG} ${CONFIG_FILE}
|
||||
cp "${CONFIG_FILE_ORG}" "${CONFIG_FILE}"
|
||||
|
||||
if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
|
||||
if [[ -n "${TENANT_ID:-}" && -n "${TIMELINE_ID:-}" ]]; then
|
||||
tenant_id=${TENANT_ID}
|
||||
timeline_id=${TIMELINE_ID}
|
||||
else
|
||||
@@ -41,7 +41,7 @@ else
|
||||
"http://pageserver:9898/v1/tenant"
|
||||
)
|
||||
tenant_id=$(curl "${PARAMS[@]}" | jq -r .[0].id)
|
||||
if [ -z "${tenant_id}" ] || [ "${tenant_id}" = null ]; then
|
||||
if [[ -z "${tenant_id}" || "${tenant_id}" = null ]]; then
|
||||
echo "Create a tenant"
|
||||
generate_id tenant_id
|
||||
PARAMS=(
|
||||
@@ -51,7 +51,7 @@ else
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/location_config"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
printf '%s\n' "${result}" | jq .
|
||||
fi
|
||||
|
||||
echo "Check if a timeline present"
|
||||
@@ -61,7 +61,7 @@ else
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
|
||||
)
|
||||
timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
|
||||
if [ -z "${timeline_id}" ] || [ "${timeline_id}" = null ]; then
|
||||
if [[ -z "${timeline_id}" || "${timeline_id}" = null ]]; then
|
||||
generate_id timeline_id
|
||||
PARAMS=(
|
||||
-sbf
|
||||
@@ -71,7 +71,7 @@ else
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
printf '%s\n' "${result}" | jq .
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -82,10 +82,10 @@ else
|
||||
fi
|
||||
echo "Adding pgx_ulid"
|
||||
shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE})
|
||||
sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE}
|
||||
sed -i "s|${shared_libraries}|${shared_libraries},${ulid_extension}|" ${CONFIG_FILE}
|
||||
echo "Overwrite tenant id and timeline id in spec file"
|
||||
sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE}
|
||||
sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}
|
||||
sed -i "s|TENANT_ID|${tenant_id}|" ${CONFIG_FILE}
|
||||
sed -i "s|TIMELINE_ID|${timeline_id}|" ${CONFIG_FILE}
|
||||
|
||||
cat ${CONFIG_FILE}
|
||||
|
||||
@@ -93,5 +93,5 @@ echo "Start compute node"
|
||||
/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
|
||||
-C "postgresql://cloud_admin@localhost:55433/postgres" \
|
||||
-b /usr/local/bin/postgres \
|
||||
--compute-id "compute-$RANDOM" \
|
||||
--config "$CONFIG_FILE"
|
||||
--compute-id "compute-${RANDOM}" \
|
||||
--config "${CONFIG_FILE}"
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
#!/bin/bash
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname "$0")"
|
||||
if [[ ${PG_VERSION} = v17 ]]; then
|
||||
sed -i '/computed_columns/d' regress/core/tests.mk
|
||||
fi
|
||||
patch -p1 <postgis-no-upgrade-test.patch
|
||||
trap 'echo Cleaning up; patch -R -p1 <postgis-no-upgrade-test.patch' EXIT
|
||||
patch -p1 <"postgis-common-${PG_VERSION}.patch"
|
||||
trap 'echo Cleaning up; patch -R -p1 <postgis-common-${PG_VERSION}.patch' EXIT
|
||||
make installcheck-base
|
||||
@@ -1,3 +1,19 @@
|
||||
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
|
||||
index 3abd7bc..64a9254 100644
|
||||
--- a/regress/core/tests.mk
|
||||
+++ b/regress/core/tests.mk
|
||||
@@ -144,11 +144,6 @@ TESTS_SLOW = \
|
||||
$(top_srcdir)/regress/core/concave_hull_hard \
|
||||
$(top_srcdir)/regress/core/knn_recheck
|
||||
|
||||
-ifeq ($(shell expr "$(POSTGIS_PGSQL_VERSION)" ">=" 120),1)
|
||||
- TESTS += \
|
||||
- $(top_srcdir)/regress/core/computed_columns
|
||||
-endif
|
||||
-
|
||||
ifeq ($(shell expr "$(POSTGIS_GEOS_VERSION)" ">=" 30700),1)
|
||||
# GEOS-3.7 adds:
|
||||
# ST_FrechetDistance
|
||||
diff --git a/regress/runtest.mk b/regress/runtest.mk
|
||||
index c051f03..010e493 100644
|
||||
--- a/regress/runtest.mk
|
||||
35
docker-compose/ext-src/postgis-src/postgis-common-v17.patch
Normal file
35
docker-compose/ext-src/postgis-src/postgis-common-v17.patch
Normal file
@@ -0,0 +1,35 @@
|
||||
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
|
||||
index 9e05244..90987df 100644
|
||||
--- a/regress/core/tests.mk
|
||||
+++ b/regress/core/tests.mk
|
||||
@@ -143,8 +143,7 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/oriented_envelope \
|
||||
$(top_srcdir)/regress/core/point_coordinates \
|
||||
$(top_srcdir)/regress/core/out_geojson \
|
||||
- $(top_srcdir)/regress/core/wrapx \
|
||||
- $(top_srcdir)/regress/core/computed_columns
|
||||
+ $(top_srcdir)/regress/core/wrapx
|
||||
|
||||
# Slow slow tests
|
||||
TESTS_SLOW = \
|
||||
diff --git a/regress/runtest.mk b/regress/runtest.mk
|
||||
index 4b95b7e..449d5a2 100644
|
||||
--- a/regress/runtest.mk
|
||||
+++ b/regress/runtest.mk
|
||||
@@ -24,16 +24,6 @@ check-regress:
|
||||
|
||||
@POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(RUNTESTFLAGS_INTERNAL) $(TESTS)
|
||||
|
||||
- @if echo "$(RUNTESTFLAGS)" | grep -vq -- --upgrade; then \
|
||||
- echo "Running upgrade test as RUNTESTFLAGS did not contain that"; \
|
||||
- POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl \
|
||||
- --upgrade \
|
||||
- $(RUNTESTFLAGS) \
|
||||
- $(RUNTESTFLAGS_INTERNAL) \
|
||||
- $(TESTS); \
|
||||
- else \
|
||||
- echo "Skipping upgrade test as RUNTESTFLAGS already requested upgrades"; \
|
||||
- fi
|
||||
|
||||
check-long:
|
||||
$(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(TESTS) $(TESTS_SLOW)
|
||||
@@ -125,7 +125,7 @@ index 7a36b65..ad78fc7 100644
|
||||
DROP SCHEMA tm CASCADE;
|
||||
+
|
||||
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
|
||||
index 3abd7bc..94903c3 100644
|
||||
index 64a9254..94903c3 100644
|
||||
--- a/regress/core/tests.mk
|
||||
+++ b/regress/core/tests.mk
|
||||
@@ -23,7 +23,6 @@ current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
@@ -160,18 +160,6 @@ index 3abd7bc..94903c3 100644
|
||||
$(top_srcdir)/regress/core/wkb \
|
||||
$(top_srcdir)/regress/core/wkt \
|
||||
$(top_srcdir)/regress/core/wmsservers \
|
||||
@@ -144,11 +140,6 @@ TESTS_SLOW = \
|
||||
$(top_srcdir)/regress/core/concave_hull_hard \
|
||||
$(top_srcdir)/regress/core/knn_recheck
|
||||
|
||||
-ifeq ($(shell expr "$(POSTGIS_PGSQL_VERSION)" ">=" 120),1)
|
||||
- TESTS += \
|
||||
- $(top_srcdir)/regress/core/computed_columns
|
||||
-endif
|
||||
-
|
||||
ifeq ($(shell expr "$(POSTGIS_GEOS_VERSION)" ">=" 30700),1)
|
||||
# GEOS-3.7 adds:
|
||||
# ST_FrechetDistance
|
||||
diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk
|
||||
index 1fc77ac..c3cb9de 100644
|
||||
--- a/regress/loader/tests.mk
|
||||
|
||||
@@ -125,7 +125,7 @@ index 7a36b65..ad78fc7 100644
|
||||
DROP SCHEMA tm CASCADE;
|
||||
+
|
||||
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
|
||||
index 9e05244..a63a3e1 100644
|
||||
index 90987df..74fe3f1 100644
|
||||
--- a/regress/core/tests.mk
|
||||
+++ b/regress/core/tests.mk
|
||||
@@ -16,14 +16,13 @@ POSTGIS_PGSQL_VERSION=170
|
||||
@@ -168,16 +168,6 @@ index 9e05244..a63a3e1 100644
|
||||
$(top_srcdir)/regress/core/wkb \
|
||||
$(top_srcdir)/regress/core/wkt \
|
||||
$(top_srcdir)/regress/core/wmsservers \
|
||||
@@ -143,8 +139,7 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/oriented_envelope \
|
||||
$(top_srcdir)/regress/core/point_coordinates \
|
||||
$(top_srcdir)/regress/core/out_geojson \
|
||||
- $(top_srcdir)/regress/core/wrapx \
|
||||
- $(top_srcdir)/regress/core/computed_columns
|
||||
+ $(top_srcdir)/regress/core/wrapx
|
||||
|
||||
# Slow slow tests
|
||||
TESTS_SLOW = \
|
||||
diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk
|
||||
index ac4f8ad..4bad4fc 100644
|
||||
--- a/regress/loader/tests.mk
|
||||
|
||||
@@ -10,8 +10,8 @@ psql -d contrib_regression -c "ALTER DATABASE contrib_regression SET TimeZone='U
|
||||
-c "CREATE EXTENSION postgis_tiger_geocoder CASCADE" \
|
||||
-c "CREATE EXTENSION postgis_raster SCHEMA public" \
|
||||
-c "CREATE EXTENSION postgis_sfcgal SCHEMA public"
|
||||
patch -p1 <postgis-no-upgrade-test.patch
|
||||
patch -p1 <"postgis-common-${PG_VERSION}.patch"
|
||||
patch -p1 <"postgis-regular-${PG_VERSION}.patch"
|
||||
psql -d contrib_regression -f raster_outdb_template.sql
|
||||
trap 'patch -R -p1 <postgis-no-upgrade-test.patch && patch -R -p1 <"postgis-regular-${PG_VERSION}.patch"' EXIT
|
||||
trap 'patch -R -p1 <postgis-regular-${PG_VERSION}.patch && patch -R -p1 <"postgis-common-${PG_VERSION}.patch"' EXIT
|
||||
POSTGIS_REGRESS_DB=contrib_regression RUNTESTFLAGS=--nocreate make installcheck-base
|
||||
@@ -8,6 +8,7 @@ anyhow.workspace = true
|
||||
axum-extra.workspace = true
|
||||
axum.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
futures.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
prometheus.workspace = true
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
//! for large computes.
|
||||
mod app;
|
||||
use anyhow::Context;
|
||||
use clap::Parser;
|
||||
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
|
||||
use tracing::info;
|
||||
use utils::logging;
|
||||
|
||||
@@ -12,9 +14,26 @@ const fn max_upload_file_limit() -> usize {
|
||||
100 * 1024 * 1024
|
||||
}
|
||||
|
||||
const fn listen() -> SocketAddr {
|
||||
SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 51243)
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
struct Args {
|
||||
#[arg(exclusive = true)]
|
||||
config_file: Option<String>,
|
||||
#[arg(long, default_value = "false", requires = "config")]
|
||||
/// to allow testing k8s helm chart where we don't have s3 credentials
|
||||
no_s3_check_on_startup: bool,
|
||||
#[arg(long, value_name = "FILE")]
|
||||
/// inline config mode for k8s helm chart
|
||||
config: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
#[serde(tag = "type")]
|
||||
struct Config {
|
||||
#[serde(default = "listen")]
|
||||
listen: std::net::SocketAddr,
|
||||
pemfile: camino::Utf8PathBuf,
|
||||
#[serde(flatten)]
|
||||
@@ -31,13 +50,18 @@ async fn main() -> anyhow::Result<()> {
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
|
||||
let config: String = std::env::args().skip(1).take(1).collect();
|
||||
if config.is_empty() {
|
||||
anyhow::bail!("Usage: endpoint_storage config.json")
|
||||
}
|
||||
info!("Reading config from {config}");
|
||||
let config = std::fs::read_to_string(config.clone())?;
|
||||
let config: Config = serde_json::from_str(&config).context("parsing config")?;
|
||||
let args = Args::parse();
|
||||
let config: Config = if let Some(config_path) = args.config_file {
|
||||
info!("Reading config from {config_path}");
|
||||
let config = std::fs::read_to_string(config_path)?;
|
||||
serde_json::from_str(&config).context("parsing config")?
|
||||
} else if let Some(config) = args.config {
|
||||
info!("Reading inline config");
|
||||
serde_json::from_str(&config).context("parsing config")?
|
||||
} else {
|
||||
anyhow::bail!("Supply either config file path or --config=inline-config");
|
||||
};
|
||||
|
||||
info!("Reading pemfile from {}", config.pemfile.clone());
|
||||
let pemfile = std::fs::read(config.pemfile.clone())?;
|
||||
info!("Loading public key from {}", config.pemfile.clone());
|
||||
@@ -48,7 +72,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let storage = remote_storage::GenericRemoteStorage::from_config(&config.storage_config).await?;
|
||||
let cancel = tokio_util::sync::CancellationToken::new();
|
||||
app::check_storage_permissions(&storage, cancel.clone()).await?;
|
||||
if !args.no_s3_check_on_startup {
|
||||
app::check_storage_permissions(&storage, cancel.clone()).await?;
|
||||
}
|
||||
|
||||
let proxy = std::sync::Arc::new(endpoint_storage::Storage {
|
||||
auth,
|
||||
|
||||
@@ -192,6 +192,9 @@ pub enum ComputeFeature {
|
||||
/// track short-lived connections as user activity.
|
||||
ActivityMonitorExperimental,
|
||||
|
||||
/// Enable TLS functionality.
|
||||
TlsExperimental,
|
||||
|
||||
/// This is a special feature flag that is used to represent unknown feature flags.
|
||||
/// Basically all unknown to enum flags are represented as this one. See unit test
|
||||
/// `parse_unknown_features()` for more details.
|
||||
@@ -250,34 +253,44 @@ impl RemoteExtSpec {
|
||||
}
|
||||
|
||||
match self.extension_data.get(real_ext_name) {
|
||||
Some(_ext_data) => {
|
||||
// We have decided to use the Go naming convention due to Kubernetes.
|
||||
|
||||
let arch = match std::env::consts::ARCH {
|
||||
"x86_64" => "amd64",
|
||||
"aarch64" => "arm64",
|
||||
arch => arch,
|
||||
};
|
||||
|
||||
// Construct the path to the extension archive
|
||||
// BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
|
||||
//
|
||||
// Keep it in sync with path generation in
|
||||
// https://github.com/neondatabase/build-custom-extensions/tree/main
|
||||
let archive_path_str = format!(
|
||||
"{build_tag}/{arch}/{pg_major_version}/extensions/{real_ext_name}.tar.zst"
|
||||
);
|
||||
Ok((
|
||||
real_ext_name.to_string(),
|
||||
RemotePath::from_string(&archive_path_str)?,
|
||||
))
|
||||
}
|
||||
Some(_ext_data) => Ok((
|
||||
real_ext_name.to_string(),
|
||||
Self::build_remote_path(build_tag, pg_major_version, real_ext_name)?,
|
||||
)),
|
||||
None => Err(anyhow::anyhow!(
|
||||
"real_ext_name {} is not found",
|
||||
real_ext_name
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the architecture-specific portion of the remote extension path. We
|
||||
/// use the Go naming convention due to Kubernetes.
|
||||
fn get_arch() -> &'static str {
|
||||
match std::env::consts::ARCH {
|
||||
"x86_64" => "amd64",
|
||||
"aarch64" => "arm64",
|
||||
arch => arch,
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a [`RemotePath`] for an extension.
|
||||
fn build_remote_path(
|
||||
build_tag: &str,
|
||||
pg_major_version: &str,
|
||||
ext_name: &str,
|
||||
) -> anyhow::Result<RemotePath> {
|
||||
let arch = Self::get_arch();
|
||||
|
||||
// Construct the path to the extension archive
|
||||
// BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
|
||||
//
|
||||
// Keep it in sync with path generation in
|
||||
// https://github.com/neondatabase/build-custom-extensions/tree/main
|
||||
RemotePath::from_string(&format!(
|
||||
"{build_tag}/{arch}/{pg_major_version}/extensions/{ext_name}.tar.zst"
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
|
||||
@@ -518,6 +531,37 @@ mod tests {
|
||||
.expect("Library should be found");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn remote_extension_path() {
|
||||
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
|
||||
"public_extensions": ["ext"],
|
||||
"custom_extensions": [],
|
||||
"library_index": {
|
||||
"extlib": "ext",
|
||||
},
|
||||
"extension_data": {
|
||||
"ext": {
|
||||
"control_data": {
|
||||
"ext.control": ""
|
||||
},
|
||||
"archive_path": ""
|
||||
}
|
||||
},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let (_ext_name, ext_path) = rspec
|
||||
.get_ext("ext", false, "latest", "v17")
|
||||
.expect("Extension should be found");
|
||||
// Starting with a forward slash would have consequences for the
|
||||
// Url::join() that occurs when downloading a remote extension.
|
||||
assert!(!ext_path.to_string().starts_with("/"));
|
||||
assert_eq!(
|
||||
ext_path,
|
||||
RemoteExtSpec::build_remote_path("latest", "v17", "ext").unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_spec_file() {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
|
||||
@@ -12,6 +12,7 @@ pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LI
|
||||
pub const DEFAULT_GRPC_LISTEN_PORT: u16 = 51051; // storage-broker already uses 50051
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
use std::num::{NonZeroU64, NonZeroUsize};
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
@@ -20,21 +21,21 @@ use postgres_backend::AuthType;
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use serde_with::serde_as;
|
||||
use utils::logging::LogFormat;
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
|
||||
use crate::models::{ImageCompressionAlgorithm, LsnLease};
|
||||
|
||||
// Certain metadata (e.g. externally-addressable name, AZ) is delivered
|
||||
// as a separate structure. This information is not neeed by the pageserver
|
||||
// as a separate structure. This information is not needed by the pageserver
|
||||
// itself, it is only used for registering the pageserver with the control
|
||||
// plane and/or storage controller.
|
||||
//
|
||||
#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
|
||||
pub struct NodeMetadata {
|
||||
#[serde(rename = "host")]
|
||||
pub postgres_host: String,
|
||||
#[serde(rename = "port")]
|
||||
pub postgres_port: u16,
|
||||
pub grpc_host: Option<String>,
|
||||
pub grpc_port: Option<u16>,
|
||||
pub http_host: String,
|
||||
pub http_port: u16,
|
||||
pub https_port: Option<u16>,
|
||||
@@ -45,6 +46,23 @@ pub struct NodeMetadata {
|
||||
pub other: HashMap<String, serde_json::Value>,
|
||||
}
|
||||
|
||||
impl Display for NodeMetadata {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"postgresql://{}:{} ",
|
||||
self.postgres_host, self.postgres_port
|
||||
)?;
|
||||
if let Some(grpc_host) = &self.grpc_host {
|
||||
let grpc_port = self.grpc_port.unwrap_or_default();
|
||||
write!(f, "grpc://{grpc_host}:{grpc_port} ")?;
|
||||
}
|
||||
write!(f, "http://{}:{} ", self.http_host, self.http_port)?;
|
||||
write!(f, "other:{:?}", self.other)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// PostHog integration config.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct PostHogConfig {
|
||||
@@ -189,7 +207,6 @@ pub struct ConfigToml {
|
||||
pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub no_sync: Option<bool>,
|
||||
pub wal_receiver_protocol: PostgresClientProtocol,
|
||||
pub page_service_pipelining: PageServicePipeliningConfig,
|
||||
pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
pub enable_read_path_debugging: Option<bool>,
|
||||
@@ -339,16 +356,21 @@ pub struct TimelineImportConfig {
|
||||
pub struct BasebackupCacheConfig {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub cleanup_period: Duration,
|
||||
// FIXME: Support max_size_bytes.
|
||||
// pub max_size_bytes: usize,
|
||||
pub max_size_entries: i64,
|
||||
/// Maximum total size of basebackup cache entries on disk in bytes.
|
||||
/// The cache may slightly exceed this limit because we do not know
|
||||
/// the exact size of the cache entry untill it's written to disk.
|
||||
pub max_total_size_bytes: u64,
|
||||
// TODO(diko): support max_entry_size_bytes.
|
||||
// pub max_entry_size_bytes: u64,
|
||||
pub max_size_entries: usize,
|
||||
}
|
||||
|
||||
impl Default for BasebackupCacheConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
cleanup_period: Duration::from_secs(60),
|
||||
// max_size_bytes: 1024 * 1024 * 1024, // 1 GiB
|
||||
max_total_size_bytes: 1024 * 1024 * 1024, // 1 GiB
|
||||
// max_entry_size_bytes: 16 * 1024 * 1024, // 16 MiB
|
||||
max_size_entries: 1000,
|
||||
}
|
||||
}
|
||||
@@ -527,8 +549,6 @@ pub struct TenantConfigToml {
|
||||
/// (either this flag or the pageserver-global one need to be set)
|
||||
pub timeline_offloading: bool,
|
||||
|
||||
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
|
||||
|
||||
/// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
|
||||
/// `index_part.json`, and it cannot be reversed.
|
||||
pub rel_size_v2_enabled: bool,
|
||||
@@ -609,12 +629,6 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
|
||||
|
||||
pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
|
||||
utils::postgres_client::PostgresClientProtocol::Interpreted {
|
||||
format: utils::postgres_client::InterpretedFormat::Protobuf,
|
||||
compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }),
|
||||
};
|
||||
|
||||
pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
|
||||
pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt";
|
||||
}
|
||||
@@ -713,7 +727,6 @@ impl Default for ConfigToml {
|
||||
virtual_file_io_mode: None,
|
||||
tenant_config: TenantConfigToml::default(),
|
||||
no_sync: None,
|
||||
wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
|
||||
page_service_pipelining: PageServicePipeliningConfig::Pipelined(
|
||||
PageServicePipeliningConfigPipelined {
|
||||
max_batch_size: NonZeroUsize::new(32).unwrap(),
|
||||
@@ -858,7 +871,6 @@ impl Default for TenantConfigToml {
|
||||
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
|
||||
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
|
||||
timeline_offloading: true,
|
||||
wal_receiver_protocol_override: None,
|
||||
rel_size_v2_enabled: false,
|
||||
gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
|
||||
gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
|
||||
|
||||
@@ -14,6 +14,8 @@ fn test_node_metadata_v1_backward_compatibilty() {
|
||||
NodeMetadata {
|
||||
postgres_host: "localhost".to_string(),
|
||||
postgres_port: 23,
|
||||
grpc_host: None,
|
||||
grpc_port: None,
|
||||
http_host: "localhost".to_string(),
|
||||
http_port: 42,
|
||||
https_port: None,
|
||||
@@ -37,6 +39,35 @@ fn test_node_metadata_v2_backward_compatibilty() {
|
||||
NodeMetadata {
|
||||
postgres_host: "localhost".to_string(),
|
||||
postgres_port: 23,
|
||||
grpc_host: None,
|
||||
grpc_port: None,
|
||||
http_host: "localhost".to_string(),
|
||||
http_port: 42,
|
||||
https_port: Some(123),
|
||||
other: HashMap::new(),
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_node_metadata_v3_backward_compatibilty() {
|
||||
let v3 = serde_json::to_vec(&serde_json::json!({
|
||||
"host": "localhost",
|
||||
"port": 23,
|
||||
"grpc_host": "localhost",
|
||||
"grpc_port": 51,
|
||||
"http_host": "localhost",
|
||||
"http_port": 42,
|
||||
"https_port": 123,
|
||||
}));
|
||||
|
||||
assert_eq!(
|
||||
serde_json::from_slice::<NodeMetadata>(&v3.unwrap()).unwrap(),
|
||||
NodeMetadata {
|
||||
postgres_host: "localhost".to_string(),
|
||||
postgres_port: 23,
|
||||
grpc_host: Some("localhost".to_string()),
|
||||
grpc_port: Some(51),
|
||||
http_host: "localhost".to_string(),
|
||||
http_port: 42,
|
||||
https_port: Some(123),
|
||||
|
||||
@@ -52,6 +52,8 @@ pub struct NodeRegisterRequest {
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
pub listen_grpc_addr: Option<String>,
|
||||
pub listen_grpc_port: Option<u16>,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
@@ -101,6 +103,8 @@ pub struct TenantLocateResponseShard {
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
pub listen_grpc_addr: Option<String>,
|
||||
pub listen_grpc_port: Option<u16>,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
@@ -152,6 +156,8 @@ pub struct NodeDescribeResponse {
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
pub listen_grpc_addr: Option<String>,
|
||||
pub listen_grpc_port: Option<u16>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -344,6 +350,35 @@ impl Default for ShardSchedulingPolicy {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||
pub enum NodeLifecycle {
|
||||
Active,
|
||||
Deleted,
|
||||
}
|
||||
|
||||
impl FromStr for NodeLifecycle {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self::Active),
|
||||
"deleted" => Ok(Self::Deleted),
|
||||
_ => Err(anyhow::anyhow!("Unknown node lifecycle '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<NodeLifecycle> for String {
|
||||
fn from(value: NodeLifecycle) -> String {
|
||||
use NodeLifecycle::*;
|
||||
match value {
|
||||
Active => "active",
|
||||
Deleted => "deleted",
|
||||
}
|
||||
.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||
pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
|
||||
@@ -20,7 +20,6 @@ use serde_with::serde_as;
|
||||
pub use utilization::PageserverUtilization;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
use utils::{completion, serde_system_time};
|
||||
|
||||
use crate::config::Ratio;
|
||||
@@ -622,8 +621,6 @@ pub struct TenantConfigPatch {
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub timeline_offloading: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub rel_size_v2_enabled: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_enabled: FieldPatch<bool>,
|
||||
@@ -748,9 +745,6 @@ pub struct TenantConfig {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub timeline_offloading: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub rel_size_v2_enabled: Option<bool>,
|
||||
|
||||
@@ -812,7 +806,6 @@ impl TenantConfig {
|
||||
mut lsn_lease_length,
|
||||
mut lsn_lease_length_for_ts,
|
||||
mut timeline_offloading,
|
||||
mut wal_receiver_protocol_override,
|
||||
mut rel_size_v2_enabled,
|
||||
mut gc_compaction_enabled,
|
||||
mut gc_compaction_verification,
|
||||
@@ -905,9 +898,6 @@ impl TenantConfig {
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut lsn_lease_length_for_ts);
|
||||
patch.timeline_offloading.apply(&mut timeline_offloading);
|
||||
patch
|
||||
.wal_receiver_protocol_override
|
||||
.apply(&mut wal_receiver_protocol_override);
|
||||
patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled);
|
||||
patch
|
||||
.gc_compaction_enabled
|
||||
@@ -960,7 +950,6 @@ impl TenantConfig {
|
||||
lsn_lease_length,
|
||||
lsn_lease_length_for_ts,
|
||||
timeline_offloading,
|
||||
wal_receiver_protocol_override,
|
||||
rel_size_v2_enabled,
|
||||
gc_compaction_enabled,
|
||||
gc_compaction_verification,
|
||||
@@ -1058,9 +1047,6 @@ impl TenantConfig {
|
||||
timeline_offloading: self
|
||||
.timeline_offloading
|
||||
.unwrap_or(global_conf.timeline_offloading),
|
||||
wal_receiver_protocol_override: self
|
||||
.wal_receiver_protocol_override
|
||||
.or(global_conf.wal_receiver_protocol_override),
|
||||
rel_size_v2_enabled: self
|
||||
.rel_size_v2_enabled
|
||||
.unwrap_or(global_conf.rel_size_v2_enabled),
|
||||
|
||||
@@ -9,7 +9,7 @@ use utils::id::{NodeId, TimelineId};
|
||||
|
||||
use crate::controller_api::NodeRegisterRequest;
|
||||
use crate::models::{LocationConfigMode, ShardImportStatus};
|
||||
use crate::shard::TenantShardId;
|
||||
use crate::shard::{ShardStripeSize, TenantShardId};
|
||||
|
||||
/// Upcall message sent by the pageserver to the configured `control_plane_api` on
|
||||
/// startup.
|
||||
@@ -36,6 +36,10 @@ pub struct ReAttachResponseTenant {
|
||||
/// Default value only for backward compat: this field should be set
|
||||
#[serde(default = "default_mode")]
|
||||
pub mode: LocationConfigMode,
|
||||
|
||||
// Default value only for backward compat: this field should be set
|
||||
#[serde(default = "ShardStripeSize::default")]
|
||||
pub stripe_size: ShardStripeSize,
|
||||
}
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponse {
|
||||
|
||||
@@ -6,7 +6,7 @@ use arc_swap::ArcSwap;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{Instrument, info_span};
|
||||
|
||||
use crate::{FeatureStore, PostHogClient, PostHogClientConfig};
|
||||
use crate::{CaptureEvent, FeatureStore, PostHogClient, PostHogClientConfig};
|
||||
|
||||
/// A background loop that fetches feature flags from PostHog and updates the feature store.
|
||||
pub struct FeatureResolverBackgroundLoop {
|
||||
@@ -24,9 +24,16 @@ impl FeatureResolverBackgroundLoop {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn spawn(self: Arc<Self>, handle: &tokio::runtime::Handle, refresh_period: Duration) {
|
||||
pub fn spawn(
|
||||
self: Arc<Self>,
|
||||
handle: &tokio::runtime::Handle,
|
||||
refresh_period: Duration,
|
||||
fake_tenants: Vec<CaptureEvent>,
|
||||
) {
|
||||
let this = self.clone();
|
||||
let cancel = self.cancel.clone();
|
||||
|
||||
// Main loop of updating the feature flags.
|
||||
handle.spawn(
|
||||
async move {
|
||||
tracing::info!("Starting PostHog feature resolver");
|
||||
@@ -48,14 +55,37 @@ impl FeatureResolverBackgroundLoop {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let feature_store = FeatureStore::new_with_flags(resp.flags);
|
||||
this.feature_store.store(Arc::new(feature_store));
|
||||
tracing::info!("Feature flag updated");
|
||||
let project_id = this.posthog_client.config.project_id.parse::<u64>().ok();
|
||||
match FeatureStore::new_with_flags(resp.flags, project_id) {
|
||||
Ok(feature_store) => {
|
||||
this.feature_store.store(Arc::new(feature_store));
|
||||
tracing::info!("Feature flag updated");
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Cannot process feature flag spec: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
tracing::info!("PostHog feature resolver stopped");
|
||||
}
|
||||
.instrument(info_span!("posthog_feature_resolver")),
|
||||
);
|
||||
|
||||
// Report fake tenants to PostHog so that we have the combination of all the properties in the UI.
|
||||
// Do one report per pageserver restart.
|
||||
let this = self.clone();
|
||||
handle.spawn(
|
||||
async move {
|
||||
tracing::info!("Starting PostHog feature reporter");
|
||||
for tenant in &fake_tenants {
|
||||
tracing::info!("Reporting fake tenant: {:?}", tenant);
|
||||
}
|
||||
if let Err(e) = this.posthog_client.capture_event_batch(&fake_tenants).await {
|
||||
tracing::warn!("Cannot report fake tenants: {}", e);
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("posthog_feature_reporter")),
|
||||
);
|
||||
}
|
||||
|
||||
pub fn feature_store(&self) -> Arc<FeatureStore> {
|
||||
|
||||
@@ -39,6 +39,9 @@ pub struct LocalEvaluationResponse {
|
||||
|
||||
#[derive(Deserialize)]
|
||||
pub struct LocalEvaluationFlag {
|
||||
#[allow(dead_code)]
|
||||
id: u64,
|
||||
team_id: u64,
|
||||
key: String,
|
||||
filters: LocalEvaluationFlagFilters,
|
||||
active: bool,
|
||||
@@ -64,7 +67,7 @@ pub struct LocalEvaluationFlagFilterProperty {
|
||||
operator: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
#[serde(untagged)]
|
||||
pub enum PostHogFlagFilterPropertyValue {
|
||||
String(String),
|
||||
@@ -107,17 +110,32 @@ impl FeatureStore {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_with_flags(flags: Vec<LocalEvaluationFlag>) -> Self {
|
||||
pub fn new_with_flags(
|
||||
flags: Vec<LocalEvaluationFlag>,
|
||||
project_id: Option<u64>,
|
||||
) -> Result<Self, &'static str> {
|
||||
let mut store = Self::new();
|
||||
store.set_flags(flags);
|
||||
store
|
||||
store.set_flags(flags, project_id)?;
|
||||
Ok(store)
|
||||
}
|
||||
|
||||
pub fn set_flags(&mut self, flags: Vec<LocalEvaluationFlag>) {
|
||||
pub fn set_flags(
|
||||
&mut self,
|
||||
flags: Vec<LocalEvaluationFlag>,
|
||||
project_id: Option<u64>,
|
||||
) -> Result<(), &'static str> {
|
||||
self.flags.clear();
|
||||
for flag in flags {
|
||||
if let Some(project_id) = project_id {
|
||||
if flag.team_id != project_id {
|
||||
return Err(
|
||||
"Retrieved a spec with different project id, wrong config? Discarding the feature flags.",
|
||||
);
|
||||
}
|
||||
}
|
||||
self.flags.insert(flag.key.clone(), flag);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Generate a consistent hash for a user ID (e.g., tenant ID).
|
||||
@@ -507,6 +525,13 @@ pub struct PostHogClient {
|
||||
client: reqwest::Client,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug)]
|
||||
pub struct CaptureEvent {
|
||||
pub event: String,
|
||||
pub distinct_id: String,
|
||||
pub properties: serde_json::Value,
|
||||
}
|
||||
|
||||
impl PostHogClient {
|
||||
pub fn new(config: PostHogClientConfig) -> Self {
|
||||
let client = reqwest::Client::new();
|
||||
@@ -527,6 +552,13 @@ impl PostHogClient {
|
||||
})
|
||||
}
|
||||
|
||||
/// Check if the server API key is a feature flag secure API key. This key can only be
|
||||
/// used to fetch the feature flag specs and can only be used on a undocumented API
|
||||
/// endpoint.
|
||||
fn is_feature_flag_secure_api_key(&self) -> bool {
|
||||
self.config.server_api_key.starts_with("phs_")
|
||||
}
|
||||
|
||||
/// Fetch the feature flag specs from the server.
|
||||
///
|
||||
/// This is unfortunately an undocumented API at:
|
||||
@@ -540,10 +572,22 @@ impl PostHogClient {
|
||||
) -> anyhow::Result<LocalEvaluationResponse> {
|
||||
// BASE_URL/api/projects/:project_id/feature_flags/local_evaluation
|
||||
// with bearer token of self.server_api_key
|
||||
let url = format!(
|
||||
"{}/api/projects/{}/feature_flags/local_evaluation",
|
||||
self.config.private_api_url, self.config.project_id
|
||||
);
|
||||
// OR
|
||||
// BASE_URL/api/feature_flag/local_evaluation/
|
||||
// with bearer token of feature flag specific self.server_api_key
|
||||
let url = if self.is_feature_flag_secure_api_key() {
|
||||
// The new feature local evaluation secure API token
|
||||
format!(
|
||||
"{}/api/feature_flag/local_evaluation",
|
||||
self.config.private_api_url
|
||||
)
|
||||
} else {
|
||||
// The old personal API token
|
||||
format!(
|
||||
"{}/api/projects/{}/feature_flags/local_evaluation",
|
||||
self.config.private_api_url, self.config.project_id
|
||||
)
|
||||
};
|
||||
let response = self
|
||||
.client
|
||||
.get(url)
|
||||
@@ -570,12 +614,12 @@ impl PostHogClient {
|
||||
&self,
|
||||
event: &str,
|
||||
distinct_id: &str,
|
||||
properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
|
||||
properties: &serde_json::Value,
|
||||
) -> anyhow::Result<()> {
|
||||
// PUBLIC_URL/capture/
|
||||
// with bearer token of self.client_api_key
|
||||
let url = format!("{}/capture/", self.config.public_api_url);
|
||||
self.client
|
||||
let response = self
|
||||
.client
|
||||
.post(url)
|
||||
.body(serde_json::to_string(&json!({
|
||||
"api_key": self.config.client_api_key,
|
||||
@@ -585,6 +629,39 @@ impl PostHogClient {
|
||||
}))?)
|
||||
.send()
|
||||
.await?;
|
||||
let status = response.status();
|
||||
let body = response.text().await?;
|
||||
if !status.is_success() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Failed to capture events: {}, {}",
|
||||
status,
|
||||
body
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn capture_event_batch(&self, events: &[CaptureEvent]) -> anyhow::Result<()> {
|
||||
// PUBLIC_URL/batch/
|
||||
let url = format!("{}/batch/", self.config.public_api_url);
|
||||
let response = self
|
||||
.client
|
||||
.post(url)
|
||||
.body(serde_json::to_string(&json!({
|
||||
"api_key": self.config.client_api_key,
|
||||
"batch": events,
|
||||
}))?)
|
||||
.send()
|
||||
.await?;
|
||||
let status = response.status();
|
||||
let body = response.text().await?;
|
||||
if !status.is_success() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Failed to capture events: {}, {}",
|
||||
status,
|
||||
body
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -763,7 +840,7 @@ mod tests {
|
||||
fn evaluate_multivariate() {
|
||||
let mut store = FeatureStore::new();
|
||||
let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
|
||||
store.set_flags(response.flags);
|
||||
store.set_flags(response.flags, None).unwrap();
|
||||
|
||||
// This lacks the required properties and cannot be evaluated.
|
||||
let variant =
|
||||
@@ -833,7 +910,7 @@ mod tests {
|
||||
|
||||
let mut store = FeatureStore::new();
|
||||
let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
|
||||
store.set_flags(response.flags);
|
||||
store.set_flags(response.flags, None).unwrap();
|
||||
|
||||
// This lacks the required properties and cannot be evaluated.
|
||||
let variant = store.evaluate_boolean_inner("boolean-flag", 1.00, &HashMap::new());
|
||||
@@ -889,7 +966,7 @@ mod tests {
|
||||
|
||||
let mut store = FeatureStore::new();
|
||||
let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
|
||||
store.set_flags(response.flags);
|
||||
store.set_flags(response.flags, None).unwrap();
|
||||
|
||||
// This lacks the required properties and cannot be evaluated.
|
||||
let variant =
|
||||
|
||||
@@ -5,7 +5,7 @@ edition = "2024"
|
||||
license = "MIT/Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.20"
|
||||
base64.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
fallible-iterator.workspace = true
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
use std::fmt::Write;
|
||||
use std::{io, iter, mem, str};
|
||||
|
||||
use base64::Engine as _;
|
||||
use base64::prelude::BASE64_STANDARD;
|
||||
use hmac::{Hmac, Mac};
|
||||
use rand::{self, Rng};
|
||||
use sha2::digest::FixedOutput;
|
||||
@@ -226,7 +228,7 @@ impl ScramSha256 {
|
||||
|
||||
let (client_key, server_key) = match password {
|
||||
Credentials::Password(password) => {
|
||||
let salt = match base64::decode(parsed.salt) {
|
||||
let salt = match BASE64_STANDARD.decode(parsed.salt) {
|
||||
Ok(salt) => salt,
|
||||
Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)),
|
||||
};
|
||||
@@ -255,7 +257,7 @@ impl ScramSha256 {
|
||||
let mut cbind_input = vec![];
|
||||
cbind_input.extend(channel_binding.gs2_header().as_bytes());
|
||||
cbind_input.extend(channel_binding.cbind_data());
|
||||
let cbind_input = base64::encode(&cbind_input);
|
||||
let cbind_input = BASE64_STANDARD.encode(&cbind_input);
|
||||
|
||||
self.message.clear();
|
||||
write!(&mut self.message, "c={},r={}", cbind_input, parsed.nonce).unwrap();
|
||||
@@ -272,7 +274,12 @@ impl ScramSha256 {
|
||||
*proof ^= signature;
|
||||
}
|
||||
|
||||
write!(&mut self.message, ",p={}", base64::encode(client_proof)).unwrap();
|
||||
write!(
|
||||
&mut self.message,
|
||||
",p={}",
|
||||
BASE64_STANDARD.encode(client_proof)
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
self.state = State::Finish {
|
||||
server_key,
|
||||
@@ -306,7 +313,7 @@ impl ScramSha256 {
|
||||
ServerFinalMessage::Verifier(verifier) => verifier,
|
||||
};
|
||||
|
||||
let verifier = match base64::decode(verifier) {
|
||||
let verifier = match BASE64_STANDARD.decode(verifier) {
|
||||
Ok(verifier) => verifier,
|
||||
Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)),
|
||||
};
|
||||
|
||||
@@ -6,6 +6,8 @@
|
||||
//! side. This is good because it ensures the cleartext password won't
|
||||
//! end up in logs pg_stat displays, etc.
|
||||
|
||||
use base64::Engine as _;
|
||||
use base64::prelude::BASE64_STANDARD;
|
||||
use hmac::{Hmac, Mac};
|
||||
use rand::RngCore;
|
||||
use sha2::digest::FixedOutput;
|
||||
@@ -83,8 +85,8 @@ pub(crate) async fn scram_sha_256_salt(
|
||||
format!(
|
||||
"SCRAM-SHA-256${}:{}${}:{}",
|
||||
SCRAM_DEFAULT_ITERATIONS,
|
||||
base64::encode(salt),
|
||||
base64::encode(stored_key),
|
||||
base64::encode(server_key)
|
||||
BASE64_STANDARD.encode(salt),
|
||||
BASE64_STANDARD.encode(stored_key),
|
||||
BASE64_STANDARD.encode(server_key)
|
||||
)
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ use crate::{Error, cancel_query_raw, connect_socket};
|
||||
pub(crate) async fn cancel_query<T>(
|
||||
config: Option<SocketConfig>,
|
||||
ssl_mode: SslMode,
|
||||
mut tls: T,
|
||||
tls: T,
|
||||
process_id: i32,
|
||||
secret_key: i32,
|
||||
) -> Result<(), Error>
|
||||
|
||||
@@ -17,7 +17,6 @@ use crate::{Client, Connection, Error};
|
||||
|
||||
/// TLS configuration.
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[non_exhaustive]
|
||||
pub enum SslMode {
|
||||
/// Do not use TLS.
|
||||
Disable,
|
||||
@@ -231,7 +230,7 @@ impl Config {
|
||||
/// Requires the `runtime` Cargo feature (enabled by default).
|
||||
pub async fn connect<T>(
|
||||
&self,
|
||||
tls: T,
|
||||
tls: &T,
|
||||
) -> Result<(Client, Connection<TcpStream, T::Stream>), Error>
|
||||
where
|
||||
T: MakeTlsConnect<TcpStream>,
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::tls::{MakeTlsConnect, TlsConnect};
|
||||
use crate::{Client, Config, Connection, Error, RawConnection};
|
||||
|
||||
pub async fn connect<T>(
|
||||
mut tls: T,
|
||||
tls: &T,
|
||||
config: &Config,
|
||||
) -> Result<(Client, Connection<TcpStream, T::Stream>), Error>
|
||||
where
|
||||
|
||||
@@ -47,7 +47,7 @@ pub trait MakeTlsConnect<S> {
|
||||
/// Creates a new `TlsConnect`or.
|
||||
///
|
||||
/// The domain name is provided for certificate verification and SNI.
|
||||
fn make_tls_connect(&mut self, domain: &str) -> Result<Self::TlsConnect, Self::Error>;
|
||||
fn make_tls_connect(&self, domain: &str) -> Result<Self::TlsConnect, Self::Error>;
|
||||
}
|
||||
|
||||
/// An asynchronous function wrapping a stream in a TLS session.
|
||||
@@ -85,7 +85,7 @@ impl<S> MakeTlsConnect<S> for NoTls {
|
||||
type TlsConnect = NoTls;
|
||||
type Error = NoTlsError;
|
||||
|
||||
fn make_tls_connect(&mut self, _: &str) -> Result<NoTls, NoTlsError> {
|
||||
fn make_tls_connect(&self, _: &str) -> Result<NoTls, NoTlsError> {
|
||||
Ok(NoTls)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use std::{env, io};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::{Context, Result, anyhow};
|
||||
use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
|
||||
use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions};
|
||||
use azure_storage::StorageCredentials;
|
||||
@@ -37,6 +37,7 @@ use crate::metrics::{AttemptOutcome, RequestKind, start_measuring_requests};
|
||||
use crate::{
|
||||
ConcurrencyLimiter, Download, DownloadError, DownloadKind, DownloadOpts, Listing, ListingMode,
|
||||
ListingObject, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
|
||||
Version, VersionKind,
|
||||
};
|
||||
|
||||
pub struct AzureBlobStorage {
|
||||
@@ -405,6 +406,39 @@ impl AzureBlobStorage {
|
||||
pub fn container_name(&self) -> &str {
|
||||
&self.container_name
|
||||
}
|
||||
|
||||
async fn list_versions_with_permit(
|
||||
&self,
|
||||
_permit: &tokio::sync::SemaphorePermit<'_>,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<crate::VersionListing, DownloadError> {
|
||||
let customize_builder = |mut builder: ListBlobsBuilder| {
|
||||
builder = builder.include_versions(true);
|
||||
// We do not return this info back to `VersionListing` yet.
|
||||
builder = builder.include_deleted(true);
|
||||
builder
|
||||
};
|
||||
let kind = RequestKind::ListVersions;
|
||||
|
||||
let mut stream = std::pin::pin!(self.list_streaming_for_fn(
|
||||
prefix,
|
||||
mode,
|
||||
max_keys,
|
||||
cancel,
|
||||
kind,
|
||||
customize_builder
|
||||
));
|
||||
let mut combined: crate::VersionListing =
|
||||
stream.next().await.expect("At least one item required")?;
|
||||
while let Some(list) = stream.next().await {
|
||||
let list = list?;
|
||||
combined.versions.extend(list.versions.into_iter());
|
||||
}
|
||||
Ok(combined)
|
||||
}
|
||||
}
|
||||
|
||||
trait ListingCollector {
|
||||
@@ -488,27 +522,10 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> std::result::Result<crate::VersionListing, DownloadError> {
|
||||
let customize_builder = |mut builder: ListBlobsBuilder| {
|
||||
builder = builder.include_versions(true);
|
||||
builder
|
||||
};
|
||||
let kind = RequestKind::ListVersions;
|
||||
|
||||
let mut stream = std::pin::pin!(self.list_streaming_for_fn(
|
||||
prefix,
|
||||
mode,
|
||||
max_keys,
|
||||
cancel,
|
||||
kind,
|
||||
customize_builder
|
||||
));
|
||||
let mut combined: crate::VersionListing =
|
||||
stream.next().await.expect("At least one item required")?;
|
||||
while let Some(list) = stream.next().await {
|
||||
let list = list?;
|
||||
combined.versions.extend(list.versions.into_iter());
|
||||
}
|
||||
Ok(combined)
|
||||
let permit = self.permit(kind, cancel).await?;
|
||||
self.list_versions_with_permit(&permit, prefix, mode, max_keys, cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
@@ -803,14 +820,159 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
_prefix: Option<&RemotePath>,
|
||||
_timestamp: SystemTime,
|
||||
_done_if_after: SystemTime,
|
||||
_cancel: &CancellationToken,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: &CancellationToken,
|
||||
_complexity_limit: Option<NonZeroU32>,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
// TODO use Azure point in time recovery feature for this
|
||||
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
|
||||
Err(TimeTravelError::Unimplemented)
|
||||
let msg = "PLEASE NOTE: Azure Blob storage time-travel recovery may not work as expected "
|
||||
.to_string()
|
||||
+ "for some specific files. If a file gets deleted but then overwritten and we want to recover "
|
||||
+ "to the time during the file was not present, this functionality will recover the file. Only "
|
||||
+ "use the functionality for services that can tolerate this. For example, recovering a state of the "
|
||||
+ "pageserver tenants.";
|
||||
tracing::error!("{}", msg);
|
||||
|
||||
let kind = RequestKind::TimeTravel;
|
||||
let permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let mode = ListingMode::NoDelimiter;
|
||||
let version_listing = self
|
||||
.list_versions_with_permit(&permit, prefix, mode, None, cancel)
|
||||
.await
|
||||
.map_err(|err| match err {
|
||||
DownloadError::Other(e) => TimeTravelError::Other(e),
|
||||
DownloadError::Cancelled => TimeTravelError::Cancelled,
|
||||
other => TimeTravelError::Other(other.into()),
|
||||
})?;
|
||||
let versions_and_deletes = version_listing.versions;
|
||||
|
||||
tracing::info!(
|
||||
"Built list for time travel with {} versions and deletions",
|
||||
versions_and_deletes.len()
|
||||
);
|
||||
|
||||
// Work on the list of references instead of the objects directly,
|
||||
// otherwise we get lifetime errors in the sort_by_key call below.
|
||||
let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();
|
||||
|
||||
versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified));
|
||||
|
||||
let mut vds_for_key = HashMap::<_, Vec<_>>::new();
|
||||
|
||||
for vd in &versions_and_deletes {
|
||||
let Version { key, .. } = &vd;
|
||||
let version_id = vd.version_id().map(|v| v.0.as_str());
|
||||
if version_id == Some("null") {
|
||||
return Err(TimeTravelError::Other(anyhow!(
|
||||
"Received ListVersions response for key={key} with version_id='null', \
|
||||
indicating either disabled versioning, or legacy objects with null version id values"
|
||||
)));
|
||||
}
|
||||
tracing::trace!("Parsing version key={key} kind={:?}", vd.kind);
|
||||
|
||||
vds_for_key.entry(key).or_default().push(vd);
|
||||
}
|
||||
|
||||
let warn_threshold = 3;
|
||||
let max_retries = 10;
|
||||
let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);
|
||||
|
||||
for (key, versions) in vds_for_key {
|
||||
let last_vd = versions.last().unwrap();
|
||||
let key = self.relative_path_to_name(key);
|
||||
if last_vd.last_modified > done_if_after {
|
||||
tracing::debug!("Key {key} has version later than done_if_after, skipping");
|
||||
continue;
|
||||
}
|
||||
// the version we want to restore to.
|
||||
let version_to_restore_to =
|
||||
match versions.binary_search_by_key(×tamp, |tpl| tpl.last_modified) {
|
||||
Ok(v) => v,
|
||||
Err(e) => e,
|
||||
};
|
||||
if version_to_restore_to == versions.len() {
|
||||
tracing::debug!("Key {key} has no changes since timestamp, skipping");
|
||||
continue;
|
||||
}
|
||||
let mut do_delete = false;
|
||||
if version_to_restore_to == 0 {
|
||||
// All versions more recent, so the key didn't exist at the specified time point.
|
||||
tracing::debug!(
|
||||
"All {} versions more recent for {key}, deleting",
|
||||
versions.len()
|
||||
);
|
||||
do_delete = true;
|
||||
} else {
|
||||
match &versions[version_to_restore_to - 1] {
|
||||
Version {
|
||||
kind: VersionKind::Version(version_id),
|
||||
..
|
||||
} => {
|
||||
let source_url = format!(
|
||||
"{}/{}?versionid={}",
|
||||
self.client
|
||||
.url()
|
||||
.map_err(|e| TimeTravelError::Other(anyhow!("{e}")))?,
|
||||
key,
|
||||
version_id.0
|
||||
);
|
||||
tracing::debug!(
|
||||
"Promoting old version {} for {key} at {}...",
|
||||
version_id.0,
|
||||
source_url
|
||||
);
|
||||
backoff::retry(
|
||||
|| async {
|
||||
let blob_client = self.client.blob_client(key.clone());
|
||||
let op = blob_client.copy(Url::from_str(&source_url).unwrap());
|
||||
tokio::select! {
|
||||
res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
|
||||
_ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
|
||||
}
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"copying object version for time_travel_recover",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| TimeTravelError::Cancelled)
|
||||
.and_then(|x| x)?;
|
||||
tracing::info!(?version_id, %key, "Copied old version in Azure blob storage");
|
||||
}
|
||||
Version {
|
||||
kind: VersionKind::DeletionMarker,
|
||||
..
|
||||
} => {
|
||||
do_delete = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
if do_delete {
|
||||
if matches!(last_vd.kind, VersionKind::DeletionMarker) {
|
||||
// Key has since been deleted (but there was some history), no need to do anything
|
||||
tracing::debug!("Key {key} already deleted, skipping.");
|
||||
} else {
|
||||
tracing::debug!("Deleting {key}...");
|
||||
|
||||
self.delete(&RemotePath::from_string(&key).unwrap(), cancel)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
// delete_oid0 will use TimeoutOrCancel
|
||||
if TimeoutOrCancel::caused_by_cancel(&e) {
|
||||
TimeTravelError::Cancelled
|
||||
} else {
|
||||
TimeTravelError::Other(e)
|
||||
}
|
||||
})?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -440,6 +440,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: &CancellationToken,
|
||||
complexity_limit: Option<NonZeroU32>,
|
||||
) -> Result<(), TimeTravelError>;
|
||||
}
|
||||
|
||||
@@ -651,22 +652,23 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: &CancellationToken,
|
||||
complexity_limit: Option<NonZeroU32>,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
|
||||
.await
|
||||
}
|
||||
Self::AwsS3(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
|
||||
.await
|
||||
}
|
||||
Self::AzureBlob(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
|
||||
.await
|
||||
}
|
||||
Self::Unreliable(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -610,6 +610,7 @@ impl RemoteStorage for LocalFs {
|
||||
_timestamp: SystemTime,
|
||||
_done_if_after: SystemTime,
|
||||
_cancel: &CancellationToken,
|
||||
_complexity_limit: Option<NonZeroU32>,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
Err(TimeTravelError::Unimplemented)
|
||||
}
|
||||
|
||||
@@ -981,22 +981,16 @@ impl RemoteStorage for S3Bucket {
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: &CancellationToken,
|
||||
complexity_limit: Option<NonZeroU32>,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
let kind = RequestKind::TimeTravel;
|
||||
let permit = self.permit(kind, cancel).await?;
|
||||
|
||||
tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
|
||||
|
||||
// Limit the number of versions deletions, mostly so that we don't
|
||||
// keep requesting forever if the list is too long, as we'd put the
|
||||
// list in RAM.
|
||||
// Building a list of 100k entries that reaches the limit roughly takes
|
||||
// 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
|
||||
const COMPLEXITY_LIMIT: Option<NonZeroU32> = NonZeroU32::new(100_000);
|
||||
|
||||
let mode = ListingMode::NoDelimiter;
|
||||
let version_listing = self
|
||||
.list_versions_with_permit(&permit, prefix, mode, COMPLEXITY_LIMIT, cancel)
|
||||
.list_versions_with_permit(&permit, prefix, mode, complexity_limit, cancel)
|
||||
.await
|
||||
.map_err(|err| match err {
|
||||
DownloadError::Other(e) => TimeTravelError::Other(e),
|
||||
@@ -1022,6 +1016,7 @@ impl RemoteStorage for S3Bucket {
|
||||
let Version { key, .. } = &vd;
|
||||
let version_id = vd.version_id().map(|v| v.0.as_str());
|
||||
if version_id == Some("null") {
|
||||
// TODO: check the behavior of using the SDK on a non-versioned container
|
||||
return Err(TimeTravelError::Other(anyhow!(
|
||||
"Received ListVersions response for key={key} with version_id='null', \
|
||||
indicating either disabled versioning, or legacy objects with null version id values"
|
||||
|
||||
@@ -240,11 +240,12 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: &CancellationToken,
|
||||
complexity_limit: Option<NonZeroU32>,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
|
||||
.map_err(TimeTravelError::Other)?;
|
||||
self.inner
|
||||
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -157,7 +157,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
// No changes after recovery to t2 (no-op)
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t2, t_final, &cancel)
|
||||
.time_travel_recover(None, t2, t_final, &cancel, None)
|
||||
.await?;
|
||||
let t2_files_recovered = list_files(&ctx.client, &cancel).await?;
|
||||
println!("after recovery to t2: {t2_files_recovered:?}");
|
||||
@@ -173,7 +173,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
// after recovery to t1: path1 is back, path2 has the old content
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t1, t_final, &cancel)
|
||||
.time_travel_recover(None, t1, t_final, &cancel, None)
|
||||
.await?;
|
||||
let t1_files_recovered = list_files(&ctx.client, &cancel).await?;
|
||||
println!("after recovery to t1: {t1_files_recovered:?}");
|
||||
@@ -189,7 +189,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
// after recovery to t0: everything is gone except for path1
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t0, t_final, &cancel)
|
||||
.time_travel_recover(None, t0, t_final, &cancel, None)
|
||||
.await?;
|
||||
let t0_files_recovered = list_files(&ctx.client, &cancel).await?;
|
||||
println!("after recovery to t0: {t0_files_recovered:?}");
|
||||
|
||||
@@ -13,7 +13,7 @@ use utils::pageserver_feedback::PageserverFeedback;
|
||||
use crate::membership::Configuration;
|
||||
use crate::{ServerInfo, Term};
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct SafekeeperStatus {
|
||||
pub id: NodeId,
|
||||
}
|
||||
|
||||
@@ -86,14 +86,6 @@ pub enum GateError {
|
||||
GateClosed,
|
||||
}
|
||||
|
||||
impl GateError {
|
||||
pub fn is_cancel(&self) -> bool {
|
||||
match self {
|
||||
GateError::GateClosed => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Gate {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
|
||||
@@ -439,6 +439,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
|
||||
currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 },
|
||||
shard_ps_feedback: [empty_feedback; 128],
|
||||
num_shards: 0,
|
||||
replica_promote: false,
|
||||
min_ps_feedback: empty_feedback,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
//!
|
||||
//! # local timeline dir
|
||||
//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
|
||||
//! grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
|
||||
//! grep "__" | cargo run --release --bin pagectl draw-timeline > out.svg
|
||||
//!
|
||||
//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer`
|
||||
//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg
|
||||
@@ -81,7 +81,11 @@ fn build_coordinate_compression_map<T: Ord + Copy>(coords: Vec<T>) -> BTreeMap<T
|
||||
fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
|
||||
let split: Vec<&str> = name.split("__").collect();
|
||||
let keys: Vec<&str> = split[0].split('-').collect();
|
||||
let mut lsns: Vec<&str> = split[1].split('-').collect();
|
||||
|
||||
// Remove the temporary file extension, e.g., remove the `.d20a.___temp` part from the following filename:
|
||||
// 000000067F000040490000404A00441B0000-000000067F000040490000404A00441B4000__000043483A34CE00.d20a.___temp
|
||||
let lsns = split[1].split('.').collect::<Vec<&str>>()[0];
|
||||
let mut lsns: Vec<&str> = lsns.split('-').collect();
|
||||
|
||||
// The current format of the layer file name: 000000067F0000000400000B150100000000-000000067F0000000400000D350100000000__00000000014B7AC8-v1-00000001
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ use pageserver::{page_cache, virtual_file};
|
||||
use pageserver_api::key::Key;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use crate::layer_map_analyzer::parse_filename;
|
||||
use crate::layer_map_analyzer::{LayerFile, parse_filename};
|
||||
|
||||
#[derive(Subcommand)]
|
||||
pub(crate) enum LayerCmd {
|
||||
@@ -38,6 +38,8 @@ pub(crate) enum LayerCmd {
|
||||
/// The id from list-layer command
|
||||
id: usize,
|
||||
},
|
||||
/// Dump all information of a layer file locally
|
||||
DumpLayerLocal { path: PathBuf },
|
||||
RewriteSummary {
|
||||
layer_file_path: Utf8PathBuf,
|
||||
#[clap(long)]
|
||||
@@ -131,15 +133,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
}
|
||||
|
||||
for (idx, layer_file) in to_print {
|
||||
println!(
|
||||
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
|
||||
idx,
|
||||
layer_file.key_range.start,
|
||||
layer_file.key_range.end,
|
||||
layer_file.lsn_range.start,
|
||||
layer_file.lsn_range.end,
|
||||
layer_file.is_delta,
|
||||
);
|
||||
print_layer_file(idx, &layer_file);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -159,16 +153,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
let layer = layer?;
|
||||
if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) {
|
||||
if *id == idx {
|
||||
// TODO(chi): dedup code
|
||||
println!(
|
||||
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
|
||||
idx,
|
||||
layer_file.key_range.start,
|
||||
layer_file.key_range.end,
|
||||
layer_file.lsn_range.start,
|
||||
layer_file.lsn_range.end,
|
||||
layer_file.is_delta,
|
||||
);
|
||||
print_layer_file(idx, &layer_file);
|
||||
|
||||
if layer_file.is_delta {
|
||||
read_delta_file(layer.path(), &ctx).await?;
|
||||
@@ -183,6 +168,18 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
LayerCmd::DumpLayerLocal { path } => {
|
||||
if let Ok(layer_file) = parse_filename(path.file_name().unwrap().to_str().unwrap()) {
|
||||
print_layer_file(0, &layer_file);
|
||||
|
||||
if layer_file.is_delta {
|
||||
read_delta_file(path, &ctx).await?;
|
||||
} else {
|
||||
read_image_file(path, &ctx).await?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
LayerCmd::RewriteSummary {
|
||||
layer_file_path,
|
||||
new_tenant_id,
|
||||
@@ -247,3 +244,15 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn print_layer_file(idx: usize, layer_file: &LayerFile) {
|
||||
println!(
|
||||
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
|
||||
idx,
|
||||
layer_file.key_range.start,
|
||||
layer_file.key_range.end,
|
||||
layer_file.lsn_range.start,
|
||||
layer_file.lsn_range.end,
|
||||
layer_file.is_delta,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -176,9 +176,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
let config = RemoteStorageConfig::from_toml_str(&cmd.config_toml_str)?;
|
||||
let storage = remote_storage::GenericRemoteStorage::from_config(&config).await;
|
||||
let cancel = CancellationToken::new();
|
||||
// Complexity limit: as we are running this command locally, we should have a lot of memory available, and we do not
|
||||
// need to limit the number of versions we are going to delete.
|
||||
storage
|
||||
.unwrap()
|
||||
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
|
||||
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel, None)
|
||||
.await?;
|
||||
}
|
||||
Commands::Key(dkc) => dkc.execute(),
|
||||
|
||||
@@ -102,10 +102,10 @@ message CheckRelExistsResponse {
|
||||
bool exists = 1;
|
||||
}
|
||||
|
||||
// Requests a base backup at a given LSN.
|
||||
// Requests a base backup.
|
||||
message GetBaseBackupRequest {
|
||||
// The LSN to fetch a base backup at.
|
||||
ReadLsn read_lsn = 1;
|
||||
// The LSN to fetch the base backup at. 0 or absent means the latest LSN known to the Pageserver.
|
||||
uint64 lsn = 1;
|
||||
// If true, logical replication slots will not be created.
|
||||
bool replica = 2;
|
||||
}
|
||||
|
||||
@@ -26,7 +26,7 @@ use utils::lsn::Lsn;
|
||||
use crate::proto;
|
||||
|
||||
/// A protocol error. Typically returned via try_from() or try_into().
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[derive(thiserror::Error, Clone, Debug)]
|
||||
pub enum ProtocolError {
|
||||
#[error("field '{0}' has invalid value '{1}'")]
|
||||
Invalid(&'static str, String),
|
||||
@@ -182,33 +182,28 @@ impl From<CheckRelExistsResponse> for proto::CheckRelExistsResponse {
|
||||
}
|
||||
}
|
||||
|
||||
/// Requests a base backup at a given LSN.
|
||||
/// Requests a base backup.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct GetBaseBackupRequest {
|
||||
/// The LSN to fetch a base backup at.
|
||||
pub read_lsn: ReadLsn,
|
||||
/// The LSN to fetch a base backup at. If None, uses the latest LSN known to the Pageserver.
|
||||
pub lsn: Option<Lsn>,
|
||||
/// If true, logical replication slots will not be created.
|
||||
pub replica: bool,
|
||||
}
|
||||
|
||||
impl TryFrom<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(pb: proto::GetBaseBackupRequest) -> Result<Self, Self::Error> {
|
||||
Ok(Self {
|
||||
read_lsn: pb
|
||||
.read_lsn
|
||||
.ok_or(ProtocolError::Missing("read_lsn"))?
|
||||
.try_into()?,
|
||||
impl From<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
|
||||
fn from(pb: proto::GetBaseBackupRequest) -> Self {
|
||||
Self {
|
||||
lsn: (pb.lsn != 0).then_some(Lsn(pb.lsn)),
|
||||
replica: pb.replica,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetBaseBackupRequest> for proto::GetBaseBackupRequest {
|
||||
fn from(request: GetBaseBackupRequest) -> Self {
|
||||
Self {
|
||||
read_lsn: Some(request.read_lsn.into()),
|
||||
lsn: request.lsn.unwrap_or_default().0,
|
||||
replica: request.replica,
|
||||
}
|
||||
}
|
||||
@@ -422,6 +417,39 @@ impl From<GetPageResponse> for proto::GetPageResponse {
|
||||
}
|
||||
}
|
||||
|
||||
impl GetPageResponse {
|
||||
/// Attempts to represent a tonic::Status as a GetPageResponse if appropriate. Returning a
|
||||
/// tonic::Status will terminate the GetPage stream, so per-request errors are emitted as a
|
||||
/// GetPageResponse with a non-OK status code instead.
|
||||
#[allow(clippy::result_large_err)]
|
||||
pub fn try_from_status(
|
||||
status: tonic::Status,
|
||||
request_id: RequestID,
|
||||
) -> Result<Self, tonic::Status> {
|
||||
// We shouldn't see an OK status here, because we're emitting an error.
|
||||
debug_assert_ne!(status.code(), tonic::Code::Ok);
|
||||
if status.code() == tonic::Code::Ok {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"unexpected OK status: {status:?}",
|
||||
)));
|
||||
}
|
||||
|
||||
// If we can't convert the tonic::Code to a GetPageStatusCode, this is not a per-request
|
||||
// error and we should return a tonic::Status to terminate the stream.
|
||||
let Ok(status_code) = status.code().try_into() else {
|
||||
return Err(status);
|
||||
};
|
||||
|
||||
// Return a GetPageResponse for the status.
|
||||
Ok(Self {
|
||||
request_id,
|
||||
status_code,
|
||||
reason: Some(status.message().to_string()),
|
||||
page_images: Vec::new(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// A GetPage response status code.
|
||||
///
|
||||
/// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream
|
||||
@@ -485,8 +513,42 @@ impl From<GetPageStatusCode> for i32 {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<tonic::Code> for GetPageStatusCode {
|
||||
type Error = tonic::Code;
|
||||
|
||||
fn try_from(code: tonic::Code) -> Result<Self, Self::Error> {
|
||||
use tonic::Code;
|
||||
|
||||
let status_code = match code {
|
||||
Code::Ok => Self::Ok,
|
||||
|
||||
// These are per-request errors, which should be returned as GetPageResponses.
|
||||
Code::AlreadyExists => Self::InvalidRequest,
|
||||
Code::DataLoss => Self::InternalError,
|
||||
Code::FailedPrecondition => Self::InvalidRequest,
|
||||
Code::InvalidArgument => Self::InvalidRequest,
|
||||
Code::Internal => Self::InternalError,
|
||||
Code::NotFound => Self::NotFound,
|
||||
Code::OutOfRange => Self::InvalidRequest,
|
||||
Code::ResourceExhausted => Self::SlowDown,
|
||||
|
||||
// These should terminate the stream by returning a tonic::Status.
|
||||
Code::Aborted
|
||||
| Code::Cancelled
|
||||
| Code::DeadlineExceeded
|
||||
| Code::PermissionDenied
|
||||
| Code::Unauthenticated
|
||||
| Code::Unavailable
|
||||
| Code::Unimplemented
|
||||
| Code::Unknown => return Err(code),
|
||||
};
|
||||
Ok(status_code)
|
||||
}
|
||||
}
|
||||
|
||||
// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other
|
||||
// shards will error.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct GetRelSizeRequest {
|
||||
pub read_lsn: ReadLsn,
|
||||
pub rel: RelTag,
|
||||
@@ -530,6 +592,7 @@ impl From<GetRelSizeResponse> for proto::GetRelSizeResponse {
|
||||
}
|
||||
|
||||
/// Requests an SLRU segment. Only valid on shard 0, other shards will error.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct GetSlruSegmentRequest {
|
||||
pub read_lsn: ReadLsn,
|
||||
pub kind: SlruKind,
|
||||
|
||||
@@ -9,6 +9,7 @@ license.workspace = true
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
futures.workspace = true
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::collections::{HashSet, VecDeque};
|
||||
use std::collections::{HashMap, HashSet, VecDeque};
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::pin::Pin;
|
||||
@@ -8,12 +8,12 @@ use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::Context;
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::models::{
|
||||
PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamRequest,
|
||||
};
|
||||
use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
|
||||
use pageserver_api::reltag::RelTag;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_page_api::proto;
|
||||
use rand::prelude::*;
|
||||
@@ -77,6 +77,16 @@ pub(crate) struct Args {
|
||||
#[clap(long, default_value = "1")]
|
||||
queue_depth: NonZeroUsize,
|
||||
|
||||
/// Batch size of contiguous pages generated by each client. This is equivalent to how Postgres
|
||||
/// will request page batches (e.g. prefetches or vectored reads). A batch counts as 1 RPS and
|
||||
/// 1 queue depth.
|
||||
///
|
||||
/// The libpq protocol does not support client-side batching, and will submit batches as many
|
||||
/// individual requests, in the hope that the server will batch them. Each batch still counts as
|
||||
/// 1 RPS and 1 queue depth.
|
||||
#[clap(long, default_value = "1")]
|
||||
batch_size: NonZeroUsize,
|
||||
|
||||
#[clap(long)]
|
||||
only_relnode: Option<u32>,
|
||||
|
||||
@@ -392,7 +402,16 @@ async fn run_worker(
|
||||
shared_state.start_work_barrier.wait().await;
|
||||
let client_start = Instant::now();
|
||||
let mut ticks_processed = 0;
|
||||
let mut inflight = VecDeque::new();
|
||||
let mut req_id = 0;
|
||||
let batch_size: usize = args.batch_size.into();
|
||||
|
||||
// Track inflight requests by request ID and start time. This times the request duration, and
|
||||
// ensures responses match requests. We don't expect responses back in any particular order.
|
||||
//
|
||||
// NB: this does not check that all requests received a response, because we don't wait for the
|
||||
// inflight requests to complete when the duration elapses.
|
||||
let mut inflight: HashMap<u64, Instant> = HashMap::new();
|
||||
|
||||
while !cancel.is_cancelled() {
|
||||
// Detect if a request took longer than the RPS rate
|
||||
if let Some(period) = &rps_period {
|
||||
@@ -408,36 +427,72 @@ async fn run_worker(
|
||||
}
|
||||
|
||||
while inflight.len() < args.queue_depth.get() {
|
||||
req_id += 1;
|
||||
let start = Instant::now();
|
||||
let req = {
|
||||
let (req_lsn, mod_lsn, rel, blks) = {
|
||||
/// Converts a compact i128 key to a relation tag and block number.
|
||||
fn key_to_block(key: i128) -> (RelTag, u32) {
|
||||
let key = Key::from_i128(key);
|
||||
assert!(key.is_rel_block_key());
|
||||
key.to_rel_block()
|
||||
.expect("we filter non-rel-block keys out above")
|
||||
}
|
||||
|
||||
// Pick a random page from a random relation.
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
assert!(key.is_rel_block_key());
|
||||
let (rel_tag, block_no) = key
|
||||
.to_rel_block()
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: if rng.gen_bool(args.req_latest_probability) {
|
||||
Lsn::MAX
|
||||
} else {
|
||||
r.timeline_lsn
|
||||
},
|
||||
not_modified_since: r.timeline_lsn,
|
||||
},
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
let (rel_tag, block_no) = key_to_block(key);
|
||||
|
||||
let mut blks = VecDeque::with_capacity(batch_size);
|
||||
blks.push_back(block_no);
|
||||
|
||||
// If requested, populate a batch of sequential pages. This is how Postgres will
|
||||
// request page batches (e.g. prefetches). If we hit the end of the relation, we
|
||||
// grow the batch towards the start too.
|
||||
for i in 1..batch_size {
|
||||
let (r, b) = key_to_block(key + i as i128);
|
||||
if r != rel_tag {
|
||||
break; // went outside relation
|
||||
}
|
||||
blks.push_back(b)
|
||||
}
|
||||
|
||||
if blks.len() < batch_size {
|
||||
// Grow batch backwards if needed.
|
||||
for i in 1..batch_size {
|
||||
let (r, b) = key_to_block(key - i as i128);
|
||||
if r != rel_tag {
|
||||
break; // went outside relation
|
||||
}
|
||||
blks.push_front(b)
|
||||
}
|
||||
}
|
||||
|
||||
// We assume that the entire batch can fit within the relation.
|
||||
assert_eq!(blks.len(), batch_size, "incomplete batch");
|
||||
|
||||
let req_lsn = if rng.gen_bool(args.req_latest_probability) {
|
||||
Lsn::MAX
|
||||
} else {
|
||||
r.timeline_lsn
|
||||
};
|
||||
(req_lsn, r.timeline_lsn, rel_tag, blks.into())
|
||||
};
|
||||
client.send_get_page(req).await.unwrap();
|
||||
inflight.push_back(start);
|
||||
client
|
||||
.send_get_page(req_id, req_lsn, mod_lsn, rel, blks)
|
||||
.await
|
||||
.unwrap();
|
||||
let old = inflight.insert(req_id, start);
|
||||
assert!(old.is_none(), "duplicate request ID {req_id}");
|
||||
}
|
||||
|
||||
let start = inflight.pop_front().unwrap();
|
||||
client.recv_get_page().await.unwrap();
|
||||
let (req_id, pages) = client.recv_get_page().await.unwrap();
|
||||
assert_eq!(pages.len(), batch_size, "unexpected page count");
|
||||
assert!(pages.iter().all(|p| !p.is_empty()), "empty page");
|
||||
let start = inflight
|
||||
.remove(&req_id)
|
||||
.expect("response for unknown request ID");
|
||||
let end = Instant::now();
|
||||
shared_state.live_stats.request_done();
|
||||
ticks_processed += 1;
|
||||
@@ -467,15 +522,24 @@ async fn run_worker(
|
||||
#[async_trait]
|
||||
trait Client: Send {
|
||||
/// Sends an asynchronous GetPage request to the pageserver.
|
||||
async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()>;
|
||||
async fn send_get_page(
|
||||
&mut self,
|
||||
req_id: u64,
|
||||
req_lsn: Lsn,
|
||||
mod_lsn: Lsn,
|
||||
rel: RelTag,
|
||||
blks: Vec<u32>,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Receives the next GetPage response from the pageserver.
|
||||
async fn recv_get_page(&mut self) -> anyhow::Result<PagestreamGetPageResponse>;
|
||||
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)>;
|
||||
}
|
||||
|
||||
/// A libpq-based Pageserver client.
|
||||
struct LibpqClient {
|
||||
inner: pageserver_client::page_service::PagestreamClient,
|
||||
// Track sent batches, so we know how many responses to expect.
|
||||
batch_sizes: VecDeque<usize>,
|
||||
}
|
||||
|
||||
impl LibpqClient {
|
||||
@@ -484,18 +548,55 @@ impl LibpqClient {
|
||||
.await?
|
||||
.pagestream(ttid.tenant_id, ttid.timeline_id)
|
||||
.await?;
|
||||
Ok(Self { inner })
|
||||
Ok(Self {
|
||||
inner,
|
||||
batch_sizes: VecDeque::new(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Client for LibpqClient {
|
||||
async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
|
||||
self.inner.getpage_send(req).await
|
||||
async fn send_get_page(
|
||||
&mut self,
|
||||
req_id: u64,
|
||||
req_lsn: Lsn,
|
||||
mod_lsn: Lsn,
|
||||
rel: RelTag,
|
||||
blks: Vec<u32>,
|
||||
) -> anyhow::Result<()> {
|
||||
// libpq doesn't support client-side batches, so we send a bunch of individual requests
|
||||
// instead in the hope that the server will batch them for us. We use the same request ID
|
||||
// for all, because we'll return a single batch response.
|
||||
self.batch_sizes.push_back(blks.len());
|
||||
for blkno in blks {
|
||||
let req = PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: req_id,
|
||||
request_lsn: req_lsn,
|
||||
not_modified_since: mod_lsn,
|
||||
},
|
||||
rel,
|
||||
blkno,
|
||||
};
|
||||
self.inner.getpage_send(req).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn recv_get_page(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
self.inner.getpage_recv().await
|
||||
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
|
||||
let batch_size = self.batch_sizes.pop_front().unwrap();
|
||||
let mut batch = Vec::with_capacity(batch_size);
|
||||
let mut req_id = None;
|
||||
for _ in 0..batch_size {
|
||||
let resp = self.inner.getpage_recv().await?;
|
||||
if req_id.is_none() {
|
||||
req_id = Some(resp.req.hdr.reqid);
|
||||
}
|
||||
assert_eq!(req_id, Some(resp.req.hdr.reqid), "request ID mismatch");
|
||||
batch.push(resp.page);
|
||||
}
|
||||
Ok((req_id.unwrap(), batch))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -532,31 +633,35 @@ impl GrpcClient {
|
||||
|
||||
#[async_trait]
|
||||
impl Client for GrpcClient {
|
||||
async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
|
||||
async fn send_get_page(
|
||||
&mut self,
|
||||
req_id: u64,
|
||||
req_lsn: Lsn,
|
||||
mod_lsn: Lsn,
|
||||
rel: RelTag,
|
||||
blks: Vec<u32>,
|
||||
) -> anyhow::Result<()> {
|
||||
let req = proto::GetPageRequest {
|
||||
request_id: 0,
|
||||
request_id: req_id,
|
||||
request_class: proto::GetPageClass::Normal as i32,
|
||||
read_lsn: Some(proto::ReadLsn {
|
||||
request_lsn: req.hdr.request_lsn.0,
|
||||
not_modified_since_lsn: req.hdr.not_modified_since.0,
|
||||
request_lsn: req_lsn.0,
|
||||
not_modified_since_lsn: mod_lsn.0,
|
||||
}),
|
||||
rel: Some(req.rel.into()),
|
||||
block_number: vec![req.blkno],
|
||||
rel: Some(rel.into()),
|
||||
block_number: blks,
|
||||
};
|
||||
self.req_tx.send(req).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn recv_get_page(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
|
||||
let resp = self.resp_rx.message().await?.unwrap();
|
||||
anyhow::ensure!(
|
||||
resp.status_code == proto::GetPageStatusCode::Ok as i32,
|
||||
"unexpected status code: {}",
|
||||
resp.status_code
|
||||
);
|
||||
Ok(PagestreamGetPageResponse {
|
||||
page: resp.page_image[0].clone(),
|
||||
req: PagestreamGetPageRequest::default(), // dummy
|
||||
})
|
||||
Ok((resp.request_id, resp.page_image))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use anyhow::Context;
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use metrics::core::{AtomicU64, GenericCounter};
|
||||
@@ -18,7 +19,10 @@ use utils::{
|
||||
use crate::{
|
||||
basebackup::send_basebackup_tarball,
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
metrics::{BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ},
|
||||
metrics::{
|
||||
BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ,
|
||||
BASEBACKUP_CACHE_SIZE,
|
||||
},
|
||||
task_mgr::TaskKind,
|
||||
tenant::{
|
||||
Timeline,
|
||||
@@ -35,8 +39,13 @@ pub struct BasebackupPrepareRequest {
|
||||
pub type BasebackupPrepareSender = UnboundedSender<BasebackupPrepareRequest>;
|
||||
pub type BasebackupPrepareReceiver = UnboundedReceiver<BasebackupPrepareRequest>;
|
||||
|
||||
type BasebackupRemoveEntrySender = UnboundedSender<Utf8PathBuf>;
|
||||
type BasebackupRemoveEntryReceiver = UnboundedReceiver<Utf8PathBuf>;
|
||||
#[derive(Clone)]
|
||||
struct CacheEntry {
|
||||
/// LSN at which the basebackup was taken.
|
||||
lsn: Lsn,
|
||||
/// Size of the basebackup archive in bytes.
|
||||
size_bytes: u64,
|
||||
}
|
||||
|
||||
/// BasebackupCache stores cached basebackup archives for timelines on local disk.
|
||||
///
|
||||
@@ -52,21 +61,12 @@ type BasebackupRemoveEntryReceiver = UnboundedReceiver<Utf8PathBuf>;
|
||||
/// and ~1 RPS for get requests.
|
||||
pub struct BasebackupCache {
|
||||
data_dir: Utf8PathBuf,
|
||||
config: BasebackupCacheConfig,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remove_entry_sender: BasebackupRemoveEntrySender,
|
||||
|
||||
entries: std::sync::Mutex<HashMap<TenantTimelineId, Lsn>>,
|
||||
|
||||
cancel: CancellationToken,
|
||||
entries: std::sync::Mutex<HashMap<TenantTimelineId, CacheEntry>>,
|
||||
|
||||
read_hit_count: GenericCounter<AtomicU64>,
|
||||
read_miss_count: GenericCounter<AtomicU64>,
|
||||
read_err_count: GenericCounter<AtomicU64>,
|
||||
|
||||
prepare_ok_count: GenericCounter<AtomicU64>,
|
||||
prepare_skip_count: GenericCounter<AtomicU64>,
|
||||
prepare_err_count: GenericCounter<AtomicU64>,
|
||||
}
|
||||
|
||||
impl BasebackupCache {
|
||||
@@ -82,35 +82,32 @@ impl BasebackupCache {
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
cancel: CancellationToken,
|
||||
) -> Arc<Self> {
|
||||
let (remove_entry_sender, remove_entry_receiver) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
let enabled = config.is_some();
|
||||
|
||||
let cache = Arc::new(BasebackupCache {
|
||||
data_dir,
|
||||
config: config.unwrap_or_default(),
|
||||
tenant_manager,
|
||||
remove_entry_sender,
|
||||
|
||||
entries: std::sync::Mutex::new(HashMap::new()),
|
||||
|
||||
cancel,
|
||||
|
||||
read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]),
|
||||
read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]),
|
||||
read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]),
|
||||
|
||||
prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]),
|
||||
prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
|
||||
prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]),
|
||||
});
|
||||
|
||||
if enabled {
|
||||
runtime_handle.spawn(
|
||||
cache
|
||||
.clone()
|
||||
.background(prepare_receiver, remove_entry_receiver),
|
||||
);
|
||||
if let Some(config) = config {
|
||||
let background = BackgroundTask {
|
||||
c: cache.clone(),
|
||||
|
||||
config,
|
||||
tenant_manager,
|
||||
cancel,
|
||||
|
||||
entry_count: 0,
|
||||
total_size_bytes: 0,
|
||||
|
||||
prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]),
|
||||
prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
|
||||
prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]),
|
||||
};
|
||||
runtime_handle.spawn(background.run(prepare_receiver));
|
||||
}
|
||||
|
||||
cache
|
||||
@@ -128,7 +125,7 @@ impl BasebackupCache {
|
||||
) -> Option<tokio::fs::File> {
|
||||
// Fast path. Check if the entry exists using the in-memory state.
|
||||
let tti = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
if self.entries.lock().unwrap().get(&tti) != Some(&lsn) {
|
||||
if self.entries.lock().unwrap().get(&tti).map(|e| e.lsn) != Some(lsn) {
|
||||
self.read_miss_count.inc();
|
||||
return None;
|
||||
}
|
||||
@@ -166,6 +163,42 @@ impl BasebackupCache {
|
||||
self.data_dir
|
||||
.join(Self::entry_filename(tenant_id, timeline_id, lsn))
|
||||
}
|
||||
}
|
||||
|
||||
/// The background task that does the job to prepare basebackups
|
||||
/// and manage the cache entries on disk.
|
||||
/// It is a separate struct from BasebackupCache to allow holding
|
||||
/// a mutable reference to this state without a mutex lock,
|
||||
/// while BasebackupCache is referenced by the clients.
|
||||
struct BackgroundTask {
|
||||
c: Arc<BasebackupCache>,
|
||||
|
||||
config: BasebackupCacheConfig,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
cancel: CancellationToken,
|
||||
|
||||
/// Number of the entries in the cache.
|
||||
/// This counter is used for metrics and applying cache limits.
|
||||
/// It generally should be equal to c.entries.len(), but it's calculated
|
||||
/// pessimistically for abnormal situations: if we encountered some errors
|
||||
/// during removing the entry from disk, we won't decrement this counter to
|
||||
/// make sure that we don't exceed the limit with "trashed" files on the disk.
|
||||
/// It will also count files in the data_dir that are not valid cache entries.
|
||||
entry_count: usize,
|
||||
/// Total size of all the entries on the disk.
|
||||
/// This counter is used for metrics and applying cache limits.
|
||||
/// Similar to entry_count, it is calculated pessimistically for abnormal situations.
|
||||
total_size_bytes: u64,
|
||||
|
||||
prepare_ok_count: GenericCounter<AtomicU64>,
|
||||
prepare_skip_count: GenericCounter<AtomicU64>,
|
||||
prepare_err_count: GenericCounter<AtomicU64>,
|
||||
}
|
||||
|
||||
impl BackgroundTask {
|
||||
fn tmp_dir(&self) -> Utf8PathBuf {
|
||||
self.c.data_dir.join("tmp")
|
||||
}
|
||||
|
||||
fn entry_tmp_path(
|
||||
&self,
|
||||
@@ -173,9 +206,8 @@ impl BasebackupCache {
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Utf8PathBuf {
|
||||
self.data_dir
|
||||
.join("tmp")
|
||||
.join(Self::entry_filename(tenant_id, timeline_id, lsn))
|
||||
self.tmp_dir()
|
||||
.join(BasebackupCache::entry_filename(tenant_id, timeline_id, lsn))
|
||||
}
|
||||
|
||||
fn parse_entry_filename(filename: &str) -> Option<(TenantId, TimelineId, Lsn)> {
|
||||
@@ -194,18 +226,21 @@ impl BasebackupCache {
|
||||
Some((tenant_id, timeline_id, lsn))
|
||||
}
|
||||
|
||||
async fn cleanup(&self) -> anyhow::Result<()> {
|
||||
// Cleanup tmp directory.
|
||||
let tmp_dir = self.data_dir.join("tmp");
|
||||
let mut tmp_dir = tokio::fs::read_dir(&tmp_dir).await?;
|
||||
while let Some(dir_entry) = tmp_dir.next_entry().await? {
|
||||
if let Err(e) = tokio::fs::remove_file(dir_entry.path()).await {
|
||||
tracing::warn!("Failed to remove basebackup cache tmp file: {:#}", e);
|
||||
}
|
||||
// Recreate the tmp directory to clear all files in it.
|
||||
async fn clean_tmp_dir(&self) -> anyhow::Result<()> {
|
||||
let tmp_dir = self.tmp_dir();
|
||||
if tmp_dir.exists() {
|
||||
tokio::fs::remove_dir_all(&tmp_dir).await?;
|
||||
}
|
||||
tokio::fs::create_dir_all(&tmp_dir).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Remove outdated entries.
|
||||
let entries_old = self.entries.lock().unwrap().clone();
|
||||
async fn cleanup(&mut self) -> anyhow::Result<()> {
|
||||
self.clean_tmp_dir().await?;
|
||||
|
||||
// Leave only up-to-date entries.
|
||||
let entries_old = self.c.entries.lock().unwrap().clone();
|
||||
let mut entries_new = HashMap::new();
|
||||
for (tenant_shard_id, tenant_slot) in self.tenant_manager.list() {
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
@@ -218,43 +253,42 @@ impl BasebackupCache {
|
||||
|
||||
for timeline in tenant.list_timelines() {
|
||||
let tti = TenantTimelineId::new(tenant_id, timeline.timeline_id);
|
||||
if let Some(&entry_lsn) = entries_old.get(&tti) {
|
||||
if timeline.get_last_record_lsn() <= entry_lsn {
|
||||
entries_new.insert(tti, entry_lsn);
|
||||
if let Some(entry) = entries_old.get(&tti) {
|
||||
if timeline.get_last_record_lsn() <= entry.lsn {
|
||||
entries_new.insert(tti, entry.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (&tti, &lsn) in entries_old.iter() {
|
||||
// Try to remove all entries that are not up-to-date.
|
||||
for (&tti, entry) in entries_old.iter() {
|
||||
if !entries_new.contains_key(&tti) {
|
||||
self.remove_entry_sender
|
||||
.send(self.entry_path(tti.tenant_id, tti.timeline_id, lsn))
|
||||
.unwrap();
|
||||
self.try_remove_entry(tti.tenant_id, tti.timeline_id, entry)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
BASEBACKUP_CACHE_ENTRIES.set(entries_new.len() as i64);
|
||||
*self.entries.lock().unwrap() = entries_new;
|
||||
// Note: BackgroundTask is the only writer for self.c.entries,
|
||||
// so it couldn't have been modified concurrently.
|
||||
*self.c.entries.lock().unwrap() = entries_new;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn on_startup(&self) -> anyhow::Result<()> {
|
||||
// Create data_dir and tmp directory if they do not exist.
|
||||
tokio::fs::create_dir_all(&self.data_dir.join("tmp"))
|
||||
async fn on_startup(&mut self) -> anyhow::Result<()> {
|
||||
// Create data_dir if it does not exist.
|
||||
tokio::fs::create_dir_all(&self.c.data_dir)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to create basebackup cache data_dir {:?}: {:?}",
|
||||
self.data_dir,
|
||||
e
|
||||
)
|
||||
})?;
|
||||
.context("Failed to create basebackup cache data directory")?;
|
||||
|
||||
self.clean_tmp_dir()
|
||||
.await
|
||||
.context("Failed to clean tmp directory")?;
|
||||
|
||||
// Read existing entries from the data_dir and add them to in-memory state.
|
||||
let mut entries = HashMap::new();
|
||||
let mut dir = tokio::fs::read_dir(&self.data_dir).await?;
|
||||
let mut entries = HashMap::<TenantTimelineId, CacheEntry>::new();
|
||||
let mut dir = tokio::fs::read_dir(&self.c.data_dir).await?;
|
||||
while let Some(dir_entry) = dir.next_entry().await? {
|
||||
let filename = dir_entry.file_name();
|
||||
|
||||
@@ -263,33 +297,43 @@ impl BasebackupCache {
|
||||
continue;
|
||||
}
|
||||
|
||||
let size_bytes = dir_entry
|
||||
.metadata()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
anyhow::anyhow!("Failed to read metadata for file {:?}: {:?}", filename, e)
|
||||
})?
|
||||
.len();
|
||||
|
||||
self.entry_count += 1;
|
||||
BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64);
|
||||
|
||||
self.total_size_bytes += size_bytes;
|
||||
BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes);
|
||||
|
||||
let parsed = Self::parse_entry_filename(filename.to_string_lossy().as_ref());
|
||||
let Some((tenant_id, timeline_id, lsn)) = parsed else {
|
||||
tracing::warn!("Invalid basebackup cache file name: {:?}", filename);
|
||||
continue;
|
||||
};
|
||||
|
||||
let cur_entry = CacheEntry { lsn, size_bytes };
|
||||
|
||||
let tti = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
|
||||
use std::collections::hash_map::Entry::*;
|
||||
|
||||
match entries.entry(tti) {
|
||||
Occupied(mut entry) => {
|
||||
let entry_lsn = *entry.get();
|
||||
let found_entry = entry.get();
|
||||
// Leave only the latest entry, remove the old one.
|
||||
if lsn < entry_lsn {
|
||||
self.remove_entry_sender.send(self.entry_path(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
))?;
|
||||
} else if lsn > entry_lsn {
|
||||
self.remove_entry_sender.send(self.entry_path(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
entry_lsn,
|
||||
))?;
|
||||
entry.insert(lsn);
|
||||
if cur_entry.lsn < found_entry.lsn {
|
||||
self.try_remove_entry(tenant_id, timeline_id, &cur_entry)
|
||||
.await;
|
||||
} else if cur_entry.lsn > found_entry.lsn {
|
||||
self.try_remove_entry(tenant_id, timeline_id, found_entry)
|
||||
.await;
|
||||
entry.insert(cur_entry);
|
||||
} else {
|
||||
// Two different filenames parsed to the same timline_id and LSN.
|
||||
// Should never happen.
|
||||
@@ -300,22 +344,17 @@ impl BasebackupCache {
|
||||
}
|
||||
}
|
||||
Vacant(entry) => {
|
||||
entry.insert(lsn);
|
||||
entry.insert(cur_entry);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
|
||||
*self.entries.lock().unwrap() = entries;
|
||||
*self.c.entries.lock().unwrap() = entries;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn background(
|
||||
self: Arc<Self>,
|
||||
mut prepare_receiver: BasebackupPrepareReceiver,
|
||||
mut remove_entry_receiver: BasebackupRemoveEntryReceiver,
|
||||
) {
|
||||
async fn run(mut self, mut prepare_receiver: BasebackupPrepareReceiver) {
|
||||
// Panic in the background is a safe fallback.
|
||||
// It will drop receivers and the cache will be effectively disabled.
|
||||
self.on_startup()
|
||||
@@ -338,11 +377,6 @@ impl BasebackupCache {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
Some(req) = remove_entry_receiver.recv() => {
|
||||
if let Err(e) = tokio::fs::remove_file(req).await {
|
||||
tracing::warn!("Failed to remove basebackup cache file: {:#}", e);
|
||||
}
|
||||
}
|
||||
_ = cleanup_ticker.tick() => {
|
||||
self.cleanup().await.unwrap_or_else(|e| {
|
||||
tracing::warn!("Failed to clean up basebackup cache: {:#}", e);
|
||||
@@ -356,6 +390,67 @@ impl BasebackupCache {
|
||||
}
|
||||
}
|
||||
|
||||
/// Try to remove an entry from disk.
|
||||
/// The caller is responsible for removing the entry from the in-memory state.
|
||||
/// Updates size counters and corresponding metrics.
|
||||
/// Ignores the filesystem errors as not-so-important, but the size counters
|
||||
/// are not decremented in this case, so the file will continue to be counted
|
||||
/// towards the size limits.
|
||||
async fn try_remove_entry(
|
||||
&mut self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
entry: &CacheEntry,
|
||||
) {
|
||||
let entry_path = self.c.entry_path(tenant_id, timeline_id, entry.lsn);
|
||||
|
||||
match tokio::fs::remove_file(&entry_path).await {
|
||||
Ok(_) => {}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"Failed to remove basebackup cache file for tenant {} timeline {} LSN {}: {:#}",
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
entry.lsn,
|
||||
e
|
||||
);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
self.entry_count -= 1;
|
||||
BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64);
|
||||
|
||||
self.total_size_bytes -= entry.size_bytes;
|
||||
BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes);
|
||||
}
|
||||
|
||||
/// Insert the cache entry into in-memory state and update the size counters.
|
||||
/// Assumes that the file for the entry already exists on disk.
|
||||
/// If the entry already exists with previous LSN, it will be removed.
|
||||
async fn upsert_entry(
|
||||
&mut self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
entry: CacheEntry,
|
||||
) {
|
||||
let tti = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
|
||||
self.entry_count += 1;
|
||||
BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64);
|
||||
|
||||
self.total_size_bytes += entry.size_bytes;
|
||||
BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes);
|
||||
|
||||
let old_entry = self.c.entries.lock().unwrap().insert(tti, entry);
|
||||
|
||||
if let Some(old_entry) = old_entry {
|
||||
self.try_remove_entry(tenant_id, timeline_id, &old_entry)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Prepare a basebackup for the given timeline.
|
||||
///
|
||||
/// If the basebackup already exists with a higher LSN or the timeline already
|
||||
@@ -364,7 +459,7 @@ impl BasebackupCache {
|
||||
/// The basebackup is prepared in a temporary directory and then moved to the final
|
||||
/// location to make the operation atomic.
|
||||
async fn prepare_basebackup(
|
||||
&self,
|
||||
&mut self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
req_lsn: Lsn,
|
||||
@@ -378,30 +473,44 @@ impl BasebackupCache {
|
||||
|
||||
let tti = TenantTimelineId::new(tenant_shard_id.tenant_id, timeline_id);
|
||||
|
||||
// TODO(diko): I don't think we will hit the limit,
|
||||
// but if we do, it makes sense to try to evict oldest entries. here
|
||||
if self.entry_count >= self.config.max_size_entries {
|
||||
tracing::info!(
|
||||
%tenant_shard_id,
|
||||
%timeline_id,
|
||||
%req_lsn,
|
||||
"Basebackup cache is full (max_size_entries), skipping basebackup",
|
||||
);
|
||||
self.prepare_skip_count.inc();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if self.total_size_bytes >= self.config.max_total_size_bytes {
|
||||
tracing::info!(
|
||||
%tenant_shard_id,
|
||||
%timeline_id,
|
||||
%req_lsn,
|
||||
"Basebackup cache is full (max_total_size_bytes), skipping basebackup",
|
||||
);
|
||||
self.prepare_skip_count.inc();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
{
|
||||
let entries = self.entries.lock().unwrap();
|
||||
if let Some(&entry_lsn) = entries.get(&tti) {
|
||||
if entry_lsn >= req_lsn {
|
||||
let entries = self.c.entries.lock().unwrap();
|
||||
if let Some(entry) = entries.get(&tti) {
|
||||
if entry.lsn >= req_lsn {
|
||||
tracing::info!(
|
||||
%timeline_id,
|
||||
%req_lsn,
|
||||
%entry_lsn,
|
||||
%entry.lsn,
|
||||
"Basebackup entry already exists for timeline with higher LSN, skipping basebackup",
|
||||
);
|
||||
self.prepare_skip_count.inc();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
if entries.len() as i64 >= self.config.max_size_entries {
|
||||
tracing::info!(
|
||||
%timeline_id,
|
||||
%req_lsn,
|
||||
"Basebackup cache is full, skipping basebackup",
|
||||
);
|
||||
self.prepare_skip_count.inc();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
let tenant = self
|
||||
@@ -437,47 +546,52 @@ impl BasebackupCache {
|
||||
.prepare_basebackup_tmp(&entry_tmp_path, &timeline, req_lsn)
|
||||
.await;
|
||||
|
||||
if let Err(err) = res {
|
||||
tracing::info!("Failed to prepare basebackup tmp file: {:#}", err);
|
||||
// Try to clean up tmp file. If we fail, the background clean up task will take care of it.
|
||||
match tokio::fs::remove_file(&entry_tmp_path).await {
|
||||
Ok(_) => {}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
||||
Err(e) => {
|
||||
tracing::info!("Failed to remove basebackup tmp file: {:?}", e);
|
||||
let entry = match res {
|
||||
Ok(entry) => entry,
|
||||
Err(err) => {
|
||||
tracing::info!("Failed to prepare basebackup tmp file: {:#}", err);
|
||||
// Try to clean up tmp file. If we fail, the background clean up task will take care of it.
|
||||
match tokio::fs::remove_file(&entry_tmp_path).await {
|
||||
Ok(_) => {}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
||||
Err(e) => {
|
||||
tracing::info!("Failed to remove basebackup tmp file: {:?}", e);
|
||||
}
|
||||
}
|
||||
return Err(err);
|
||||
}
|
||||
return Err(err);
|
||||
}
|
||||
};
|
||||
|
||||
// Move the tmp file to the final location atomically.
|
||||
let entry_path = self.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
|
||||
// The tmp file is fsynced, so it's guaranteed that we will not have a partial file
|
||||
// in the main directory.
|
||||
// It's not necessary to fsync the inode after renaming, because the worst case is that
|
||||
// the rename operation will be rolled back on the disk failure, the entry will disappear
|
||||
// from the main directory, and the entry access will cause a cache miss.
|
||||
let entry_path = self
|
||||
.c
|
||||
.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
|
||||
tokio::fs::rename(&entry_tmp_path, &entry_path).await?;
|
||||
|
||||
let mut entries = self.entries.lock().unwrap();
|
||||
if let Some(old_lsn) = entries.insert(tti, req_lsn) {
|
||||
// Remove the old entry if it exists.
|
||||
self.remove_entry_sender
|
||||
.send(self.entry_path(tenant_shard_id.tenant_id, timeline_id, old_lsn))
|
||||
.unwrap();
|
||||
}
|
||||
BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
|
||||
self.upsert_entry(tenant_shard_id.tenant_id, timeline_id, entry)
|
||||
.await;
|
||||
|
||||
self.prepare_ok_count.inc();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Prepares a basebackup in a temporary file.
|
||||
/// Guarantees that the tmp file is fsynced before returning.
|
||||
async fn prepare_basebackup_tmp(
|
||||
&self,
|
||||
emptry_tmp_path: &Utf8Path,
|
||||
entry_tmp_path: &Utf8Path,
|
||||
timeline: &Arc<Timeline>,
|
||||
req_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<CacheEntry> {
|
||||
let ctx = RequestContext::new(TaskKind::BasebackupCache, DownloadBehavior::Download);
|
||||
let ctx = ctx.with_scope_timeline(timeline);
|
||||
|
||||
let file = tokio::fs::File::create(emptry_tmp_path).await?;
|
||||
let file = tokio::fs::File::create(entry_tmp_path).await?;
|
||||
let mut writer = BufWriter::new(file);
|
||||
|
||||
let mut encoder = GzipEncoder::with_quality(
|
||||
@@ -513,6 +627,12 @@ impl BasebackupCache {
|
||||
writer.flush().await?;
|
||||
writer.into_inner().sync_all().await?;
|
||||
|
||||
Ok(())
|
||||
// TODO(diko): we can count it via Writer wrapper instead of a syscall.
|
||||
let size_bytes = tokio::fs::metadata(entry_tmp_path).await?.len();
|
||||
|
||||
Ok(CacheEntry {
|
||||
lsn: req_lsn,
|
||||
size_bytes,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ use pageserver::deletion_queue::DeletionQueue;
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::feature_resolver::FeatureResolver;
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::page_service::GrpcPageServiceHandler;
|
||||
use pageserver::task_mgr::{
|
||||
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
|
||||
};
|
||||
@@ -158,7 +159,6 @@ fn main() -> anyhow::Result<()> {
|
||||
// (maybe we should automate this with a visitor?).
|
||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||
info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
|
||||
info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
|
||||
info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation");
|
||||
info!(?conf.page_service_pipelining, "starting with page service pipelining config");
|
||||
info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config");
|
||||
@@ -573,7 +573,8 @@ fn start_pageserver(
|
||||
tokio::sync::mpsc::unbounded_channel();
|
||||
let deletion_queue_client = deletion_queue.new_client();
|
||||
let background_purges = mgr::BackgroundPurges::default();
|
||||
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
|
||||
|
||||
let tenant_manager = mgr::init(
|
||||
conf,
|
||||
background_purges.clone(),
|
||||
TenantSharedResources {
|
||||
@@ -584,10 +585,10 @@ fn start_pageserver(
|
||||
basebackup_prepare_sender,
|
||||
feature_resolver,
|
||||
},
|
||||
order,
|
||||
shutdown_pageserver.clone(),
|
||||
))?;
|
||||
);
|
||||
let tenant_manager = Arc::new(tenant_manager);
|
||||
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(tenant_manager.clone(), order))?;
|
||||
|
||||
let basebackup_cache = BasebackupCache::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
@@ -815,10 +816,11 @@ fn start_pageserver(
|
||||
// necessary?
|
||||
let mut page_service_grpc = None;
|
||||
if let Some(grpc_listener) = grpc_listener {
|
||||
page_service_grpc = Some(page_service::spawn_grpc(
|
||||
page_service_grpc = Some(GrpcPageServiceHandler::spawn(
|
||||
tenant_manager.clone(),
|
||||
grpc_auth,
|
||||
otel_guard.as_ref().map(|g| g.dispatch.clone()),
|
||||
conf.get_vectored_concurrent_io,
|
||||
grpc_listener,
|
||||
)?);
|
||||
}
|
||||
|
||||
@@ -27,7 +27,6 @@ use reqwest::Url;
|
||||
use storage_broker::Uri;
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
use utils::logging::{LogFormat, SecretString};
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
|
||||
use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
|
||||
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
@@ -211,8 +210,6 @@ pub struct PageServerConf {
|
||||
/// Optionally disable disk syncs (unsafe!)
|
||||
pub no_sync: bool,
|
||||
|
||||
pub wal_receiver_protocol: PostgresClientProtocol,
|
||||
|
||||
pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig,
|
||||
|
||||
pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo,
|
||||
@@ -421,7 +418,6 @@ impl PageServerConf {
|
||||
virtual_file_io_engine,
|
||||
tenant_config,
|
||||
no_sync,
|
||||
wal_receiver_protocol,
|
||||
page_service_pipelining,
|
||||
get_vectored_concurrent_io,
|
||||
enable_read_path_debugging,
|
||||
@@ -484,7 +480,6 @@ impl PageServerConf {
|
||||
import_pgdata_upcall_api,
|
||||
import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from),
|
||||
import_pgdata_aws_endpoint_url,
|
||||
wal_receiver_protocol,
|
||||
page_service_pipelining,
|
||||
get_vectored_concurrent_io,
|
||||
tracing,
|
||||
|
||||
@@ -159,14 +159,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
|
||||
Ok(m) => {
|
||||
// Since we run one time at startup, be generous in our logging and
|
||||
// dump all metadata.
|
||||
tracing::info!(
|
||||
"Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}",
|
||||
m.postgres_host,
|
||||
m.postgres_port,
|
||||
m.http_host,
|
||||
m.http_port,
|
||||
m.other
|
||||
);
|
||||
tracing::info!("Loaded node metadata: {m}");
|
||||
|
||||
let az_id = {
|
||||
let az_id_from_metadata = m
|
||||
@@ -195,6 +188,8 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
|
||||
node_id: conf.id,
|
||||
listen_pg_addr: m.postgres_host,
|
||||
listen_pg_port: m.postgres_port,
|
||||
listen_grpc_addr: m.grpc_host,
|
||||
listen_grpc_port: m.grpc_port,
|
||||
listen_http_addr: m.http_host,
|
||||
listen_http_port: m.http_port,
|
||||
listen_https_port: m.https_port,
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
|
||||
use pageserver_api::config::NodeMetadata;
|
||||
use posthog_client_lite::{
|
||||
FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
|
||||
CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
|
||||
PostHogFlagFilterPropertyValue,
|
||||
};
|
||||
use remote_storage::RemoteStorageKind;
|
||||
use serde_json::json;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::TenantId;
|
||||
|
||||
@@ -11,11 +15,15 @@ use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION};
|
||||
#[derive(Clone)]
|
||||
pub struct FeatureResolver {
|
||||
inner: Option<Arc<FeatureResolverBackgroundLoop>>,
|
||||
internal_properties: Option<Arc<HashMap<String, PostHogFlagFilterPropertyValue>>>,
|
||||
}
|
||||
|
||||
impl FeatureResolver {
|
||||
pub fn new_disabled() -> Self {
|
||||
Self { inner: None }
|
||||
Self {
|
||||
inner: None,
|
||||
internal_properties: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn spawn(
|
||||
@@ -36,14 +44,142 @@ impl FeatureResolver {
|
||||
shutdown_pageserver,
|
||||
);
|
||||
let inner = Arc::new(inner);
|
||||
// TODO: make this configurable
|
||||
inner.clone().spawn(handle, Duration::from_secs(60));
|
||||
Ok(FeatureResolver { inner: Some(inner) })
|
||||
|
||||
// The properties shared by all tenants on this pageserver.
|
||||
let internal_properties = {
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert(
|
||||
"pageserver_id".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String(conf.id.to_string()),
|
||||
);
|
||||
if let Some(availability_zone) = &conf.availability_zone {
|
||||
properties.insert(
|
||||
"availability_zone".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String(availability_zone.clone()),
|
||||
);
|
||||
}
|
||||
// Infer region based on the remote storage config.
|
||||
if let Some(remote_storage) = &conf.remote_storage_config {
|
||||
match &remote_storage.storage {
|
||||
RemoteStorageKind::AwsS3(config) => {
|
||||
properties.insert(
|
||||
"region".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String(format!(
|
||||
"aws-{}",
|
||||
config.bucket_region
|
||||
)),
|
||||
);
|
||||
}
|
||||
RemoteStorageKind::AzureContainer(config) => {
|
||||
properties.insert(
|
||||
"region".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String(format!(
|
||||
"azure-{}",
|
||||
config.container_region
|
||||
)),
|
||||
);
|
||||
}
|
||||
RemoteStorageKind::LocalFs { .. } => {
|
||||
properties.insert(
|
||||
"region".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String("local".to_string()),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO: move this to a background task so that we don't block startup in case of slow disk
|
||||
let metadata_path = conf.metadata_path();
|
||||
match std::fs::read_to_string(&metadata_path) {
|
||||
Ok(metadata_str) => match serde_json::from_str::<NodeMetadata>(&metadata_str) {
|
||||
Ok(metadata) => {
|
||||
properties.insert(
|
||||
"hostname".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String(metadata.http_host),
|
||||
);
|
||||
if let Some(cplane_region) = metadata.other.get("region_id") {
|
||||
if let Some(cplane_region) = cplane_region.as_str() {
|
||||
// This region contains the cell number
|
||||
properties.insert(
|
||||
"neon_region".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String(
|
||||
cplane_region.to_string(),
|
||||
),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to parse metadata.json: {}", e);
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to read metadata.json: {}", e);
|
||||
}
|
||||
}
|
||||
Arc::new(properties)
|
||||
};
|
||||
let fake_tenants = {
|
||||
let mut tenants = Vec::new();
|
||||
for i in 0..10 {
|
||||
let distinct_id = format!(
|
||||
"fake_tenant_{}_{}_{}",
|
||||
conf.availability_zone.as_deref().unwrap_or_default(),
|
||||
conf.id,
|
||||
i
|
||||
);
|
||||
let properties = Self::collect_properties_inner(
|
||||
distinct_id.clone(),
|
||||
Some(&internal_properties),
|
||||
);
|
||||
tenants.push(CaptureEvent {
|
||||
event: "initial_tenant_report".to_string(),
|
||||
distinct_id,
|
||||
properties: json!({ "$set": properties }), // use `$set` to set the person properties instead of the event properties
|
||||
});
|
||||
}
|
||||
tenants
|
||||
};
|
||||
// TODO: make refresh period configurable
|
||||
inner
|
||||
.clone()
|
||||
.spawn(handle, Duration::from_secs(60), fake_tenants);
|
||||
Ok(FeatureResolver {
|
||||
inner: Some(inner),
|
||||
internal_properties: Some(internal_properties),
|
||||
})
|
||||
} else {
|
||||
Ok(FeatureResolver { inner: None })
|
||||
Ok(FeatureResolver {
|
||||
inner: None,
|
||||
internal_properties: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn collect_properties_inner(
|
||||
tenant_id: String,
|
||||
internal_properties: Option<&HashMap<String, PostHogFlagFilterPropertyValue>>,
|
||||
) -> HashMap<String, PostHogFlagFilterPropertyValue> {
|
||||
let mut properties = HashMap::new();
|
||||
if let Some(internal_properties) = internal_properties {
|
||||
for (key, value) in internal_properties.iter() {
|
||||
properties.insert(key.clone(), value.clone());
|
||||
}
|
||||
}
|
||||
properties.insert(
|
||||
"tenant_id".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String(tenant_id),
|
||||
);
|
||||
properties
|
||||
}
|
||||
|
||||
/// Collect all properties availble for the feature flag evaluation.
|
||||
pub(crate) fn collect_properties(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> HashMap<String, PostHogFlagFilterPropertyValue> {
|
||||
Self::collect_properties_inner(tenant_id.to_string(), self.internal_properties.as_deref())
|
||||
}
|
||||
|
||||
/// Evaluate a multivariate feature flag. Currently, we do not support any properties.
|
||||
///
|
||||
/// Error handling: the caller should inspect the error and decide the behavior when a feature flag
|
||||
@@ -58,7 +194,7 @@ impl FeatureResolver {
|
||||
let res = inner.feature_store().evaluate_multivariate(
|
||||
flag_key,
|
||||
&tenant_id.to_string(),
|
||||
&HashMap::new(),
|
||||
&self.collect_properties(tenant_id),
|
||||
);
|
||||
match &res {
|
||||
Ok(value) => {
|
||||
@@ -96,7 +232,7 @@ impl FeatureResolver {
|
||||
let res = inner.feature_store().evaluate_boolean(
|
||||
flag_key,
|
||||
&tenant_id.to_string(),
|
||||
&HashMap::new(),
|
||||
&self.collect_properties(tenant_id),
|
||||
);
|
||||
match &res {
|
||||
Ok(()) => {
|
||||
|
||||
@@ -43,6 +43,7 @@ use pageserver_api::models::{
|
||||
use pageserver_api::shard::{ShardCount, TenantShardId};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
|
||||
use scopeguard::defer;
|
||||
use serde_json::json;
|
||||
use tenant_size_model::svg::SvgBranchKind;
|
||||
use tenant_size_model::{SizeResult, StorageModel};
|
||||
use tokio::time::Instant;
|
||||
@@ -72,6 +73,7 @@ use crate::tenant::remote_timeline_client::{
|
||||
use crate::tenant::secondary::SecondaryController;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
|
||||
use crate::tenant::timeline::{
|
||||
CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline,
|
||||
@@ -1450,7 +1452,10 @@ async fn timeline_layer_scan_disposable_keys(
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
|
||||
.with_scope_timeline(&timeline);
|
||||
|
||||
let guard = timeline.layers.read().await;
|
||||
let guard = timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Layer {tenant_shard_id}/{timeline_id}/{layer_name} not found").into(),
|
||||
@@ -3679,23 +3684,24 @@ async fn tenant_evaluate_feature_flag(
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id);
|
||||
if as_type == "boolean" {
|
||||
let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
|
||||
let result = result.map(|_| true).map_err(|e| e.to_string());
|
||||
json_response(StatusCode::OK, result)
|
||||
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
|
||||
} else if as_type == "multivariate" {
|
||||
let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
|
||||
json_response(StatusCode::OK, result)
|
||||
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
|
||||
} else {
|
||||
// Auto infer the type of the feature flag.
|
||||
let is_boolean = tenant.feature_resolver.is_feature_flag_boolean(&flag).map_err(|e| ApiError::InternalServerError(anyhow::anyhow!("{e}")))?;
|
||||
if is_boolean {
|
||||
let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
|
||||
let result = result.map(|_| true).map_err(|e| e.to_string());
|
||||
json_response(StatusCode::OK, result)
|
||||
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
|
||||
} else {
|
||||
let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
|
||||
json_response(StatusCode::OK, result)
|
||||
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1053,6 +1053,15 @@ pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("Failed to register pageserver_tenant_states_count metric")
|
||||
});
|
||||
|
||||
pub(crate) static TIMELINE_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_timeline_states_count",
|
||||
"Count of timelines per state",
|
||||
&["state"]
|
||||
)
|
||||
.expect("Failed to register pageserver_timeline_states_count metric")
|
||||
});
|
||||
|
||||
/// A set of broken tenants.
|
||||
///
|
||||
/// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
|
||||
@@ -2855,7 +2864,6 @@ pub(crate) struct WalIngestMetrics {
|
||||
pub(crate) records_received: IntCounter,
|
||||
pub(crate) records_observed: IntCounter,
|
||||
pub(crate) records_committed: IntCounter,
|
||||
pub(crate) records_filtered: IntCounter,
|
||||
pub(crate) values_committed_metadata_images: IntCounter,
|
||||
pub(crate) values_committed_metadata_deltas: IntCounter,
|
||||
pub(crate) values_committed_data_images: IntCounter,
|
||||
@@ -2911,11 +2919,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
|
||||
"Number of WAL records which resulted in writes to pageserver storage"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
records_filtered: register_int_counter!(
|
||||
"pageserver_wal_ingest_records_filtered",
|
||||
"Number of WAL records filtered out due to sharding"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]),
|
||||
values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]),
|
||||
values_committed_data_images: values_committed.with_label_values(&["data", "image"]),
|
||||
@@ -3331,6 +3334,8 @@ impl TimelineMetrics {
|
||||
&timeline_id,
|
||||
);
|
||||
|
||||
TIMELINE_STATE_METRIC.with_label_values(&["active"]).inc();
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
shard_id,
|
||||
@@ -3485,6 +3490,8 @@ impl TimelineMetrics {
|
||||
return;
|
||||
}
|
||||
|
||||
TIMELINE_STATE_METRIC.with_label_values(&["active"]).dec();
|
||||
|
||||
let tenant_id = &self.tenant_id;
|
||||
let timeline_id = &self.timeline_id;
|
||||
let shard_id = &self.shard_id;
|
||||
@@ -4421,18 +4428,16 @@ pub(crate) static BASEBACKUP_CACHE_PREPARE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_basebackup_cache_entries_total",
|
||||
"Number of entries in the basebackup cache"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// FIXME: Support basebackup cache size metrics.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_basebackup_cache_size_bytes",
|
||||
"Total size of all basebackup cache entries on disk in bytes"
|
||||
)
|
||||
|
||||
@@ -14,7 +14,7 @@ use std::{io, str};
|
||||
|
||||
use anyhow::{Context as _, anyhow, bail};
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use bytes::{Buf, BytesMut};
|
||||
use bytes::{Buf as _, BufMut as _, BytesMut};
|
||||
use futures::future::BoxFuture;
|
||||
use futures::{FutureExt, Stream};
|
||||
use itertools::Itertools;
|
||||
@@ -169,96 +169,6 @@ pub fn spawn(
|
||||
Listener { cancel, task }
|
||||
}
|
||||
|
||||
/// Spawns a gRPC server for the page service.
|
||||
///
|
||||
/// TODO: move this onto GrpcPageServiceHandler::spawn().
|
||||
/// TODO: this doesn't support TLS. We need TLS reloading via ReloadingCertificateResolver, so we
|
||||
/// need to reimplement the TCP+TLS accept loop ourselves.
|
||||
pub fn spawn_grpc(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
listener: std::net::TcpListener,
|
||||
) -> anyhow::Result<CancellableTask> {
|
||||
let cancel = CancellationToken::new();
|
||||
let ctx = RequestContextBuilder::new(TaskKind::PageRequestHandler)
|
||||
.download_behavior(DownloadBehavior::Download)
|
||||
.perf_span_dispatch(perf_trace_dispatch)
|
||||
.detached_child();
|
||||
let gate = Gate::default();
|
||||
|
||||
// Set up the TCP socket. We take a preconfigured TcpListener to bind the
|
||||
// port early during startup.
|
||||
let incoming = {
|
||||
let _runtime = COMPUTE_REQUEST_RUNTIME.enter(); // required by TcpListener::from_std
|
||||
listener.set_nonblocking(true)?;
|
||||
tonic::transport::server::TcpIncoming::from(tokio::net::TcpListener::from_std(listener)?)
|
||||
.with_nodelay(Some(GRPC_TCP_NODELAY))
|
||||
.with_keepalive(Some(GRPC_TCP_KEEPALIVE_TIME))
|
||||
};
|
||||
|
||||
// Set up the gRPC server.
|
||||
//
|
||||
// TODO: consider tuning window sizes.
|
||||
let mut server = tonic::transport::Server::builder()
|
||||
.http2_keepalive_interval(Some(GRPC_HTTP2_KEEPALIVE_INTERVAL))
|
||||
.http2_keepalive_timeout(Some(GRPC_HTTP2_KEEPALIVE_TIMEOUT))
|
||||
.max_concurrent_streams(Some(GRPC_MAX_CONCURRENT_STREAMS));
|
||||
|
||||
// Main page service stack. Uses a mix of Tonic interceptors and Tower layers:
|
||||
//
|
||||
// * Interceptors: can inspect and modify the gRPC request. Sync code only, runs before service.
|
||||
//
|
||||
// * Layers: allow async code, can run code after the service response. However, only has access
|
||||
// to the raw HTTP request/response, not the gRPC types.
|
||||
let page_service_handler = GrpcPageServiceHandler {
|
||||
tenant_manager,
|
||||
ctx,
|
||||
};
|
||||
|
||||
let observability_layer = ObservabilityLayer;
|
||||
let mut tenant_interceptor = TenantMetadataInterceptor;
|
||||
let mut auth_interceptor = TenantAuthInterceptor::new(auth);
|
||||
|
||||
let page_service = tower::ServiceBuilder::new()
|
||||
// Create tracing span and record request start time.
|
||||
.layer(observability_layer)
|
||||
// Intercept gRPC requests.
|
||||
.layer(tonic::service::InterceptorLayer::new(move |mut req| {
|
||||
// Extract tenant metadata.
|
||||
req = tenant_interceptor.call(req)?;
|
||||
// Authenticate tenant JWT token.
|
||||
req = auth_interceptor.call(req)?;
|
||||
Ok(req)
|
||||
}))
|
||||
.service(proto::PageServiceServer::new(page_service_handler));
|
||||
let server = server.add_service(page_service);
|
||||
|
||||
// Reflection service for use with e.g. grpcurl.
|
||||
let reflection_service = tonic_reflection::server::Builder::configure()
|
||||
.register_encoded_file_descriptor_set(proto::FILE_DESCRIPTOR_SET)
|
||||
.build_v1()?;
|
||||
let server = server.add_service(reflection_service);
|
||||
|
||||
// Spawn server task.
|
||||
let task_cancel = cancel.clone();
|
||||
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"grpc listener",
|
||||
async move {
|
||||
let result = server
|
||||
.serve_with_incoming_shutdown(incoming, task_cancel.cancelled())
|
||||
.await;
|
||||
if result.is_ok() {
|
||||
// TODO: revisit shutdown logic once page service is implemented.
|
||||
gate.close().await;
|
||||
}
|
||||
result
|
||||
},
|
||||
));
|
||||
|
||||
Ok(CancellableTask { task, cancel })
|
||||
}
|
||||
|
||||
impl Listener {
|
||||
pub async fn stop_accepting(self) -> Connections {
|
||||
self.cancel.cancel();
|
||||
@@ -497,10 +407,6 @@ async fn page_service_conn_main(
|
||||
}
|
||||
|
||||
/// Page service connection handler.
|
||||
///
|
||||
/// TODO: for gRPC, this will be shared by all requests from all connections.
|
||||
/// Decompose it into global state and per-connection/request state, and make
|
||||
/// libpq-specific options (e.g. pipelining) separate.
|
||||
struct PageServerHandler {
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
claims: Option<Claims>,
|
||||
@@ -717,60 +623,6 @@ enum PageStreamError {
|
||||
BadRequest(Cow<'static, str>),
|
||||
}
|
||||
|
||||
impl PageStreamError {
|
||||
/// Converts a PageStreamError into a proto::GetPageResponse with the appropriate status
|
||||
/// code, or a gRPC status if it should terminate the stream (e.g. shutdown). This is a
|
||||
/// convenience method for use from a get_pages gRPC stream.
|
||||
#[allow(clippy::result_large_err)]
|
||||
fn into_get_page_response(
|
||||
self,
|
||||
request_id: page_api::RequestID,
|
||||
) -> Result<proto::GetPageResponse, tonic::Status> {
|
||||
use page_api::GetPageStatusCode;
|
||||
use tonic::Code;
|
||||
|
||||
// We dispatch to Into<tonic::Status> first, and then map it to a GetPageResponse.
|
||||
let status: tonic::Status = self.into();
|
||||
let status_code = match status.code() {
|
||||
// We shouldn't see an OK status here, because we're emitting an error.
|
||||
Code::Ok => {
|
||||
debug_assert_ne!(status.code(), Code::Ok);
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"unexpected OK status: {status:?}",
|
||||
)));
|
||||
}
|
||||
|
||||
// These are per-request errors, returned as GetPageResponses.
|
||||
Code::AlreadyExists => GetPageStatusCode::InvalidRequest,
|
||||
Code::DataLoss => GetPageStatusCode::InternalError,
|
||||
Code::FailedPrecondition => GetPageStatusCode::InvalidRequest,
|
||||
Code::InvalidArgument => GetPageStatusCode::InvalidRequest,
|
||||
Code::Internal => GetPageStatusCode::InternalError,
|
||||
Code::NotFound => GetPageStatusCode::NotFound,
|
||||
Code::OutOfRange => GetPageStatusCode::InvalidRequest,
|
||||
Code::ResourceExhausted => GetPageStatusCode::SlowDown,
|
||||
|
||||
// These should terminate the stream.
|
||||
Code::Aborted => return Err(status),
|
||||
Code::Cancelled => return Err(status),
|
||||
Code::DeadlineExceeded => return Err(status),
|
||||
Code::PermissionDenied => return Err(status),
|
||||
Code::Unauthenticated => return Err(status),
|
||||
Code::Unavailable => return Err(status),
|
||||
Code::Unimplemented => return Err(status),
|
||||
Code::Unknown => return Err(status),
|
||||
};
|
||||
|
||||
Ok(page_api::GetPageResponse {
|
||||
request_id,
|
||||
status_code,
|
||||
reason: Some(status.message().to_string()),
|
||||
page_images: Vec::new(),
|
||||
}
|
||||
.into())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<PageStreamError> for tonic::Status {
|
||||
fn from(err: PageStreamError) -> Self {
|
||||
use tonic::Code;
|
||||
@@ -3362,9 +3214,106 @@ where
|
||||
pub struct GrpcPageServiceHandler {
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
ctx: RequestContext,
|
||||
gate_guard: GateGuard,
|
||||
get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
}
|
||||
|
||||
impl GrpcPageServiceHandler {
|
||||
/// Spawns a gRPC server for the page service.
|
||||
///
|
||||
/// TODO: this doesn't support TLS. We need TLS reloading via ReloadingCertificateResolver, so we
|
||||
/// need to reimplement the TCP+TLS accept loop ourselves.
|
||||
pub fn spawn(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
listener: std::net::TcpListener,
|
||||
) -> anyhow::Result<CancellableTask> {
|
||||
let cancel = CancellationToken::new();
|
||||
let ctx = RequestContextBuilder::new(TaskKind::PageRequestHandler)
|
||||
.download_behavior(DownloadBehavior::Download)
|
||||
.perf_span_dispatch(perf_trace_dispatch)
|
||||
.detached_child();
|
||||
let gate = Gate::default();
|
||||
|
||||
// Set up the TCP socket. We take a preconfigured TcpListener to bind the
|
||||
// port early during startup.
|
||||
let incoming = {
|
||||
let _runtime = COMPUTE_REQUEST_RUNTIME.enter(); // required by TcpListener::from_std
|
||||
listener.set_nonblocking(true)?;
|
||||
tonic::transport::server::TcpIncoming::from(tokio::net::TcpListener::from_std(
|
||||
listener,
|
||||
)?)
|
||||
.with_nodelay(Some(GRPC_TCP_NODELAY))
|
||||
.with_keepalive(Some(GRPC_TCP_KEEPALIVE_TIME))
|
||||
};
|
||||
|
||||
// Set up the gRPC server.
|
||||
//
|
||||
// TODO: consider tuning window sizes.
|
||||
let mut server = tonic::transport::Server::builder()
|
||||
.http2_keepalive_interval(Some(GRPC_HTTP2_KEEPALIVE_INTERVAL))
|
||||
.http2_keepalive_timeout(Some(GRPC_HTTP2_KEEPALIVE_TIMEOUT))
|
||||
.max_concurrent_streams(Some(GRPC_MAX_CONCURRENT_STREAMS));
|
||||
|
||||
// Main page service stack. Uses a mix of Tonic interceptors and Tower layers:
|
||||
//
|
||||
// * Interceptors: can inspect and modify the gRPC request. Sync code only, runs before service.
|
||||
//
|
||||
// * Layers: allow async code, can run code after the service response. However, only has access
|
||||
// to the raw HTTP request/response, not the gRPC types.
|
||||
let page_service_handler = GrpcPageServiceHandler {
|
||||
tenant_manager,
|
||||
ctx,
|
||||
gate_guard: gate.enter().expect("gate was just created"),
|
||||
get_vectored_concurrent_io,
|
||||
};
|
||||
|
||||
let observability_layer = ObservabilityLayer;
|
||||
let mut tenant_interceptor = TenantMetadataInterceptor;
|
||||
let mut auth_interceptor = TenantAuthInterceptor::new(auth);
|
||||
|
||||
let page_service = tower::ServiceBuilder::new()
|
||||
// Create tracing span and record request start time.
|
||||
.layer(observability_layer)
|
||||
// Intercept gRPC requests.
|
||||
.layer(tonic::service::InterceptorLayer::new(move |mut req| {
|
||||
// Extract tenant metadata.
|
||||
req = tenant_interceptor.call(req)?;
|
||||
// Authenticate tenant JWT token.
|
||||
req = auth_interceptor.call(req)?;
|
||||
Ok(req)
|
||||
}))
|
||||
// Run the page service.
|
||||
.service(proto::PageServiceServer::new(page_service_handler));
|
||||
let server = server.add_service(page_service);
|
||||
|
||||
// Reflection service for use with e.g. grpcurl.
|
||||
let reflection_service = tonic_reflection::server::Builder::configure()
|
||||
.register_encoded_file_descriptor_set(proto::FILE_DESCRIPTOR_SET)
|
||||
.build_v1()?;
|
||||
let server = server.add_service(reflection_service);
|
||||
|
||||
// Spawn server task.
|
||||
let task_cancel = cancel.clone();
|
||||
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"grpc listener",
|
||||
async move {
|
||||
let result = server
|
||||
.serve_with_incoming_shutdown(incoming, task_cancel.cancelled())
|
||||
.await;
|
||||
if result.is_ok() {
|
||||
// TODO: revisit shutdown logic once page service is implemented.
|
||||
gate.close().await;
|
||||
}
|
||||
result
|
||||
},
|
||||
));
|
||||
|
||||
Ok(CancellableTask { task, cancel })
|
||||
}
|
||||
|
||||
/// Errors if the request is executed on a non-zero shard. Only shard 0 has a complete view of
|
||||
/// relations and their sizes, as well as SLRU segments and similar data.
|
||||
#[allow(clippy::result_large_err)]
|
||||
@@ -3435,8 +3384,8 @@ impl GrpcPageServiceHandler {
|
||||
|
||||
/// Processes a GetPage batch request, via the GetPages bidirectional streaming RPC.
|
||||
///
|
||||
/// NB: errors will terminate the stream. Per-request errors should return a GetPageResponse
|
||||
/// with an appropriate status code instead.
|
||||
/// NB: errors returned from here are intercepted in get_pages(), and may be converted to a
|
||||
/// GetPageResponse with an appropriate status code to avoid terminating the stream.
|
||||
///
|
||||
/// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
|
||||
/// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
|
||||
@@ -3453,7 +3402,7 @@ impl GrpcPageServiceHandler {
|
||||
let ctx = ctx.with_scope_page_service_pagestream(&timeline);
|
||||
|
||||
// Validate the request, decorate the span, and convert it to a Pagestream request.
|
||||
let req: page_api::GetPageRequest = req.try_into()?;
|
||||
let req = page_api::GetPageRequest::try_from(req)?;
|
||||
|
||||
span_record!(
|
||||
req_id = %req.request_id,
|
||||
@@ -3464,7 +3413,7 @@ impl GrpcPageServiceHandler {
|
||||
);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); // hold guard
|
||||
let effective_lsn = match PageServerHandler::effective_request_lsn(
|
||||
let effective_lsn = PageServerHandler::effective_request_lsn(
|
||||
&timeline,
|
||||
timeline.get_last_record_lsn(),
|
||||
req.read_lsn.request_lsn,
|
||||
@@ -3472,10 +3421,7 @@ impl GrpcPageServiceHandler {
|
||||
.not_modified_since_lsn
|
||||
.unwrap_or(req.read_lsn.request_lsn),
|
||||
&latest_gc_cutoff_lsn,
|
||||
) {
|
||||
Ok(lsn) => lsn,
|
||||
Err(err) => return err.into_get_page_response(req.request_id),
|
||||
};
|
||||
)?;
|
||||
|
||||
let mut batch = SmallVec::with_capacity(req.block_numbers.len());
|
||||
for blkno in req.block_numbers {
|
||||
@@ -3532,7 +3478,7 @@ impl GrpcPageServiceHandler {
|
||||
"unexpected response: {resp:?}"
|
||||
)));
|
||||
}
|
||||
Err(err) => return err.err.into_get_page_response(req.request_id),
|
||||
Err(err) => return Err(err.err.into()),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -3598,42 +3544,44 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
let timeline = self.get_request_timeline(&req).await?;
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
|
||||
// Validate the request, decorate the span, and wait for the LSN to arrive.
|
||||
//
|
||||
// TODO: this requires a read LSN, is that ok?
|
||||
// Validate the request and decorate the span.
|
||||
Self::ensure_shard_zero(&timeline)?;
|
||||
if timeline.is_archived() == Some(true) {
|
||||
return Err(tonic::Status::failed_precondition("timeline is archived"));
|
||||
}
|
||||
let req: page_api::GetBaseBackupRequest = req.into_inner().try_into()?;
|
||||
let req: page_api::GetBaseBackupRequest = req.into_inner().into();
|
||||
|
||||
span_record!(lsn=%req.read_lsn);
|
||||
span_record!(lsn=?req.lsn);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
timeline
|
||||
.wait_lsn(
|
||||
req.read_lsn.request_lsn,
|
||||
WaitLsnWaiter::PageService,
|
||||
WaitLsnTimeout::Default,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
timeline
|
||||
.check_lsn_is_in_scope(req.read_lsn.request_lsn, &latest_gc_cutoff_lsn)
|
||||
.map_err(|err| {
|
||||
tonic::Status::invalid_argument(format!("invalid basebackup LSN: {err}"))
|
||||
})?;
|
||||
// Wait for the LSN to arrive, if given.
|
||||
if let Some(lsn) = req.lsn {
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
timeline
|
||||
.wait_lsn(
|
||||
lsn,
|
||||
WaitLsnWaiter::PageService,
|
||||
WaitLsnTimeout::Default,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
timeline
|
||||
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
|
||||
.map_err(|err| {
|
||||
tonic::Status::invalid_argument(format!("invalid basebackup LSN: {err}"))
|
||||
})?;
|
||||
}
|
||||
|
||||
// Spawn a task to run the basebackup.
|
||||
//
|
||||
// TODO: do we need to support full base backups, for debugging?
|
||||
// TODO: do we need to support full base backups, for debugging? This also requires passing
|
||||
// the prev_lsn parameter.
|
||||
let span = Span::current();
|
||||
let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE);
|
||||
let jh = tokio::spawn(async move {
|
||||
let result = basebackup::send_basebackup_tarball(
|
||||
&mut simplex_write,
|
||||
&timeline,
|
||||
Some(req.read_lsn.request_lsn),
|
||||
req.lsn,
|
||||
None,
|
||||
false,
|
||||
req.replica,
|
||||
@@ -3649,20 +3597,21 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
|
||||
// Emit chunks of size CHUNK_SIZE.
|
||||
let chunks = async_stream::try_stream! {
|
||||
let mut chunk = BytesMut::with_capacity(CHUNK_SIZE);
|
||||
loop {
|
||||
let n = simplex_read.read_buf(&mut chunk).await.map_err(|err| {
|
||||
tonic::Status::internal(format!("failed to read basebackup chunk: {err}"))
|
||||
})?;
|
||||
|
||||
// If we read 0 bytes, either the chunk is full or the stream is closed.
|
||||
if n == 0 {
|
||||
if chunk.is_empty() {
|
||||
break;
|
||||
let mut chunk = BytesMut::with_capacity(CHUNK_SIZE).limit(CHUNK_SIZE);
|
||||
loop {
|
||||
let n = simplex_read.read_buf(&mut chunk).await.map_err(|err| {
|
||||
tonic::Status::internal(format!("failed to read basebackup chunk: {err}"))
|
||||
})?;
|
||||
if n == 0 {
|
||||
break; // full chunk or closed stream
|
||||
}
|
||||
yield proto::GetBaseBackupResponseChunk::from(chunk.clone().freeze());
|
||||
chunk.clear();
|
||||
}
|
||||
let chunk = chunk.into_inner().freeze();
|
||||
if chunk.is_empty() {
|
||||
break;
|
||||
}
|
||||
yield proto::GetBaseBackupResponseChunk::from(chunk);
|
||||
}
|
||||
// Wait for the basebackup task to exit and check for errors.
|
||||
jh.await.map_err(|err| {
|
||||
@@ -3721,6 +3670,14 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
.get(ttid.tenant_id, ttid.timeline_id, shard_selector)
|
||||
.await?;
|
||||
|
||||
// Spawn an IoConcurrency sidecar, if enabled.
|
||||
let Ok(gate_guard) = self.gate_guard.try_clone() else {
|
||||
return Err(tonic::Status::unavailable("shutting down"));
|
||||
};
|
||||
let io_concurrency =
|
||||
IoConcurrency::spawn_from_conf(self.get_vectored_concurrent_io, gate_guard);
|
||||
|
||||
// Spawn a task to handle the GetPageRequest stream.
|
||||
let span = Span::current();
|
||||
let ctx = self.ctx.attached_child();
|
||||
let mut reqs = req.into_inner();
|
||||
@@ -3731,10 +3688,16 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
.await?
|
||||
.downgrade();
|
||||
while let Some(req) = reqs.message().await? {
|
||||
// TODO: implement IoConcurrency sidecar.
|
||||
yield Self::get_page(&ctx, &timeline, req, IoConcurrency::Sequential)
|
||||
let req_id = req.request_id;
|
||||
let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
|
||||
.instrument(span.clone()) // propagate request span
|
||||
.await?
|
||||
.await;
|
||||
yield match result {
|
||||
Ok(resp) => resp,
|
||||
// Convert per-request errors to GetPageResponses as appropriate, or terminate
|
||||
// the stream with a tonic::Status.
|
||||
Err(err) => page_api::GetPageResponse::try_from_status(err, req_id)?.into(),
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -51,6 +51,7 @@ use secondary::heatmap::{HeatMapTenant, HeatMapTimeline};
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use timeline::compaction::{CompactionOutcome, GcCompactionQueue};
|
||||
use timeline::import_pgdata::ImportingTimeline;
|
||||
use timeline::layer_manager::LayerManagerLockHolder;
|
||||
use timeline::offload::{OffloadError, offload_timeline};
|
||||
use timeline::{
|
||||
CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata,
|
||||
@@ -89,7 +90,8 @@ use crate::l0_flush::L0FlushGlobalState;
|
||||
use crate::metrics::{
|
||||
BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
|
||||
INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_OFFLOADED_TIMELINES,
|
||||
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
|
||||
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, TIMELINE_STATE_METRIC,
|
||||
remove_tenant_metrics,
|
||||
};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::LocationMode;
|
||||
@@ -99,7 +101,6 @@ use crate::tenant::remote_timeline_client::{
|
||||
INITDB_PATH, MaybeDeletedIndexPart, remote_initdb_archive_path,
|
||||
};
|
||||
use crate::tenant::storage_layer::{DeltaLayer, ImageLayer};
|
||||
use crate::tenant::timeline::CheckOtherForCancel;
|
||||
use crate::tenant::timeline::delete::DeleteTimelineFlow;
|
||||
use crate::tenant::timeline::uninit::cleanup_timeline_directory;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
@@ -545,6 +546,28 @@ pub struct OffloadedTimeline {
|
||||
|
||||
/// Part of the `OffloadedTimeline` object's lifecycle: this needs to be set before we drop it
|
||||
pub deleted_from_ancestor: AtomicBool,
|
||||
|
||||
_metrics_guard: OffloadedTimelineMetricsGuard,
|
||||
}
|
||||
|
||||
/// Increases the offloaded timeline count metric when created, and decreases when dropped.
|
||||
struct OffloadedTimelineMetricsGuard;
|
||||
|
||||
impl OffloadedTimelineMetricsGuard {
|
||||
fn new() -> Self {
|
||||
TIMELINE_STATE_METRIC
|
||||
.with_label_values(&["offloaded"])
|
||||
.inc();
|
||||
Self
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for OffloadedTimelineMetricsGuard {
|
||||
fn drop(&mut self) {
|
||||
TIMELINE_STATE_METRIC
|
||||
.with_label_values(&["offloaded"])
|
||||
.dec();
|
||||
}
|
||||
}
|
||||
|
||||
impl OffloadedTimeline {
|
||||
@@ -577,6 +600,8 @@ impl OffloadedTimeline {
|
||||
|
||||
delete_progress: timeline.delete_progress.clone(),
|
||||
deleted_from_ancestor: AtomicBool::new(false),
|
||||
|
||||
_metrics_guard: OffloadedTimelineMetricsGuard::new(),
|
||||
})
|
||||
}
|
||||
fn from_manifest(tenant_shard_id: TenantShardId, manifest: &OffloadedTimelineManifest) -> Self {
|
||||
@@ -596,6 +621,7 @@ impl OffloadedTimeline {
|
||||
archived_at,
|
||||
delete_progress: TimelineDeleteProgress::default(),
|
||||
deleted_from_ancestor: AtomicBool::new(false),
|
||||
_metrics_guard: OffloadedTimelineMetricsGuard::new(),
|
||||
}
|
||||
}
|
||||
fn manifest(&self) -> OffloadedTimelineManifest {
|
||||
@@ -1290,7 +1316,7 @@ impl TenantShard {
|
||||
ancestor.is_some()
|
||||
|| timeline
|
||||
.layers
|
||||
.read()
|
||||
.read(LayerManagerLockHolder::LoadLayerMap)
|
||||
.await
|
||||
.layer_map()
|
||||
.expect(
|
||||
@@ -2618,7 +2644,7 @@ impl TenantShard {
|
||||
}
|
||||
let layer_names = tline
|
||||
.layers
|
||||
.read()
|
||||
.read(LayerManagerLockHolder::Testing)
|
||||
.await
|
||||
.layer_map()
|
||||
.unwrap()
|
||||
@@ -3133,7 +3159,12 @@ impl TenantShard {
|
||||
|
||||
for timeline in &compact {
|
||||
// Collect L0 counts. Can't await while holding lock above.
|
||||
if let Ok(lm) = timeline.layers.read().await.layer_map() {
|
||||
if let Ok(lm) = timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::Compaction)
|
||||
.await
|
||||
.layer_map()
|
||||
{
|
||||
l0_counts.insert(timeline.timeline_id, lm.level0_deltas().len());
|
||||
}
|
||||
}
|
||||
@@ -3262,11 +3293,11 @@ impl TenantShard {
|
||||
|
||||
/// Trips the compaction circuit breaker if appropriate.
|
||||
pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
|
||||
if err.is_cancel(CheckOtherForCancel::No /* XXX flip this to Yes so that all the Other() errors that are cancel don't trip the circuit breaker? */) {
|
||||
return;
|
||||
}
|
||||
match err {
|
||||
CompactionError::ShuttingDown => unreachable!("is_cancel"),
|
||||
err if err.is_cancel() => {}
|
||||
CompactionError::ShuttingDown => (),
|
||||
// Offload failures don't trip the circuit breaker, since they're cheap to retry and
|
||||
// shouldn't block compaction.
|
||||
CompactionError::Offload(_) => {}
|
||||
CompactionError::CollectKeySpaceError(err) => {
|
||||
// CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch.
|
||||
@@ -3281,7 +3312,7 @@ impl TenantShard {
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
|
||||
}
|
||||
CompactionError::AlreadyRunning(_) => unreachable!("is_cancel, but XXX why?"),
|
||||
CompactionError::AlreadyRunning(_) => {}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4875,7 +4906,7 @@ impl TenantShard {
|
||||
}
|
||||
let layer_names = tline
|
||||
.layers
|
||||
.read()
|
||||
.read(LayerManagerLockHolder::Testing)
|
||||
.await
|
||||
.layer_map()
|
||||
.unwrap()
|
||||
@@ -6945,7 +6976,7 @@ mod tests {
|
||||
.await?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
|
||||
|
||||
let layer_map = tline.layers.read().await;
|
||||
let layer_map = tline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
let level0_deltas = layer_map
|
||||
.layer_map()?
|
||||
.level0_deltas()
|
||||
@@ -7181,7 +7212,7 @@ mod tests {
|
||||
let lsn = Lsn(0x10);
|
||||
let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
|
||||
|
||||
let guard = tline.layers.read().await;
|
||||
let guard = tline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
let lm = guard.layer_map()?;
|
||||
|
||||
lm.dump(true, &ctx).await?;
|
||||
@@ -8209,12 +8240,23 @@ mod tests {
|
||||
tline.freeze_and_flush().await?; // force create a delta layer
|
||||
}
|
||||
|
||||
let before_num_l0_delta_files =
|
||||
tline.layers.read().await.layer_map()?.level0_deltas().len();
|
||||
let before_num_l0_delta_files = tline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::Testing)
|
||||
.await
|
||||
.layer_map()?
|
||||
.level0_deltas()
|
||||
.len();
|
||||
|
||||
tline.compact(&cancel, EnumSet::default(), &ctx).await?;
|
||||
|
||||
let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len();
|
||||
let after_num_l0_delta_files = tline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::Testing)
|
||||
.await
|
||||
.layer_map()?
|
||||
.level0_deltas()
|
||||
.len();
|
||||
|
||||
assert!(
|
||||
after_num_l0_delta_files < before_num_l0_delta_files,
|
||||
|
||||
@@ -61,8 +61,8 @@ pub(crate) struct LocationConf {
|
||||
/// The detailed shard identity. This structure is already scoped within
|
||||
/// a TenantShardId, but we need the full ShardIdentity to enable calculating
|
||||
/// key->shard mappings.
|
||||
// TODO(vlad): Remove this default once all configs have a shard identity on disk.
|
||||
#[serde(default = "ShardIdentity::unsharded")]
|
||||
#[serde(skip_serializing_if = "ShardIdentity::is_unsharded")]
|
||||
pub(crate) shard: ShardIdentity,
|
||||
|
||||
/// The pan-cluster tenant configuration, the same on all locations
|
||||
@@ -149,7 +149,12 @@ impl LocationConf {
|
||||
/// For use when attaching/re-attaching: update the generation stored in this
|
||||
/// structure. If we were in a secondary state, promote to attached (posession
|
||||
/// of a fresh generation implies this).
|
||||
pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) {
|
||||
pub(crate) fn attach_in_generation(
|
||||
&mut self,
|
||||
mode: AttachmentMode,
|
||||
generation: Generation,
|
||||
stripe_size: ShardStripeSize,
|
||||
) {
|
||||
match &mut self.mode {
|
||||
LocationMode::Attached(attach_conf) => {
|
||||
attach_conf.generation = generation;
|
||||
@@ -163,6 +168,8 @@ impl LocationConf {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
self.shard.stripe_size = stripe_size;
|
||||
}
|
||||
|
||||
pub(crate) fn try_from(conf: &'_ models::LocationConfig) -> anyhow::Result<Self> {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,7 @@
|
||||
//! Helper functions to upload files to remote storage with a RemoteStorage
|
||||
|
||||
use std::io::{ErrorKind, SeekFrom};
|
||||
use std::num::NonZeroU32;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use anyhow::{Context, bail};
|
||||
@@ -228,11 +229,25 @@ pub(crate) async fn time_travel_recover_tenant(
|
||||
let timelines_path = super::remote_timelines_path(tenant_shard_id);
|
||||
prefixes.push(timelines_path);
|
||||
}
|
||||
|
||||
// Limit the number of versions deletions, mostly so that we don't
|
||||
// keep requesting forever if the list is too long, as we'd put the
|
||||
// list in RAM.
|
||||
// Building a list of 100k entries that reaches the limit roughly takes
|
||||
// 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
|
||||
const COMPLEXITY_LIMIT: Option<NonZeroU32> = NonZeroU32::new(100_000);
|
||||
|
||||
for prefix in &prefixes {
|
||||
backoff::retry(
|
||||
|| async {
|
||||
storage
|
||||
.time_travel_recover(Some(prefix), timestamp, done_if_after, cancel)
|
||||
.time_travel_recover(
|
||||
Some(prefix),
|
||||
timestamp,
|
||||
done_if_after,
|
||||
cancel,
|
||||
COMPLEXITY_LIMIT,
|
||||
)
|
||||
.await
|
||||
},
|
||||
|e| !matches!(e, TimeTravelError::Other(_)),
|
||||
|
||||
@@ -55,11 +55,11 @@ pub struct BatchLayerWriter {
|
||||
}
|
||||
|
||||
impl BatchLayerWriter {
|
||||
pub fn new(conf: &'static PageServerConf) -> Self {
|
||||
Self {
|
||||
pub async fn new(conf: &'static PageServerConf) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
generated_layer_writers: Vec::new(),
|
||||
conf,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn add_unfinished_image_writer(
|
||||
@@ -209,7 +209,6 @@ impl<'a> SplitImageLayerWriter<'a> {
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
target_layer_size,
|
||||
// XXX make this lazy like in SplitDeltaLayerWriter?
|
||||
inner: ImageLayerWriter::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
@@ -224,7 +223,7 @@ impl<'a> SplitImageLayerWriter<'a> {
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
batches: BatchLayerWriter::new(conf),
|
||||
batches: BatchLayerWriter::new(conf).await?,
|
||||
lsn,
|
||||
start_key,
|
||||
gate,
|
||||
@@ -320,7 +319,7 @@ pub struct SplitDeltaLayerWriter<'a> {
|
||||
}
|
||||
|
||||
impl<'a> SplitDeltaLayerWriter<'a> {
|
||||
pub fn new(
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -328,8 +327,8 @@ impl<'a> SplitDeltaLayerWriter<'a> {
|
||||
target_layer_size: u64,
|
||||
gate: &'a utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
Self {
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
target_layer_size,
|
||||
inner: None,
|
||||
conf,
|
||||
@@ -337,10 +336,10 @@ impl<'a> SplitDeltaLayerWriter<'a> {
|
||||
tenant_shard_id,
|
||||
lsn_range,
|
||||
last_key_written: Key::MIN,
|
||||
batches: BatchLayerWriter::new(conf),
|
||||
batches: BatchLayerWriter::new(conf).await?,
|
||||
gate,
|
||||
cancel,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn put_value(
|
||||
@@ -511,7 +510,9 @@ mod tests {
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
);
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
image_writer
|
||||
.put_image(get_key(0), get_img(0), &ctx)
|
||||
@@ -589,7 +590,9 @@ mod tests {
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
);
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
const N: usize = 2000;
|
||||
for i in 0..N {
|
||||
let i = i as u32;
|
||||
@@ -689,7 +692,9 @@ mod tests {
|
||||
4 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
);
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
image_writer
|
||||
.put_image(get_key(0), get_img(0), &ctx)
|
||||
@@ -765,7 +770,9 @@ mod tests {
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
);
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
for i in 0..N {
|
||||
let i = i as u32;
|
||||
|
||||
@@ -1635,6 +1635,7 @@ pub(crate) mod test {
|
||||
use crate::tenant::disk_btree::tests::TestDisk;
|
||||
use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
|
||||
use crate::tenant::storage_layer::{Layer, ResidentLayer};
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
use crate::tenant::{TenantShard, Timeline};
|
||||
|
||||
/// Construct an index for a fictional delta layer and and then
|
||||
@@ -2002,7 +2003,7 @@ pub(crate) mod test {
|
||||
|
||||
let initdb_layer = timeline
|
||||
.layers
|
||||
.read()
|
||||
.read(crate::tenant::timeline::layer_manager::LayerManagerLockHolder::Testing)
|
||||
.await
|
||||
.likely_resident_layers()
|
||||
.next()
|
||||
@@ -2078,7 +2079,7 @@ pub(crate) mod test {
|
||||
|
||||
let new_layer = timeline
|
||||
.layers
|
||||
.read()
|
||||
.read(LayerManagerLockHolder::Testing)
|
||||
.await
|
||||
.likely_resident_layers()
|
||||
.find(|&x| x != &initdb_layer)
|
||||
|
||||
@@ -10,6 +10,7 @@ use super::*;
|
||||
use crate::context::DownloadBehavior;
|
||||
use crate::tenant::harness::{TenantHarness, test_img};
|
||||
use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint};
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
|
||||
/// Used in tests to advance a future to wanted await point, and not futher.
|
||||
const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600);
|
||||
@@ -59,7 +60,7 @@ async fn smoke_test() {
|
||||
// there to avoid the timeline being illegally empty
|
||||
let (layer, dummy_layer) = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -215,7 +216,7 @@ async fn smoke_test() {
|
||||
|
||||
// Simulate GC removing our test layer.
|
||||
{
|
||||
let mut g = timeline.layers.write().await;
|
||||
let mut g = timeline.layers.write(LayerManagerLockHolder::Testing).await;
|
||||
|
||||
let layers = &[layer];
|
||||
g.open_mut().unwrap().finish_gc_timeline(layers);
|
||||
@@ -261,7 +262,7 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -305,7 +306,7 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
// assert that once we remove the `layer` from the layer map and drop our reference,
|
||||
// the deletion of the layer in remote_storage happens.
|
||||
{
|
||||
let mut layers = timeline.layers.write().await;
|
||||
let mut layers = timeline.layers.write(LayerManagerLockHolder::Testing).await;
|
||||
layers.open_mut().unwrap().finish_gc_timeline(&[layer]);
|
||||
}
|
||||
|
||||
@@ -347,7 +348,7 @@ fn read_wins_pending_eviction() {
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -480,7 +481,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -655,7 +656,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -741,7 +742,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -862,7 +863,7 @@ async fn eviction_cancellation_on_drop() {
|
||||
|
||||
let (evicted_layer, not_evicted) = {
|
||||
let mut layers = {
|
||||
let mut guard = timeline.layers.write().await;
|
||||
let mut guard = timeline.layers.write(LayerManagerLockHolder::Testing).await;
|
||||
let layers = guard.likely_resident_layers().cloned().collect::<Vec<_>>();
|
||||
// remove the layers from layermap
|
||||
guard.open_mut().unwrap().finish_gc_timeline(&layers);
|
||||
|
||||
@@ -22,8 +22,8 @@ use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
|
||||
use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
|
||||
use crate::tenant::throttle::Stats;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::timeline::compaction::CompactionOutcome;
|
||||
use crate::tenant::timeline::{CheckOtherForCancel, CompactionError};
|
||||
use crate::tenant::{TenantShard, TenantState};
|
||||
|
||||
/// Semaphore limiting concurrent background tasks (across all tenants).
|
||||
@@ -292,12 +292,35 @@ pub(crate) fn log_compaction_error(
|
||||
task_cancelled: bool,
|
||||
degrade_to_warning: bool,
|
||||
) {
|
||||
let is_cancel = err.is_cancel(CheckOtherForCancel::Yes);
|
||||
use CompactionError::*;
|
||||
|
||||
let level = if is_cancel || task_cancelled {
|
||||
Level::INFO
|
||||
} else {
|
||||
Level::ERROR
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::upload_queue::NotInitialized;
|
||||
|
||||
let level = match err {
|
||||
e if e.is_cancel() => return,
|
||||
ShuttingDown => return,
|
||||
Offload(_) => Level::ERROR,
|
||||
AlreadyRunning(_) => Level::ERROR,
|
||||
CollectKeySpaceError(_) => Level::ERROR,
|
||||
_ if task_cancelled => Level::INFO,
|
||||
Other(err) => {
|
||||
let root_cause = err.root_cause();
|
||||
|
||||
let upload_queue = root_cause
|
||||
.downcast_ref::<NotInitialized>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
let timeline = root_cause
|
||||
.downcast_ref::<PageReconstructError>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
let is_stopping = upload_queue || timeline;
|
||||
|
||||
if is_stopping {
|
||||
Level::INFO
|
||||
} else {
|
||||
Level::ERROR
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if let Some((error_count, sleep_duration)) = retry_info {
|
||||
|
||||
@@ -35,7 +35,11 @@ use fail::fail_point;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::{FutureExt, StreamExt};
|
||||
use handle::ShardTimelineId;
|
||||
use layer_manager::Shutdown;
|
||||
use layer_manager::{
|
||||
LayerManagerLockHolder, LayerManagerReadGuard, LayerManagerWriteGuard, LockedLayerManager,
|
||||
Shutdown,
|
||||
};
|
||||
|
||||
use offload::OffloadError;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
|
||||
@@ -75,14 +79,13 @@ use utils::postgres_client::PostgresClientProtocol;
|
||||
use utils::rate_limit::RateLimit;
|
||||
use utils::seqwait::SeqWait;
|
||||
use utils::simple_rcu::{Rcu, RcuReadGuard};
|
||||
use utils::sync::gate::{Gate, GateError, GateGuard};
|
||||
use utils::sync::gate::{Gate, GateGuard};
|
||||
use utils::{completion, critical, fs_ext, pausable_failpoint};
|
||||
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
|
||||
|
||||
use self::delete::DeleteTimelineFlow;
|
||||
pub(super) use self::eviction_task::EvictionTaskTenantState;
|
||||
use self::eviction_task::EvictionTaskTimelineState;
|
||||
use self::layer_manager::LayerManager;
|
||||
use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
use super::remote_timeline_client::RemoteTimelineClient;
|
||||
@@ -116,7 +119,6 @@ use crate::pgdatadir_mapping::{
|
||||
MAX_AUX_FILE_V2_DELTAS, MetricsUpdate,
|
||||
};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::blob_io::WriteBlobError;
|
||||
use crate::tenant::config::AttachmentMode;
|
||||
use crate::tenant::gc_result::GcResult;
|
||||
use crate::tenant::layer_map::LayerMap;
|
||||
@@ -131,7 +133,6 @@ use crate::tenant::storage_layer::{
|
||||
};
|
||||
use crate::tenant::tasks::BackgroundLoopKind;
|
||||
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
|
||||
use crate::virtual_file::owned_buffers_io::write::FlushTaskError;
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
use crate::walingest::WalLagCooldown;
|
||||
use crate::walredo::RedoAttemptType;
|
||||
@@ -183,13 +184,13 @@ impl std::fmt::Display for ImageLayerCreationMode {
|
||||
|
||||
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
|
||||
/// Can be removed after all refactors are done.
|
||||
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
|
||||
fn drop_layer_manager_rlock(rlock: LayerManagerReadGuard<'_>) {
|
||||
drop(rlock)
|
||||
}
|
||||
|
||||
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
|
||||
/// Can be removed after all refactors are done.
|
||||
fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
||||
fn drop_layer_manager_wlock(rlock: LayerManagerWriteGuard<'_>) {
|
||||
drop(rlock)
|
||||
}
|
||||
|
||||
@@ -243,7 +244,7 @@ pub struct Timeline {
|
||||
///
|
||||
/// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
|
||||
/// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
|
||||
pub(crate) layers: tokio::sync::RwLock<LayerManager>,
|
||||
pub(crate) layers: LockedLayerManager,
|
||||
|
||||
last_freeze_at: AtomicLsn,
|
||||
// Atomic would be more appropriate here.
|
||||
@@ -762,7 +763,7 @@ pub(crate) enum CreateImageLayersError {
|
||||
PageReconstructError(#[source] PageReconstructError),
|
||||
|
||||
#[error(transparent)]
|
||||
Other(anyhow::Error),
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<layer_manager::Shutdown> for CreateImageLayersError {
|
||||
@@ -1057,8 +1058,8 @@ pub(crate) enum WaitLsnWaiter<'a> {
|
||||
/// Argument to [`Timeline::shutdown`].
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(crate) enum ShutdownMode {
|
||||
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
|
||||
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
|
||||
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk. This method can
|
||||
/// take multiple seconds for a busy timeline.
|
||||
///
|
||||
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
|
||||
/// the call to [`Timeline::shutdown`].
|
||||
@@ -1537,7 +1538,10 @@ impl Timeline {
|
||||
/// This method makes no distinction between local and remote layers.
|
||||
/// Hence, the result **does not represent local filesystem usage**.
|
||||
pub(crate) async fn layer_size_sum(&self) -> u64 {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
guard.layer_size_sum()
|
||||
}
|
||||
|
||||
@@ -1847,7 +1851,7 @@ impl Timeline {
|
||||
// time, and this was missed.
|
||||
// if write_guard.is_none() { return; }
|
||||
|
||||
let Ok(layers_guard) = self.layers.try_read() else {
|
||||
let Ok(layers_guard) = self.layers.try_read(LayerManagerLockHolder::TryFreezeLayer) else {
|
||||
// Don't block if the layer lock is busy
|
||||
return;
|
||||
};
|
||||
@@ -2063,10 +2067,9 @@ impl Timeline {
|
||||
};
|
||||
|
||||
// Signal compaction failure to avoid L0 flush stalls when it's broken.
|
||||
// XXX this looks an awful lot like the circuit breaker code? Can we dedupe classification?
|
||||
match &result {
|
||||
Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
|
||||
Err(e) if e.is_cancel(CheckOtherForCancel::No /* XXX flip this to Yes so that all the Other() errors that are cancel don't trip the circuit breaker? */) => {}
|
||||
Err(e) if e.is_cancel() => {}
|
||||
Err(CompactionError::ShuttingDown) => {
|
||||
// Covered by the `Err(e) if e.is_cancel()` branch.
|
||||
}
|
||||
@@ -2161,7 +2164,7 @@ impl Timeline {
|
||||
if let ShutdownMode::FreezeAndFlush = mode {
|
||||
let do_flush = if let Some((open, frozen)) = self
|
||||
.layers
|
||||
.read()
|
||||
.read(LayerManagerLockHolder::Shutdown)
|
||||
.await
|
||||
.layer_map()
|
||||
.map(|lm| (lm.open_layer.is_some(), lm.frozen_layers.len()))
|
||||
@@ -2265,7 +2268,10 @@ impl Timeline {
|
||||
// Allow any remaining in-memory layers to do cleanup -- until that, they hold the gate
|
||||
// open.
|
||||
let mut write_guard = self.write_lock.lock().await;
|
||||
self.layers.write().await.shutdown(&mut write_guard);
|
||||
self.layers
|
||||
.write(LayerManagerLockHolder::Shutdown)
|
||||
.await
|
||||
.shutdown(&mut write_guard);
|
||||
}
|
||||
|
||||
// Finally wait until any gate-holders are complete.
|
||||
@@ -2368,7 +2374,10 @@ impl Timeline {
|
||||
&self,
|
||||
reset: LayerAccessStatsReset,
|
||||
) -> Result<LayerMapInfo, layer_manager::Shutdown> {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
|
||||
if let Some(open_layer) = &layer_map.open_layer {
|
||||
@@ -2509,6 +2518,13 @@ impl Timeline {
|
||||
// Preparing basebackup doesn't make sense for shards other than shard zero.
|
||||
return;
|
||||
}
|
||||
if !self.is_active() {
|
||||
// May happen during initial timeline creation.
|
||||
// Such timeline is not in the global timeline map yet,
|
||||
// so basebackup cache will not be able to find it.
|
||||
// TODO(diko): We can prepare such timelines in finish_creation().
|
||||
return;
|
||||
}
|
||||
|
||||
let res = self
|
||||
.basebackup_prepare_sender
|
||||
@@ -2848,21 +2864,6 @@ impl Timeline {
|
||||
)
|
||||
}
|
||||
|
||||
/// Resolve the effective WAL receiver protocol to use for this tenant.
|
||||
///
|
||||
/// Priority order is:
|
||||
/// 1. Tenant config override
|
||||
/// 2. Default value for tenant config override
|
||||
/// 3. Pageserver config override
|
||||
/// 4. Pageserver config default
|
||||
pub fn resolve_wal_receiver_protocol(&self) -> PostgresClientProtocol {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.wal_receiver_protocol_override
|
||||
.or(self.conf.default_tenant_conf.wal_receiver_protocol_override)
|
||||
.unwrap_or(self.conf.wal_receiver_protocol)
|
||||
}
|
||||
|
||||
pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) {
|
||||
// NB: Most tenant conf options are read by background loops, so,
|
||||
// changes will automatically be picked up.
|
||||
@@ -3218,10 +3219,16 @@ impl Timeline {
|
||||
guard.is_none(),
|
||||
"multiple launches / re-launches of WAL receiver are not supported"
|
||||
);
|
||||
|
||||
let protocol = PostgresClientProtocol::Interpreted {
|
||||
format: utils::postgres_client::InterpretedFormat::Protobuf,
|
||||
compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }),
|
||||
};
|
||||
|
||||
*guard = Some(WalReceiver::start(
|
||||
Arc::clone(self),
|
||||
WalReceiverConf {
|
||||
protocol: self.resolve_wal_receiver_protocol(),
|
||||
protocol,
|
||||
wal_connect_timeout,
|
||||
lagging_wal_timeout,
|
||||
max_lsn_wal_lag,
|
||||
@@ -3237,7 +3244,7 @@ impl Timeline {
|
||||
|
||||
/// Initialize with an empty layer map. Used when creating a new timeline.
|
||||
pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) {
|
||||
let mut layers = self.layers.try_write().expect(
|
||||
let mut layers = self.layers.try_write(LayerManagerLockHolder::Init).expect(
|
||||
"in the context where we call this function, no other task has access to the object",
|
||||
);
|
||||
layers
|
||||
@@ -3257,7 +3264,10 @@ impl Timeline {
|
||||
use init::Decision::*;
|
||||
use init::{Discovered, DismissedLayer};
|
||||
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::LoadLayerMap)
|
||||
.await;
|
||||
|
||||
let timer = self.metrics.load_layer_map_histo.start_timer();
|
||||
|
||||
@@ -3874,7 +3884,10 @@ impl Timeline {
|
||||
&self,
|
||||
layer_name: &LayerName,
|
||||
) -> Result<Option<Layer>, layer_manager::Shutdown> {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let layer = guard
|
||||
.layer_map()?
|
||||
.iter_historic_layers()
|
||||
@@ -3907,7 +3920,10 @@ impl Timeline {
|
||||
return None;
|
||||
}
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GenerateHeatmap)
|
||||
.await;
|
||||
|
||||
// Firstly, if there's any heatmap left over from when this location
|
||||
// was a secondary, take that into account. Keep layers that are:
|
||||
@@ -4005,7 +4021,10 @@ impl Timeline {
|
||||
}
|
||||
|
||||
pub(super) async fn generate_unarchival_heatmap(&self, end_lsn: Lsn) -> PreviousHeatmap {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GenerateHeatmap)
|
||||
.await;
|
||||
|
||||
let now = SystemTime::now();
|
||||
let mut heatmap_layers = Vec::default();
|
||||
@@ -4347,7 +4366,7 @@ impl Timeline {
|
||||
query: &VersionedKeySpaceQuery,
|
||||
) -> Result<LayerFringe, GetVectoredError> {
|
||||
let mut fringe = LayerFringe::new();
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::GetPage).await;
|
||||
|
||||
match query {
|
||||
VersionedKeySpaceQuery::Uniform { keyspace, lsn } => {
|
||||
@@ -4450,7 +4469,7 @@ impl Timeline {
|
||||
// required for correctness, but avoids visiting extra layers
|
||||
// which turns out to be a perf bottleneck in some cases.
|
||||
if !unmapped_keyspace.is_empty() {
|
||||
let guard = timeline.layers.read().await;
|
||||
let guard = timeline.layers.read(LayerManagerLockHolder::GetPage).await;
|
||||
guard.update_search_fringe(&unmapped_keyspace, cont_lsn, &mut fringe)?;
|
||||
|
||||
// It's safe to drop the layer map lock after planning the next round of reads.
|
||||
@@ -4560,7 +4579,10 @@ impl Timeline {
|
||||
_guard: &tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<InMemoryLayer>> {
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::GetLayerForWrite)
|
||||
.await;
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
ensure!(
|
||||
@@ -4602,7 +4624,10 @@ impl Timeline {
|
||||
write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
|
||||
) -> Result<u64, FlushLayerError> {
|
||||
let frozen = {
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::TryFreezeLayer)
|
||||
.await;
|
||||
guard
|
||||
.open_mut()?
|
||||
.try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock, &self.metrics)
|
||||
@@ -4643,7 +4668,12 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
// Subscribe to L0 delta layer updates, for compaction backpressure.
|
||||
let mut watch_l0 = match self.layers.read().await.layer_map() {
|
||||
let mut watch_l0 = match self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::FlushLoop)
|
||||
.await
|
||||
.layer_map()
|
||||
{
|
||||
Ok(lm) => lm.watch_level0_deltas(),
|
||||
Err(Shutdown) => return,
|
||||
};
|
||||
@@ -4680,7 +4710,7 @@ impl Timeline {
|
||||
|
||||
// Fetch the next layer to flush, if any.
|
||||
let (layer, l0_count, frozen_count, frozen_size) = {
|
||||
let layers = self.layers.read().await;
|
||||
let layers = self.layers.read(LayerManagerLockHolder::FlushLoop).await;
|
||||
let Ok(lm) = layers.layer_map() else {
|
||||
info!("dropping out of flush loop for timeline shutdown");
|
||||
return;
|
||||
@@ -4976,7 +5006,10 @@ impl Timeline {
|
||||
// in-memory layer from the map now. The flushed layer is stored in
|
||||
// the mapping in `create_delta_layer`.
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::FlushFrozenLayer)
|
||||
.await;
|
||||
|
||||
guard.open_mut()?.finish_flush_l0_layer(
|
||||
delta_layer_to_add.as_ref(),
|
||||
@@ -5191,7 +5224,7 @@ impl Timeline {
|
||||
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
|
||||
let Ok(layers) = guard.layer_map() else {
|
||||
return false;
|
||||
};
|
||||
@@ -5533,7 +5566,7 @@ impl Timeline {
|
||||
self.should_check_if_image_layers_required(lsn)
|
||||
};
|
||||
|
||||
let mut batch_image_writer = BatchLayerWriter::new(self.conf);
|
||||
let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?;
|
||||
|
||||
let mut all_generated = true;
|
||||
|
||||
@@ -5609,7 +5642,7 @@ impl Timeline {
|
||||
if let ImageLayerCreationMode::Force = mode {
|
||||
// When forced to create image layers, we might try and create them where they already
|
||||
// exist. This mode is only used in tests/debug.
|
||||
let layers = self.layers.read().await;
|
||||
let layers = self.layers.read(LayerManagerLockHolder::Compaction).await;
|
||||
if layers.contains_key(&PersistentLayerKey {
|
||||
key_range: img_range.clone(),
|
||||
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
|
||||
@@ -5637,8 +5670,7 @@ impl Timeline {
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(CreateImageLayersError::Other)?;
|
||||
.await?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
Err(CreateImageLayersError::Other(anyhow::anyhow!(
|
||||
@@ -5733,12 +5765,9 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
let image_layers = batch_image_writer
|
||||
.finish(self, ctx)
|
||||
.await
|
||||
.map_err(CreateImageLayersError::Other)?;
|
||||
let image_layers = batch_image_writer.finish(self, ctx).await?;
|
||||
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await;
|
||||
|
||||
// FIXME: we could add the images to be uploaded *before* returning from here, but right
|
||||
// now they are being scheduled outside of write lock; current way is inconsistent with
|
||||
@@ -5746,7 +5775,7 @@ impl Timeline {
|
||||
guard
|
||||
.open_mut()?
|
||||
.track_new_image_layers(&image_layers, &self.metrics);
|
||||
drop_wlock(guard);
|
||||
drop_layer_manager_wlock(guard);
|
||||
let duration = timer.stop_and_record();
|
||||
|
||||
// Creating image layers may have caused some previously visible layers to be covered
|
||||
@@ -5938,61 +5967,19 @@ pub(crate) enum CompactionError {
|
||||
AlreadyRunning(&'static str),
|
||||
}
|
||||
|
||||
/// Whether [`CompactionError::is_cancel`] should inspect the
|
||||
/// [`CompactionError::Other`] anyhow Error's root cause for
|
||||
/// typical causes of cancellation.
|
||||
pub(crate) enum CheckOtherForCancel {
|
||||
No,
|
||||
Yes,
|
||||
}
|
||||
|
||||
impl CompactionError {
|
||||
/// Errors that can be ignored, i.e., cancel and shutdown.
|
||||
pub fn is_cancel(&self, check_other: CheckOtherForCancel) -> bool {
|
||||
if matches!(
|
||||
pub fn is_cancel(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
Self::ShuttingDown
|
||||
| Self::AlreadyRunning(_) // XXX why do we treat AlreadyRunning as cancel?
|
||||
| Self::AlreadyRunning(_)
|
||||
| Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled)
|
||||
| Self::CollectKeySpaceError(CollectKeySpaceError::PageRead(
|
||||
PageReconstructError::Cancelled
|
||||
))
|
||||
| Self::Offload(OffloadError::Cancelled)
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
||||
let root_cause = match &check_other {
|
||||
CheckOtherForCancel::No => return false,
|
||||
CheckOtherForCancel::Yes => {
|
||||
if let Self::Other(other) = self {
|
||||
other.root_cause()
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let upload_queue = root_cause
|
||||
.downcast_ref::<NotInitialized>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
let timeline = root_cause
|
||||
.downcast_ref::<PageReconstructError>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
let buffered_writer_flush_task_canelled = root_cause
|
||||
.downcast_ref::<FlushTaskError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
let write_blob_cancelled = root_cause
|
||||
.downcast_ref::<WriteBlobError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
let gate_closed = root_cause
|
||||
.downcast_ref::<GateError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
upload_queue
|
||||
|| timeline
|
||||
|| buffered_writer_flush_task_canelled
|
||||
|| write_blob_cancelled
|
||||
|| gate_closed
|
||||
)
|
||||
}
|
||||
|
||||
/// Critical errors that indicate data corruption.
|
||||
@@ -6158,7 +6145,7 @@ impl Timeline {
|
||||
layers_to_remove: &[Layer],
|
||||
) -> Result<(), CompactionError> {
|
||||
let mut guard = tokio::select! {
|
||||
guard = self.layers.write() => guard,
|
||||
guard = self.layers.write(LayerManagerLockHolder::Compaction) => guard,
|
||||
_ = self.cancel.cancelled() => {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
@@ -6207,7 +6194,7 @@ impl Timeline {
|
||||
self.remote_client
|
||||
.schedule_compaction_update(&remove_layers, new_deltas)?;
|
||||
|
||||
drop_wlock(guard);
|
||||
drop_layer_manager_wlock(guard);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -6217,7 +6204,7 @@ impl Timeline {
|
||||
mut replace_layers: Vec<(Layer, ResidentLayer)>,
|
||||
mut drop_layers: Vec<Layer>,
|
||||
) -> Result<(), CompactionError> {
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await;
|
||||
|
||||
// Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want
|
||||
// to avoid double-removing, and avoid rewriting something that was removed.
|
||||
@@ -6568,7 +6555,10 @@ impl Timeline {
|
||||
// 5. newer on-disk image layers cover the layer's whole key range
|
||||
//
|
||||
// TODO holding a write lock is too agressive and avoidable
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::GarbageCollection)
|
||||
.await;
|
||||
let layers = guard.layer_map()?;
|
||||
'outer: for l in layers.iter_historic_layers() {
|
||||
result.layers_total += 1;
|
||||
@@ -6870,7 +6860,10 @@ impl Timeline {
|
||||
use pageserver_api::models::DownloadRemoteLayersTaskState;
|
||||
|
||||
let remaining = {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let Ok(lm) = guard.layer_map() else {
|
||||
// technically here we could look into iterating accessible layers, but downloading
|
||||
// all layers of a shutdown timeline makes no sense regardless.
|
||||
@@ -6976,7 +6969,7 @@ impl Timeline {
|
||||
impl Timeline {
|
||||
/// Returns non-remote layers for eviction.
|
||||
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::Eviction).await;
|
||||
let mut max_layer_size: Option<u64> = None;
|
||||
|
||||
let resident_layers = guard
|
||||
@@ -7077,7 +7070,7 @@ impl Timeline {
|
||||
let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
|
||||
info!("force created image layer {}", image_layer.local_path());
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
|
||||
guard
|
||||
.open_mut()
|
||||
.unwrap()
|
||||
@@ -7140,7 +7133,7 @@ impl Timeline {
|
||||
let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
|
||||
info!("force created delta layer {}", delta_layer.local_path());
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
|
||||
guard
|
||||
.open_mut()
|
||||
.unwrap()
|
||||
@@ -7235,7 +7228,7 @@ impl Timeline {
|
||||
|
||||
// Link the layer to the layer map
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
|
||||
let layer_map = guard.open_mut().unwrap();
|
||||
layer_map.force_insert_in_memory_layer(Arc::new(layer));
|
||||
}
|
||||
@@ -7252,7 +7245,7 @@ impl Timeline {
|
||||
io_concurrency: IoConcurrency,
|
||||
) -> anyhow::Result<Vec<(Key, Bytes)>> {
|
||||
let mut all_data = Vec::new();
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
for layer in guard.layer_map()?.iter_historic_layers() {
|
||||
if !layer.is_delta() && layer.image_layer_lsn() == lsn {
|
||||
let layer = guard.get_from_desc(&layer);
|
||||
@@ -7281,7 +7274,7 @@ impl Timeline {
|
||||
self: &Arc<Timeline>,
|
||||
) -> anyhow::Result<Vec<super::storage_layer::PersistentLayerKey>> {
|
||||
let mut layers = Vec::new();
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
for layer in guard.layer_map()?.iter_historic_layers() {
|
||||
layers.push(layer.key());
|
||||
}
|
||||
@@ -7393,7 +7386,7 @@ impl TimelineWriter<'_> {
|
||||
let l0_count = self
|
||||
.tl
|
||||
.layers
|
||||
.read()
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await
|
||||
.layer_map()?
|
||||
.level0_deltas()
|
||||
@@ -7612,6 +7605,7 @@ mod tests {
|
||||
use crate::tenant::harness::{TenantHarness, test_img};
|
||||
use crate::tenant::layer_map::LayerMap;
|
||||
use crate::tenant::storage_layer::{Layer, LayerName, LayerVisibilityHint};
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
use crate::tenant::timeline::{DeltaLayerTestDesc, EvictionError};
|
||||
use crate::tenant::{PreviousHeatmap, Timeline};
|
||||
|
||||
@@ -7719,7 +7713,7 @@ mod tests {
|
||||
// Evict all the layers and stash the old heatmap in the timeline.
|
||||
// This simulates a migration to a cold secondary location.
|
||||
|
||||
let guard = timeline.layers.read().await;
|
||||
let guard = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
let mut all_layers = Vec::new();
|
||||
let forever = std::time::Duration::from_secs(120);
|
||||
for layer in guard.likely_resident_layers() {
|
||||
@@ -7841,7 +7835,7 @@ mod tests {
|
||||
})));
|
||||
|
||||
// Evict all the layers in the previous heatmap
|
||||
let guard = timeline.layers.read().await;
|
||||
let guard = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
let forever = std::time::Duration::from_secs(120);
|
||||
for layer in guard.likely_resident_layers() {
|
||||
layer.evict_and_wait(forever).await.unwrap();
|
||||
@@ -7904,7 +7898,10 @@ mod tests {
|
||||
}
|
||||
|
||||
async fn find_some_layer(timeline: &Timeline) -> Layer {
|
||||
let layers = timeline.layers.read().await;
|
||||
let layers = timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let desc = layers
|
||||
.layer_map()
|
||||
.unwrap()
|
||||
|
||||
@@ -4,6 +4,7 @@ use std::ops::Range;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::Timeline;
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
pub(crate) struct RangeAnalysis {
|
||||
@@ -24,7 +25,10 @@ impl Timeline {
|
||||
|
||||
let num_of_l0;
|
||||
let all_layer_files = {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
num_of_l0 = guard.layer_map().unwrap().level0_deltas().len();
|
||||
guard.all_persistent_layers()
|
||||
};
|
||||
|
||||
@@ -9,11 +9,11 @@ use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::layer_manager::{LayerManagerLockHolder, LayerManagerReadGuard};
|
||||
use super::{
|
||||
CheckOtherForCancel, CompactFlags, CompactOptions, CompactionError, CreateImageLayersError,
|
||||
DurationRecorder, GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus,
|
||||
RecordedDuration, Timeline,
|
||||
CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder,
|
||||
GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration,
|
||||
Timeline,
|
||||
};
|
||||
|
||||
use crate::tenant::timeline::DeltaEntry;
|
||||
@@ -62,7 +62,7 @@ use crate::tenant::storage_layer::{
|
||||
use crate::tenant::tasks::log_compaction_error;
|
||||
use crate::tenant::timeline::{
|
||||
DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer,
|
||||
ResidentLayer, drop_rlock,
|
||||
ResidentLayer, drop_layer_manager_rlock,
|
||||
};
|
||||
use crate::tenant::{DeltaLayer, MaybeOffloaded};
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
@@ -314,7 +314,10 @@ impl GcCompactionQueue {
|
||||
.unwrap_or(Lsn::INVALID);
|
||||
|
||||
let layers = {
|
||||
let guard = timeline.layers.read().await;
|
||||
let guard = timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
layer_map.iter_historic_layers().collect_vec()
|
||||
};
|
||||
@@ -408,7 +411,10 @@ impl GcCompactionQueue {
|
||||
timeline: &Arc<Timeline>,
|
||||
lsn: Lsn,
|
||||
) -> Result<u64, CompactionError> {
|
||||
let guard = timeline.layers.read().await;
|
||||
let guard = timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
let layers = layer_map.iter_historic_layers().collect_vec();
|
||||
let mut size = 0;
|
||||
@@ -851,7 +857,7 @@ impl KeyHistoryRetention {
|
||||
}
|
||||
let layer_generation;
|
||||
{
|
||||
let guard = tline.layers.read().await;
|
||||
let guard = tline.layers.read(LayerManagerLockHolder::Compaction).await;
|
||||
if !guard.contains_key(key) {
|
||||
return false;
|
||||
}
|
||||
@@ -1282,7 +1288,10 @@ impl Timeline {
|
||||
// We do the repartition on the L0-L1 boundary. All data below the boundary
|
||||
// are compacted by L0 with low read amplification, thus making the `repartition`
|
||||
// function run fast.
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
guard
|
||||
.all_persistent_layers()
|
||||
.iter()
|
||||
@@ -1396,7 +1405,7 @@ impl Timeline {
|
||||
|
||||
// Suppress errors when cancelled.
|
||||
Err(_) if self.cancel.is_cancelled() => {}
|
||||
Err(err) if err.is_cancel(CheckOtherForCancel::No) => {}
|
||||
Err(err) if err.is_cancel() => {}
|
||||
|
||||
// Alert on critical errors that indicate data corruption.
|
||||
Err(err) if err.is_critical() => {
|
||||
@@ -1461,7 +1470,7 @@ impl Timeline {
|
||||
let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
|
||||
let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time;
|
||||
|
||||
let layers = self.layers.read().await;
|
||||
let layers = self.layers.read(LayerManagerLockHolder::Compaction).await;
|
||||
let layers_iter = layers.layer_map()?.iter_historic_layers();
|
||||
let (layers_total, mut layers_checked) = (layers_iter.len(), 0);
|
||||
for layer_desc in layers_iter {
|
||||
@@ -1722,7 +1731,10 @@ impl Timeline {
|
||||
// are implicitly left visible, because LayerVisibilityHint's default is Visible, and we never modify it here.
|
||||
// Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that
|
||||
// they will be subject to L0->L1 compaction in the near future.
|
||||
let layer_manager = self.layers.read().await;
|
||||
let layer_manager = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let layer_map = layer_manager.layer_map()?;
|
||||
|
||||
let readable_points = {
|
||||
@@ -1775,7 +1787,7 @@ impl Timeline {
|
||||
};
|
||||
|
||||
let begin = tokio::time::Instant::now();
|
||||
let phase1_layers_locked = self.layers.read().await;
|
||||
let phase1_layers_locked = self.layers.read(LayerManagerLockHolder::Compaction).await;
|
||||
let now = tokio::time::Instant::now();
|
||||
stats.read_lock_acquisition_micros =
|
||||
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
|
||||
@@ -1803,7 +1815,7 @@ impl Timeline {
|
||||
/// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
|
||||
async fn compact_level0_phase1<'a>(
|
||||
self: &'a Arc<Self>,
|
||||
guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
|
||||
guard: LayerManagerReadGuard<'a>,
|
||||
mut stats: CompactLevel0Phase1StatsBuilder,
|
||||
target_file_size: u64,
|
||||
force_compaction_ignore_threshold: bool,
|
||||
@@ -2029,7 +2041,7 @@ impl Timeline {
|
||||
holes
|
||||
};
|
||||
stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
|
||||
drop_rlock(guard);
|
||||
drop_layer_manager_rlock(guard);
|
||||
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
@@ -2469,7 +2481,7 @@ impl Timeline {
|
||||
|
||||
// Find the top of the historical layers
|
||||
let end_lsn = {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
|
||||
let layers = guard.layer_map()?;
|
||||
|
||||
let l0_deltas = layers.level0_deltas();
|
||||
@@ -3008,7 +3020,7 @@ impl Timeline {
|
||||
}
|
||||
split_key_ranges.sort();
|
||||
let all_layers = {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
layer_map.iter_historic_layers().collect_vec()
|
||||
};
|
||||
@@ -3112,12 +3124,12 @@ impl Timeline {
|
||||
.await?;
|
||||
let jobs_len = jobs.len();
|
||||
for (idx, job) in jobs.into_iter().enumerate() {
|
||||
info!(
|
||||
"running enhanced gc bottom-most compaction, sub-compaction {}/{}",
|
||||
idx + 1,
|
||||
jobs_len
|
||||
);
|
||||
let sub_compaction_progress = format!("{}/{}", idx + 1, jobs_len);
|
||||
self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0)
|
||||
.instrument(info_span!(
|
||||
"sub_compaction",
|
||||
sub_compaction_progress = sub_compaction_progress
|
||||
))
|
||||
.await?;
|
||||
}
|
||||
if jobs_len == 0 {
|
||||
@@ -3185,7 +3197,10 @@ impl Timeline {
|
||||
// 1. If a layer is in the selection, all layers below it are in the selection.
|
||||
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
|
||||
let job_desc = {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GarbageCollection)
|
||||
.await;
|
||||
let layers = guard.layer_map()?;
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
let mut retain_lsns_below_horizon = Vec::new();
|
||||
@@ -3516,7 +3531,10 @@ impl Timeline {
|
||||
self.get_compaction_target_size(),
|
||||
&self.gate,
|
||||
self.cancel.clone(),
|
||||
);
|
||||
)
|
||||
.await
|
||||
.context("failed to create delta layer writer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
|
||||
#[derive(Default)]
|
||||
struct RewritingLayers {
|
||||
@@ -3953,7 +3971,10 @@ impl Timeline {
|
||||
|
||||
// First, do a sanity check to ensure the newly-created layer map does not contain overlaps.
|
||||
let all_layers = {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GarbageCollection)
|
||||
.await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
layer_map.iter_historic_layers().collect_vec()
|
||||
};
|
||||
@@ -4017,7 +4038,10 @@ impl Timeline {
|
||||
let update_guard = self.gc_compaction_layer_update_lock.write().await;
|
||||
// Acquiring the update guard ensures current read operations end and new read operations are blocked.
|
||||
// TODO: can we use `latest_gc_cutoff` Rcu to achieve the same effect?
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::GarbageCollection)
|
||||
.await;
|
||||
guard
|
||||
.open_mut()?
|
||||
.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics);
|
||||
@@ -4085,7 +4109,11 @@ impl TimelineAdaptor {
|
||||
|
||||
pub async fn flush_updates(&mut self) -> Result<(), CompactionError> {
|
||||
let layers_to_delete = {
|
||||
let guard = self.timeline.layers.read().await;
|
||||
let guard = self
|
||||
.timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::Compaction)
|
||||
.await;
|
||||
self.layers_to_delete
|
||||
.iter()
|
||||
.map(|x| guard.get_from_desc(x))
|
||||
@@ -4130,7 +4158,11 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
) -> anyhow::Result<Vec<OwnArc<PersistentLayerDesc>>> {
|
||||
self.flush_updates().await?;
|
||||
|
||||
let guard = self.timeline.layers.read().await;
|
||||
let guard = self
|
||||
.timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::Compaction)
|
||||
.await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
|
||||
let result = layer_map
|
||||
@@ -4169,7 +4201,11 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
// this is a lot more complex than a simple downcast...
|
||||
if layer.is_delta() {
|
||||
let l = {
|
||||
let guard = self.timeline.layers.read().await;
|
||||
let guard = self
|
||||
.timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::Compaction)
|
||||
.await;
|
||||
guard.get_from_desc(layer)
|
||||
};
|
||||
let result = l.download_and_keep_resident(ctx).await?;
|
||||
@@ -4294,8 +4330,7 @@ impl TimelineAdaptor {
|
||||
self.timeline.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(CreateImageLayersError::Other)?;
|
||||
.await?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
Err(CreateImageLayersError::Other(anyhow::anyhow!(
|
||||
@@ -4304,10 +4339,7 @@ impl TimelineAdaptor {
|
||||
});
|
||||
|
||||
let keyspace = KeySpace {
|
||||
ranges: self
|
||||
.get_keyspace(key_range, lsn, ctx)
|
||||
.await
|
||||
.map_err(CreateImageLayersError::Other)?,
|
||||
ranges: self.get_keyspace(key_range, lsn, ctx).await?,
|
||||
};
|
||||
// TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
|
||||
let outcome = self
|
||||
@@ -4326,13 +4358,9 @@ impl TimelineAdaptor {
|
||||
unfinished_image_layer,
|
||||
} = outcome
|
||||
{
|
||||
let (desc, path) = unfinished_image_layer
|
||||
.finish(ctx)
|
||||
.await
|
||||
.map_err(CreateImageLayersError::Other)?;
|
||||
let (desc, path) = unfinished_image_layer.finish(ctx).await?;
|
||||
let image_layer =
|
||||
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)
|
||||
.map_err(CreateImageLayersError::Other)?;
|
||||
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
|
||||
self.new_images.push(image_layer);
|
||||
}
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::gate::GateError;
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::layer_manager::{LayerManager, LayerManagerLockHolder};
|
||||
use super::{FlushLayerError, Timeline};
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::TaskKind;
|
||||
@@ -199,7 +199,10 @@ pub(crate) async fn generate_tombstone_image_layer(
|
||||
let image_lsn = ancestor_lsn;
|
||||
|
||||
{
|
||||
let layers = detached.layers.read().await;
|
||||
let layers = detached
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::DetachAncestor)
|
||||
.await;
|
||||
for layer in layers.all_persistent_layers() {
|
||||
if !layer.is_delta
|
||||
&& layer.lsn_range.start == image_lsn
|
||||
@@ -423,7 +426,7 @@ pub(super) async fn prepare(
|
||||
// we do not need to start from our layers, because they can only be layers that come
|
||||
// *after* ancestor_lsn
|
||||
let layers = tokio::select! {
|
||||
guard = ancestor.layers.read() => guard,
|
||||
guard = ancestor.layers.read(LayerManagerLockHolder::DetachAncestor) => guard,
|
||||
_ = detached.cancel.cancelled() => {
|
||||
return Err(ShuttingDown);
|
||||
}
|
||||
@@ -869,7 +872,12 @@ async fn remote_copy(
|
||||
|
||||
// Double check that the file is orphan (probably from an earlier attempt), then delete it
|
||||
let key = file_name.clone().into();
|
||||
if adoptee.layers.read().await.contains_key(&key) {
|
||||
if adoptee
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::DetachAncestor)
|
||||
.await
|
||||
.contains_key(&key)
|
||||
{
|
||||
// We are supposed to filter out such cases before coming to this function
|
||||
return Err(Error::Prepare(anyhow::anyhow!(
|
||||
"layer file {file_name} already present and inside layer map"
|
||||
|
||||
@@ -33,6 +33,7 @@ use crate::tenant::size::CalculateSyntheticSizeError;
|
||||
use crate::tenant::storage_layer::LayerVisibilityHint;
|
||||
use crate::tenant::tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit, sleep_random};
|
||||
use crate::tenant::timeline::EvictionError;
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, TenantShard};
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -208,7 +209,7 @@ impl Timeline {
|
||||
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
{
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::Eviction).await;
|
||||
|
||||
guard
|
||||
.likely_resident_layers()
|
||||
|
||||
@@ -15,6 +15,7 @@ use super::{Timeline, TimelineDeleteProgress};
|
||||
use crate::context::RequestContext;
|
||||
use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient};
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
|
||||
mod flow;
|
||||
mod importbucket_client;
|
||||
@@ -163,7 +164,10 @@ async fn prepare_import(
|
||||
info!("wipe the slate clean");
|
||||
{
|
||||
// TODO: do we need to hold GC lock for this?
|
||||
let mut guard = timeline.layers.write().await;
|
||||
let mut guard = timeline
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::ImportPgData)
|
||||
.await;
|
||||
assert!(
|
||||
guard.layer_map()?.open_layer.is_none(),
|
||||
"while importing, there should be no in-memory layer" // this just seems like a good place to assert it
|
||||
|
||||
@@ -56,6 +56,7 @@ use crate::pgdatadir_mapping::{
|
||||
};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::storage_layer::{AsLayerDesc, ImageLayerWriter, Layer};
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
|
||||
pub async fn run(
|
||||
timeline: Arc<Timeline>,
|
||||
@@ -984,7 +985,10 @@ impl ChunkProcessingJob {
|
||||
let (desc, path) = writer.finish(ctx).await?;
|
||||
|
||||
{
|
||||
let guard = timeline.layers.read().await;
|
||||
let guard = timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::ImportPgData)
|
||||
.await;
|
||||
let existing_layer = guard.try_get_from_key(&desc.key());
|
||||
if let Some(layer) = existing_layer {
|
||||
if layer.metadata().generation == timeline.generation {
|
||||
@@ -1007,7 +1011,10 @@ impl ChunkProcessingJob {
|
||||
// certain that the existing layer is identical to the new one, so in that case
|
||||
// we replace the old layer with the one we just generated.
|
||||
|
||||
let mut guard = timeline.layers.write().await;
|
||||
let mut guard = timeline
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::ImportPgData)
|
||||
.await;
|
||||
|
||||
let existing_layer = guard
|
||||
.try_get_from_key(&resident_layer.layer_desc().key())
|
||||
@@ -1036,7 +1043,7 @@ impl ChunkProcessingJob {
|
||||
}
|
||||
}
|
||||
|
||||
crate::tenant::timeline::drop_wlock(guard);
|
||||
crate::tenant::timeline::drop_layer_manager_wlock(guard);
|
||||
|
||||
timeline
|
||||
.remote_client
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
use std::collections::HashMap;
|
||||
use std::mem::ManuallyDrop;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, bail, ensure};
|
||||
use itertools::Itertools;
|
||||
@@ -20,6 +23,155 @@ use crate::tenant::storage_layer::{
|
||||
PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
|
||||
};
|
||||
|
||||
/// Warn if the lock was held for longer than this threshold.
|
||||
/// It's very generous and we should bring this value down over time.
|
||||
const LAYER_MANAGER_LOCK_WARN_THRESHOLD: Duration = Duration::from_secs(5);
|
||||
const LAYER_MANAGER_LOCK_READ_WARN_THRESHOLD: Duration = Duration::from_secs(30);
|
||||
|
||||
/// Describes the operation that is holding the layer manager lock
|
||||
#[derive(Debug, Clone, Copy, strum_macros::Display)]
|
||||
#[strum(serialize_all = "kebab_case")]
|
||||
pub(crate) enum LayerManagerLockHolder {
|
||||
GetLayerMapInfo,
|
||||
GenerateHeatmap,
|
||||
GetPage,
|
||||
Init,
|
||||
LoadLayerMap,
|
||||
GetLayerForWrite,
|
||||
TryFreezeLayer,
|
||||
FlushFrozenLayer,
|
||||
FlushLoop,
|
||||
Compaction,
|
||||
GarbageCollection,
|
||||
Shutdown,
|
||||
ImportPgData,
|
||||
DetachAncestor,
|
||||
Eviction,
|
||||
#[cfg(test)]
|
||||
Testing,
|
||||
}
|
||||
|
||||
/// Wrapper for the layer manager that tracks the amount of time during which
|
||||
/// it was held under read or write lock
|
||||
#[derive(Default)]
|
||||
pub(crate) struct LockedLayerManager {
|
||||
locked: tokio::sync::RwLock<LayerManager>,
|
||||
}
|
||||
|
||||
pub(crate) struct LayerManagerReadGuard<'a> {
|
||||
guard: ManuallyDrop<tokio::sync::RwLockReadGuard<'a, LayerManager>>,
|
||||
acquired_at: std::time::Instant,
|
||||
holder: LayerManagerLockHolder,
|
||||
}
|
||||
|
||||
pub(crate) struct LayerManagerWriteGuard<'a> {
|
||||
guard: ManuallyDrop<tokio::sync::RwLockWriteGuard<'a, LayerManager>>,
|
||||
acquired_at: std::time::Instant,
|
||||
holder: LayerManagerLockHolder,
|
||||
}
|
||||
|
||||
impl Drop for LayerManagerReadGuard<'_> {
|
||||
fn drop(&mut self) {
|
||||
// Drop the lock first, before potentially warning if it was held for too long.
|
||||
// SAFETY: ManuallyDrop in Drop implementation
|
||||
unsafe { ManuallyDrop::drop(&mut self.guard) };
|
||||
|
||||
let held_for = self.acquired_at.elapsed();
|
||||
if held_for >= LAYER_MANAGER_LOCK_READ_WARN_THRESHOLD {
|
||||
tracing::warn!(
|
||||
holder=%self.holder,
|
||||
"Layer manager read lock held for {}s",
|
||||
held_for.as_secs_f64(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for LayerManagerWriteGuard<'_> {
|
||||
fn drop(&mut self) {
|
||||
// Drop the lock first, before potentially warning if it was held for too long.
|
||||
// SAFETY: ManuallyDrop in Drop implementation
|
||||
unsafe { ManuallyDrop::drop(&mut self.guard) };
|
||||
|
||||
let held_for = self.acquired_at.elapsed();
|
||||
if held_for >= LAYER_MANAGER_LOCK_WARN_THRESHOLD {
|
||||
tracing::warn!(
|
||||
holder=%self.holder,
|
||||
"Layer manager write lock held for {}s",
|
||||
held_for.as_secs_f64(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for LayerManagerReadGuard<'_> {
|
||||
type Target = LayerManager;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.guard.deref()
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for LayerManagerWriteGuard<'_> {
|
||||
type Target = LayerManager;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.guard.deref()
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for LayerManagerWriteGuard<'_> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.guard.deref_mut()
|
||||
}
|
||||
}
|
||||
|
||||
impl LockedLayerManager {
|
||||
pub(crate) async fn read(&self, holder: LayerManagerLockHolder) -> LayerManagerReadGuard {
|
||||
let guard = ManuallyDrop::new(self.locked.read().await);
|
||||
LayerManagerReadGuard {
|
||||
guard,
|
||||
acquired_at: std::time::Instant::now(),
|
||||
holder,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn try_read(
|
||||
&self,
|
||||
holder: LayerManagerLockHolder,
|
||||
) -> Result<LayerManagerReadGuard, tokio::sync::TryLockError> {
|
||||
let guard = ManuallyDrop::new(self.locked.try_read()?);
|
||||
|
||||
Ok(LayerManagerReadGuard {
|
||||
guard,
|
||||
acquired_at: std::time::Instant::now(),
|
||||
holder,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) async fn write(&self, holder: LayerManagerLockHolder) -> LayerManagerWriteGuard {
|
||||
let guard = ManuallyDrop::new(self.locked.write().await);
|
||||
LayerManagerWriteGuard {
|
||||
guard,
|
||||
acquired_at: std::time::Instant::now(),
|
||||
holder,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn try_write(
|
||||
&self,
|
||||
holder: LayerManagerLockHolder,
|
||||
) -> Result<LayerManagerWriteGuard, tokio::sync::TryLockError> {
|
||||
let guard = ManuallyDrop::new(self.locked.try_write()?);
|
||||
|
||||
Ok(LayerManagerWriteGuard {
|
||||
guard,
|
||||
acquired_at: std::time::Instant::now(),
|
||||
holder,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Provides semantic APIs to manipulate the layer map.
|
||||
pub(crate) enum LayerManager {
|
||||
/// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate
|
||||
|
||||
@@ -32,9 +32,7 @@ use utils::backoff::{
|
||||
};
|
||||
use utils::id::{NodeId, TenantTimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::postgres_client::{
|
||||
ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config,
|
||||
};
|
||||
use utils::postgres_client::{ConnectionConfigArgs, wal_stream_connection_config};
|
||||
|
||||
use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError};
|
||||
use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf};
|
||||
@@ -991,19 +989,12 @@ impl ConnectionManagerState {
|
||||
return None; // no connection string, ignore sk
|
||||
}
|
||||
|
||||
let (shard_number, shard_count, shard_stripe_size) = match self.conf.protocol {
|
||||
PostgresClientProtocol::Vanilla => {
|
||||
(None, None, None)
|
||||
},
|
||||
PostgresClientProtocol::Interpreted { .. } => {
|
||||
let shard_identity = self.timeline.get_shard_identity();
|
||||
(
|
||||
Some(shard_identity.number.0),
|
||||
Some(shard_identity.count.0),
|
||||
Some(shard_identity.stripe_size.0),
|
||||
)
|
||||
}
|
||||
};
|
||||
let shard_identity = self.timeline.get_shard_identity();
|
||||
let (shard_number, shard_count, shard_stripe_size) = (
|
||||
Some(shard_identity.number.0),
|
||||
Some(shard_identity.count.0),
|
||||
Some(shard_identity.stripe_size.0),
|
||||
);
|
||||
|
||||
let connection_conf_args = ConnectionConfigArgs {
|
||||
protocol: self.conf.protocol,
|
||||
@@ -1120,8 +1111,8 @@ impl ReconnectReason {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL;
|
||||
use url::Host;
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
|
||||
use super::*;
|
||||
use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
|
||||
@@ -1552,6 +1543,11 @@ mod tests {
|
||||
.await
|
||||
.expect("Failed to create an empty timeline for dummy wal connection manager");
|
||||
|
||||
let protocol = PostgresClientProtocol::Interpreted {
|
||||
format: utils::postgres_client::InterpretedFormat::Protobuf,
|
||||
compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }),
|
||||
};
|
||||
|
||||
ConnectionManagerState {
|
||||
id: TenantTimelineId {
|
||||
tenant_id: harness.tenant_shard_id.tenant_id,
|
||||
@@ -1560,7 +1556,7 @@ mod tests {
|
||||
timeline,
|
||||
cancel: CancellationToken::new(),
|
||||
conf: WalReceiverConf {
|
||||
protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
|
||||
protocol,
|
||||
wal_connect_timeout: Duration::from_secs(1),
|
||||
lagging_wal_timeout: Duration::from_secs(1),
|
||||
max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
|
||||
|
||||
@@ -15,7 +15,7 @@ use postgres_backend::is_expected_io_error;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use postgres_ffi::v14::xlog_utils::normalize_lsn;
|
||||
use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder};
|
||||
use postgres_ffi::waldecoder::WalDecodeError;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use tokio::sync::watch;
|
||||
@@ -31,7 +31,7 @@ use utils::lsn::Lsn;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
use utils::sync::gate::GateError;
|
||||
use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords};
|
||||
use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecords};
|
||||
use wal_decoder::wire_format::FromWireFormat;
|
||||
|
||||
use super::TaskStateUpdate;
|
||||
@@ -275,8 +275,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
let copy_stream = replication_client.copy_both_simple(&query).await?;
|
||||
let mut physical_stream = pin!(ReplicationStream::new(copy_stream));
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
|
||||
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
|
||||
.await
|
||||
.map_err(|e| match e.kind {
|
||||
@@ -284,14 +282,16 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
_ => WalReceiverError::Other(e.into()),
|
||||
})?;
|
||||
|
||||
let shard = vec![*timeline.get_shard_identity()];
|
||||
|
||||
let interpreted_proto_config = match protocol {
|
||||
PostgresClientProtocol::Vanilla => None,
|
||||
let (format, compression) = match protocol {
|
||||
PostgresClientProtocol::Interpreted {
|
||||
format,
|
||||
compression,
|
||||
} => Some((format, compression)),
|
||||
} => (format, compression),
|
||||
PostgresClientProtocol::Vanilla => {
|
||||
return Err(WalReceiverError::Other(anyhow!(
|
||||
"Vanilla WAL receiver protocol is no longer supported for ingest"
|
||||
)));
|
||||
}
|
||||
};
|
||||
|
||||
let mut expected_wal_start = startpoint;
|
||||
@@ -313,16 +313,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// Update the connection status before processing the message. If the message processing
|
||||
// fails (e.g. in walingest), we still want to know latests LSNs from the safekeeper.
|
||||
match &replication_message {
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
connection_status.latest_connection_update = now;
|
||||
connection_status.commit_lsn = Some(Lsn::from(xlog_data.wal_end()));
|
||||
connection_status.streaming_lsn = Some(Lsn::from(
|
||||
xlog_data.wal_start() + xlog_data.data().len() as u64,
|
||||
));
|
||||
if !xlog_data.data().is_empty() {
|
||||
connection_status.latest_wal_update = now;
|
||||
}
|
||||
}
|
||||
ReplicationMessage::PrimaryKeepAlive(keepalive) => {
|
||||
connection_status.latest_connection_update = now;
|
||||
connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end()));
|
||||
@@ -353,7 +343,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// were interpreted.
|
||||
let streaming_lsn = Lsn::from(raw.streaming_lsn());
|
||||
|
||||
let (format, compression) = interpreted_proto_config.unwrap();
|
||||
let batch = InterpretedWalRecords::from_wire(raw.data(), format, compression)
|
||||
.await
|
||||
.with_context(|| {
|
||||
@@ -509,138 +498,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
Some(streaming_lsn)
|
||||
}
|
||||
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
async fn commit(
|
||||
modification: &mut DatadirModification<'_>,
|
||||
uncommitted: &mut u64,
|
||||
filtered: &mut u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let stats = modification.stats();
|
||||
modification.commit(ctx).await?;
|
||||
WAL_INGEST
|
||||
.records_committed
|
||||
.inc_by(*uncommitted - *filtered);
|
||||
WAL_INGEST.inc_values_committed(&stats);
|
||||
*uncommitted = 0;
|
||||
*filtered = 0;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Pass the WAL data to the decoder, and see if we can decode
|
||||
// more records as a result.
|
||||
let data = xlog_data.data();
|
||||
let startlsn = Lsn::from(xlog_data.wal_start());
|
||||
let endlsn = startlsn + data.len() as u64;
|
||||
|
||||
trace!("received XLogData between {startlsn} and {endlsn}");
|
||||
|
||||
WAL_INGEST.bytes_received.inc_by(data.len() as u64);
|
||||
waldecoder.feed_bytes(data);
|
||||
|
||||
{
|
||||
let mut modification = timeline.begin_modification(startlsn);
|
||||
let mut uncommitted_records = 0;
|
||||
let mut filtered_records = 0;
|
||||
|
||||
while let Some((next_record_lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
// It is important to deal with the aligned records as lsn in getPage@LSN is
|
||||
// aligned and can be several bytes bigger. Without this alignment we are
|
||||
// at risk of hitting a deadlock.
|
||||
if !next_record_lsn.is_aligned() {
|
||||
return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
|
||||
}
|
||||
|
||||
// Deserialize and interpret WAL record
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
&shard,
|
||||
next_record_lsn,
|
||||
modification.tline.pg_version,
|
||||
)?
|
||||
.remove(timeline.get_shard_identity())
|
||||
.unwrap();
|
||||
|
||||
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
|
||||
&& uncommitted_records > 0
|
||||
{
|
||||
// Special case: legacy PG database creations operate by reading pages from a 'template' database:
|
||||
// these are the only kinds of WAL record that require reading data blocks while ingesting. Ensure
|
||||
// all earlier writes of data blocks are visible by committing any modification in flight.
|
||||
commit(
|
||||
&mut modification,
|
||||
&mut uncommitted_records,
|
||||
&mut filtered_records,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Ingest the records without immediately committing them.
|
||||
timeline.metrics.wal_records_received.inc();
|
||||
let ingested = walingest
|
||||
.ingest_record(interpreted, &mut modification, &ctx)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("could not ingest record at {next_record_lsn}")
|
||||
})
|
||||
.inspect_err(|err| {
|
||||
// TODO: we can't differentiate cancellation errors with
|
||||
// anyhow::Error, so just ignore it if we're cancelled.
|
||||
if !cancellation.is_cancelled() && !timeline.is_stopping() {
|
||||
critical!("{err:?}")
|
||||
}
|
||||
})?;
|
||||
if !ingested {
|
||||
tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");
|
||||
WAL_INGEST.records_filtered.inc();
|
||||
filtered_records += 1;
|
||||
}
|
||||
|
||||
// FIXME: this cannot be made pausable_failpoint without fixing the
|
||||
// failpoint library; in tests, the added amount of debugging will cause us
|
||||
// to timeout the tests.
|
||||
fail_point!("walreceiver-after-ingest");
|
||||
|
||||
last_rec_lsn = next_record_lsn;
|
||||
|
||||
// Commit every ingest_batch_size records. Even if we filtered out
|
||||
// all records, we still need to call commit to advance the LSN.
|
||||
uncommitted_records += 1;
|
||||
if uncommitted_records >= ingest_batch_size
|
||||
|| modification.approx_pending_bytes()
|
||||
> DatadirModification::MAX_PENDING_BYTES
|
||||
{
|
||||
commit(
|
||||
&mut modification,
|
||||
&mut uncommitted_records,
|
||||
&mut filtered_records,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
// Commit the remaining records.
|
||||
if uncommitted_records > 0 {
|
||||
commit(
|
||||
&mut modification,
|
||||
&mut uncommitted_records,
|
||||
&mut filtered_records,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
if !caught_up && endlsn >= end_of_wal {
|
||||
info!("caught up at LSN {endlsn}");
|
||||
caught_up = true;
|
||||
}
|
||||
|
||||
Some(endlsn)
|
||||
}
|
||||
|
||||
ReplicationMessage::PrimaryKeepAlive(keepalive) => {
|
||||
let wal_end = keepalive.wal_end();
|
||||
let timestamp = keepalive.timestamp();
|
||||
|
||||
@@ -21,7 +21,7 @@ OBJS = \
|
||||
unstable_extensions.o \
|
||||
walproposer.o \
|
||||
walproposer_pg.o \
|
||||
control_plane_connector.o \
|
||||
neon_ddl_handler.o \
|
||||
walsender_hooks.o
|
||||
|
||||
PG_CPPFLAGS = -I$(libpq_srcdir)
|
||||
|
||||
@@ -1092,13 +1092,15 @@ communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
|
||||
MyPState->ring_last <= ring_index);
|
||||
}
|
||||
|
||||
/* internal version. Returns the ring index */
|
||||
/* Internal version. Returns the ring index of the last block (result of this function is used only
|
||||
* when nblocks==1)
|
||||
*/
|
||||
static uint64
|
||||
prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
|
||||
BlockNumber nblocks, const bits8 *mask,
|
||||
bool is_prefetch)
|
||||
{
|
||||
uint64 min_ring_index;
|
||||
uint64 last_ring_index;
|
||||
PrefetchRequest hashkey;
|
||||
#ifdef USE_ASSERT_CHECKING
|
||||
bool any_hits = false;
|
||||
@@ -1122,13 +1124,12 @@ Retry:
|
||||
MyPState->ring_unused - MyPState->ring_receive;
|
||||
MyNeonCounters->getpage_prefetches_buffered =
|
||||
MyPState->n_responses_buffered;
|
||||
last_ring_index = UINT64_MAX;
|
||||
|
||||
min_ring_index = UINT64_MAX;
|
||||
for (int i = 0; i < nblocks; i++)
|
||||
{
|
||||
PrefetchRequest *slot = NULL;
|
||||
PrfHashEntry *entry = NULL;
|
||||
uint64 ring_index;
|
||||
neon_request_lsns *lsns;
|
||||
|
||||
if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
|
||||
@@ -1152,12 +1153,12 @@ Retry:
|
||||
if (entry != NULL)
|
||||
{
|
||||
slot = entry->slot;
|
||||
ring_index = slot->my_ring_index;
|
||||
Assert(slot == GetPrfSlot(ring_index));
|
||||
last_ring_index = slot->my_ring_index;
|
||||
Assert(slot == GetPrfSlot(last_ring_index));
|
||||
|
||||
Assert(slot->status != PRFS_UNUSED);
|
||||
Assert(MyPState->ring_last <= ring_index &&
|
||||
ring_index < MyPState->ring_unused);
|
||||
Assert(MyPState->ring_last <= last_ring_index &&
|
||||
last_ring_index < MyPState->ring_unused);
|
||||
Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag));
|
||||
|
||||
/*
|
||||
@@ -1169,9 +1170,9 @@ Retry:
|
||||
if (!neon_prefetch_response_usable(lsns, slot))
|
||||
{
|
||||
/* Wait for the old request to finish and discard it */
|
||||
if (!prefetch_wait_for(ring_index))
|
||||
if (!prefetch_wait_for(last_ring_index))
|
||||
goto Retry;
|
||||
prefetch_set_unused(ring_index);
|
||||
prefetch_set_unused(last_ring_index);
|
||||
entry = NULL;
|
||||
slot = NULL;
|
||||
pgBufferUsage.prefetch.expired += 1;
|
||||
@@ -1188,13 +1189,12 @@ Retry:
|
||||
*/
|
||||
if (slot->status == PRFS_TAG_REMAINS)
|
||||
{
|
||||
prefetch_set_unused(ring_index);
|
||||
prefetch_set_unused(last_ring_index);
|
||||
entry = NULL;
|
||||
slot = NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
min_ring_index = Min(min_ring_index, ring_index);
|
||||
/* The buffered request is good enough, return that index */
|
||||
if (is_prefetch)
|
||||
pgBufferUsage.prefetch.duplicates++;
|
||||
@@ -1283,12 +1283,12 @@ Retry:
|
||||
* The next buffer pointed to by `ring_unused` is now definitely empty, so
|
||||
* we can insert the new request to it.
|
||||
*/
|
||||
ring_index = MyPState->ring_unused;
|
||||
last_ring_index = MyPState->ring_unused;
|
||||
|
||||
Assert(MyPState->ring_last <= ring_index &&
|
||||
ring_index <= MyPState->ring_unused);
|
||||
Assert(MyPState->ring_last <= last_ring_index &&
|
||||
last_ring_index <= MyPState->ring_unused);
|
||||
|
||||
slot = GetPrfSlotNoCheck(ring_index);
|
||||
slot = GetPrfSlotNoCheck(last_ring_index);
|
||||
|
||||
Assert(slot->status == PRFS_UNUSED);
|
||||
|
||||
@@ -1298,11 +1298,9 @@ Retry:
|
||||
*/
|
||||
slot->buftag = hashkey.buftag;
|
||||
slot->shard_no = get_shard_number(&tag);
|
||||
slot->my_ring_index = ring_index;
|
||||
slot->my_ring_index = last_ring_index;
|
||||
slot->flags = 0;
|
||||
|
||||
min_ring_index = Min(min_ring_index, ring_index);
|
||||
|
||||
if (is_prefetch)
|
||||
MyNeonCounters->getpage_prefetch_requests_total++;
|
||||
else
|
||||
@@ -1315,11 +1313,12 @@ Retry:
|
||||
MyPState->ring_unused - MyPState->ring_receive;
|
||||
|
||||
Assert(any_hits);
|
||||
Assert(last_ring_index != UINT64_MAX);
|
||||
|
||||
Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED ||
|
||||
GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED);
|
||||
Assert(MyPState->ring_last <= min_ring_index &&
|
||||
min_ring_index < MyPState->ring_unused);
|
||||
Assert(GetPrfSlot(last_ring_index)->status == PRFS_REQUESTED ||
|
||||
GetPrfSlot(last_ring_index)->status == PRFS_RECEIVED);
|
||||
Assert(MyPState->ring_last <= last_ring_index &&
|
||||
last_ring_index < MyPState->ring_unused);
|
||||
|
||||
if (flush_every_n_requests > 0 &&
|
||||
MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
|
||||
@@ -1335,7 +1334,7 @@ Retry:
|
||||
MyPState->ring_flush = MyPState->ring_unused;
|
||||
}
|
||||
|
||||
return min_ring_index;
|
||||
return last_ring_index;
|
||||
}
|
||||
|
||||
static bool
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
#ifndef CONTROL_PLANE_CONNECTOR_H
|
||||
#define CONTROL_PLANE_CONNECTOR_H
|
||||
|
||||
void InitControlPlaneConnector(void);
|
||||
|
||||
#endif
|
||||
@@ -2,6 +2,6 @@ DROP FUNCTION IF EXISTS get_prewarm_info(out total_pages integer, out prewarmed_
|
||||
|
||||
DROP FUNCTION IF EXISTS get_local_cache_state(max_chunks integer);
|
||||
|
||||
DROP FUNCTION IF EXISTS prewarm_local_cache(state bytea, n_workers integer default 1);
|
||||
DROP FUNCTION IF EXISTS prewarm_local_cache(state bytea, n_workers integer);
|
||||
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#if PG_MAJORVERSION_NUM >= 15
|
||||
#include "access/xlogrecovery.h"
|
||||
#endif
|
||||
#include "executor/instrument.h"
|
||||
#include "replication/logical.h"
|
||||
#include "replication/logicallauncher.h"
|
||||
#include "replication/slot.h"
|
||||
@@ -32,8 +33,9 @@
|
||||
#include "extension_server.h"
|
||||
#include "file_cache.h"
|
||||
#include "neon.h"
|
||||
#include "neon_ddl_handler.h"
|
||||
#include "neon_lwlsncache.h"
|
||||
#include "control_plane_connector.h"
|
||||
#include "neon_perf_counters.h"
|
||||
#include "logical_replication_monitor.h"
|
||||
#include "unstable_extensions.h"
|
||||
#include "walsender_hooks.h"
|
||||
@@ -46,6 +48,13 @@ void _PG_init(void);
|
||||
|
||||
|
||||
static int running_xacts_overflow_policy;
|
||||
static bool monitor_query_exec_time = false;
|
||||
|
||||
static ExecutorStart_hook_type prev_ExecutorStart = NULL;
|
||||
static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
|
||||
|
||||
static void neon_ExecutorStart(QueryDesc *queryDesc, int eflags);
|
||||
static void neon_ExecutorEnd(QueryDesc *queryDesc);
|
||||
|
||||
#if PG_MAJORVERSION_NUM >= 16
|
||||
static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||
@@ -445,7 +454,7 @@ _PG_init(void)
|
||||
|
||||
InitUnstableExtensionsSupport();
|
||||
InitLogicalReplicationMonitor();
|
||||
InitControlPlaneConnector();
|
||||
InitDDLHandler();
|
||||
|
||||
pg_init_extension_server();
|
||||
|
||||
@@ -470,6 +479,16 @@ _PG_init(void)
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomBoolVariable(
|
||||
"neon.monitor_query_exec_time",
|
||||
"Collect infortmation about query execution time",
|
||||
NULL,
|
||||
&monitor_query_exec_time,
|
||||
false,
|
||||
PGC_USERSET,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomBoolVariable(
|
||||
"neon.allow_replica_misconfig",
|
||||
"Allow replica startup when some critical GUCs have smaller value than on primary node",
|
||||
@@ -508,6 +527,11 @@ _PG_init(void)
|
||||
EmitWarningsOnPlaceholders("neon");
|
||||
|
||||
ReportSearchPath();
|
||||
|
||||
prev_ExecutorStart = ExecutorStart_hook;
|
||||
ExecutorStart_hook = neon_ExecutorStart;
|
||||
prev_ExecutorEnd = ExecutorEnd_hook;
|
||||
ExecutorEnd_hook = neon_ExecutorEnd;
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(pg_cluster_size);
|
||||
@@ -581,3 +605,55 @@ neon_shmem_startup_hook(void)
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* ExecutorStart hook: start up tracking if needed
|
||||
*/
|
||||
static void
|
||||
neon_ExecutorStart(QueryDesc *queryDesc, int eflags)
|
||||
{
|
||||
if (prev_ExecutorStart)
|
||||
prev_ExecutorStart(queryDesc, eflags);
|
||||
else
|
||||
standard_ExecutorStart(queryDesc, eflags);
|
||||
|
||||
if (monitor_query_exec_time)
|
||||
{
|
||||
/*
|
||||
* Set up to track total elapsed time in ExecutorRun. Make sure the
|
||||
* space is allocated in the per-query context so it will go away at
|
||||
* ExecutorEnd.
|
||||
*/
|
||||
if (queryDesc->totaltime == NULL)
|
||||
{
|
||||
MemoryContext oldcxt;
|
||||
|
||||
oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
|
||||
queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_TIMER, false);
|
||||
MemoryContextSwitchTo(oldcxt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* ExecutorEnd hook: store results if needed
|
||||
*/
|
||||
static void
|
||||
neon_ExecutorEnd(QueryDesc *queryDesc)
|
||||
{
|
||||
if (monitor_query_exec_time && queryDesc->totaltime)
|
||||
{
|
||||
/*
|
||||
* Make sure stats accumulation is done. (Note: it's okay if several
|
||||
* levels of hook all do this.)
|
||||
*/
|
||||
InstrEndLoop(queryDesc->totaltime);
|
||||
|
||||
inc_query_time(queryDesc->totaltime->total*1000000); /* convert to usec */
|
||||
}
|
||||
|
||||
if (prev_ExecutorEnd)
|
||||
prev_ExecutorEnd(queryDesc);
|
||||
else
|
||||
standard_ExecutorEnd(queryDesc);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* control_plane_connector.c
|
||||
* neon_ddl_handler.c
|
||||
* Captures updates to roles/databases using ProcessUtility_hook and
|
||||
* sends them to the control ProcessUtility_hook. The changes are sent
|
||||
* via HTTP to the URL specified by the GUC neon.console_url when the
|
||||
@@ -13,18 +13,30 @@
|
||||
* accumulate changes. On subtransaction commit, the top of the stack
|
||||
* is merged with the table below it.
|
||||
*
|
||||
* Support event triggers for neon_superuser
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* contrib/neon/neon_dll_handler.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
#include <curl/curl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "access/xact.h"
|
||||
#include "catalog/pg_authid.h"
|
||||
#include "catalog/pg_proc.h"
|
||||
#include "commands/defrem.h"
|
||||
#include "commands/event_trigger.h"
|
||||
#include "commands/user.h"
|
||||
#include "fmgr.h"
|
||||
#include "libpq/crypt.h"
|
||||
#include "miscadmin.h"
|
||||
#include "nodes/makefuncs.h"
|
||||
#include "parser/parse_func.h"
|
||||
#include "tcop/pquery.h"
|
||||
#include "tcop/utility.h"
|
||||
#include "utils/acl.h"
|
||||
@@ -32,11 +44,16 @@
|
||||
#include "utils/hsearch.h"
|
||||
#include "utils/memutils.h"
|
||||
#include "utils/jsonb.h"
|
||||
#include <utils/lsyscache.h>
|
||||
#include <utils/syscache.h>
|
||||
|
||||
#include "control_plane_connector.h"
|
||||
#include "neon_ddl_handler.h"
|
||||
#include "neon_utils.h"
|
||||
|
||||
static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
|
||||
static fmgr_hook_type next_fmgr_hook = NULL;
|
||||
static needs_fmgr_hook_type next_needs_fmgr_hook = NULL;
|
||||
static bool neon_event_triggers = true;
|
||||
|
||||
static const char *jwt_token = NULL;
|
||||
|
||||
@@ -773,6 +790,7 @@ HandleDropRole(DropRoleStmt *stmt)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
HandleRename(RenameStmt *stmt)
|
||||
{
|
||||
@@ -782,6 +800,460 @@ HandleRename(RenameStmt *stmt)
|
||||
return HandleRoleRename(stmt);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Support for Event Triggers.
|
||||
*
|
||||
* In vanilla only superuser can create Event Triggers.
|
||||
*
|
||||
* We allow it for neon_superuser by temporary switching to superuser. But as
|
||||
* far as event trigger can fire in superuser context we should protect
|
||||
* superuser from execution of arbitrary user's code.
|
||||
*
|
||||
* The idea was taken from Supabase PR series starting at
|
||||
* https://github.com/supabase/supautils/pull/98
|
||||
*/
|
||||
|
||||
static bool
|
||||
neon_needs_fmgr_hook(Oid functionId) {
|
||||
|
||||
return (next_needs_fmgr_hook && (*next_needs_fmgr_hook) (functionId))
|
||||
|| get_func_rettype(functionId) == EVENT_TRIGGEROID;
|
||||
}
|
||||
|
||||
static void
|
||||
LookupFuncOwnerSecDef(Oid functionId, Oid *funcOwner, bool *is_secdef)
|
||||
{
|
||||
Form_pg_proc procForm;
|
||||
HeapTuple proc_tup = SearchSysCache1(PROCOID, ObjectIdGetDatum(functionId));
|
||||
|
||||
if (!HeapTupleIsValid(proc_tup))
|
||||
ereport(ERROR,
|
||||
(errmsg("cache lookup failed for function %u", functionId)));
|
||||
|
||||
procForm = (Form_pg_proc) GETSTRUCT(proc_tup);
|
||||
|
||||
*funcOwner = procForm->proowner;
|
||||
*is_secdef = procForm->prosecdef;
|
||||
|
||||
ReleaseSysCache(proc_tup);
|
||||
}
|
||||
|
||||
|
||||
PG_FUNCTION_INFO_V1(noop);
|
||||
Datum noop(__attribute__ ((unused)) PG_FUNCTION_ARGS) { PG_RETURN_VOID();}
|
||||
|
||||
static void
|
||||
force_noop(FmgrInfo *finfo)
|
||||
{
|
||||
finfo->fn_addr = (PGFunction) noop;
|
||||
finfo->fn_oid = InvalidOid; /* not a known function OID anymore */
|
||||
finfo->fn_nargs = 0; /* no arguments for noop */
|
||||
finfo->fn_strict = false;
|
||||
finfo->fn_retset = false;
|
||||
finfo->fn_stats = 0; /* no stats collection */
|
||||
finfo->fn_extra = NULL; /* clear out old context data */
|
||||
finfo->fn_mcxt = CurrentMemoryContext;
|
||||
finfo->fn_expr = NULL; /* no parse tree */
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Skip executing Event Triggers execution for superusers, because Event
|
||||
* Triggers are SECURITY DEFINER and user provided code could then attempt
|
||||
* privilege escalation.
|
||||
*
|
||||
* Also skip executing Event Triggers when GUC neon.event_triggers has been
|
||||
* set to false. This might be necessary to be able to connect again after a
|
||||
* LOGIN Event Trigger has been installed that would prevent connections as
|
||||
* neon_superuser.
|
||||
*/
|
||||
static void
|
||||
neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private)
|
||||
{
|
||||
/*
|
||||
* It can be other needs_fmgr_hook which cause our hook to be invoked for
|
||||
* non-trigger function, so recheck that is is trigger function.
|
||||
*/
|
||||
if (flinfo->fn_oid != InvalidOid &&
|
||||
get_func_rettype(flinfo->fn_oid) != EVENT_TRIGGEROID)
|
||||
{
|
||||
if (next_fmgr_hook)
|
||||
(*next_fmgr_hook) (event, flinfo, private);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* The neon_superuser role can use the GUC neon.event_triggers to disable
|
||||
* firing Event Trigger.
|
||||
*
|
||||
* SET neon.event_triggers TO false;
|
||||
*
|
||||
* This only applies to the neon_superuser role though, and only allows
|
||||
* skipping Event Triggers owned by neon_superuser, which we check by
|
||||
* proxy of the Event Trigger function being owned by neon_superuser.
|
||||
*
|
||||
* A role that is created in role neon_superuser should be allowed to also
|
||||
* benefit from the neon_event_triggers GUC, and will be considered the
|
||||
* same as the neon_superuser role.
|
||||
*/
|
||||
if (event == FHET_START
|
||||
&& !neon_event_triggers
|
||||
&& is_neon_superuser())
|
||||
{
|
||||
Oid neon_superuser_oid = get_role_oid("neon_superuser", false);
|
||||
|
||||
/* Find the Function Attributes (owner Oid, security definer) */
|
||||
const char *fun_owner_name = NULL;
|
||||
Oid fun_owner = InvalidOid;
|
||||
bool fun_is_secdef = false;
|
||||
|
||||
LookupFuncOwnerSecDef(flinfo->fn_oid, &fun_owner, &fun_is_secdef);
|
||||
fun_owner_name = GetUserNameFromId(fun_owner, false);
|
||||
|
||||
if (RoleIsNeonSuperuser(fun_owner_name)
|
||||
|| has_privs_of_role(fun_owner, neon_superuser_oid))
|
||||
{
|
||||
elog(WARNING,
|
||||
"Skipping Event Trigger: neon.event_triggers is false");
|
||||
|
||||
/*
|
||||
* we can't skip execution directly inside the fmgr_hook so instead we
|
||||
* change the event trigger function to a noop function.
|
||||
*/
|
||||
force_noop(flinfo);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Fire Event Trigger if both function owner and current user are
|
||||
* superuser, or none of them are.
|
||||
*/
|
||||
else if (event == FHET_START
|
||||
/* still enable it to pass pg_regress tests */
|
||||
&& !RegressTestMode)
|
||||
{
|
||||
/*
|
||||
* Get the current user oid as of before SECURITY DEFINER change of
|
||||
* CurrentUserId, and that would be SessionUserId.
|
||||
*/
|
||||
Oid current_role_oid = GetSessionUserId();
|
||||
bool role_is_super = superuser_arg(current_role_oid);
|
||||
|
||||
/* Find the Function Attributes (owner Oid, security definer) */
|
||||
Oid function_owner = InvalidOid;
|
||||
bool function_is_secdef = false;
|
||||
bool function_is_owned_by_super = false;
|
||||
|
||||
LookupFuncOwnerSecDef(flinfo->fn_oid, &function_owner, &function_is_secdef);
|
||||
|
||||
function_is_owned_by_super = superuser_arg(function_owner);
|
||||
|
||||
/*
|
||||
* 1. Refuse to run SECURITY DEFINER function that belongs to a
|
||||
* superuser when the current user is not a superuser itself.
|
||||
*/
|
||||
if (!role_is_super
|
||||
&& function_is_owned_by_super
|
||||
&& function_is_secdef)
|
||||
{
|
||||
char *func_name = get_func_name(flinfo->fn_oid);
|
||||
|
||||
ereport(WARNING,
|
||||
(errmsg("Skipping Event Trigger"),
|
||||
errdetail("Event Trigger function \"%s\" is owned by \"%s\" "
|
||||
"and is SECURITY DEFINER",
|
||||
func_name,
|
||||
GetUserNameFromId(function_owner, false))));
|
||||
|
||||
/*
|
||||
* we can't skip execution directly inside the fmgr_hook so
|
||||
* instead we change the event trigger function to a noop
|
||||
* function.
|
||||
*/
|
||||
force_noop(flinfo);
|
||||
}
|
||||
|
||||
/*
|
||||
* 2. Refuse to run functions that belongs to a non-superuser when the
|
||||
* current user is a superuser.
|
||||
*
|
||||
* We could run a SECURITY DEFINER user-function here and be safe with
|
||||
* privilege escalation risks, but superuser roles are only used for
|
||||
* infrastructure maintenance operations, where we prefer to skip
|
||||
* running user-defined code.
|
||||
*/
|
||||
else if (role_is_super && !function_is_owned_by_super)
|
||||
{
|
||||
char *func_name = get_func_name(flinfo->fn_oid);
|
||||
|
||||
ereport(WARNING,
|
||||
(errmsg("Skipping Event Trigger"),
|
||||
errdetail("Event Trigger function \"%s\" "
|
||||
"is owned by non-superuser role \"%s\", "
|
||||
"and current_user \"%s\" is superuser",
|
||||
func_name,
|
||||
GetUserNameFromId(function_owner, false),
|
||||
GetUserNameFromId(current_role_oid, false))));
|
||||
|
||||
/*
|
||||
* we can't skip execution directly inside the fmgr_hook so
|
||||
* instead we change the event trigger function to a noop
|
||||
* function.
|
||||
*/
|
||||
force_noop(flinfo);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (next_fmgr_hook)
|
||||
(*next_fmgr_hook) (event, flinfo, private);
|
||||
}
|
||||
|
||||
static Oid prev_role_oid = 0;
|
||||
static int prev_role_sec_context = 0;
|
||||
static bool switched_to_superuser = false;
|
||||
|
||||
/*
|
||||
* Switch tp superuser if not yet superuser.
|
||||
* Returns false if already switched to superuser.
|
||||
*/
|
||||
static bool
|
||||
switch_to_superuser(void)
|
||||
{
|
||||
Oid superuser_oid;
|
||||
|
||||
if (switched_to_superuser)
|
||||
return false;
|
||||
switched_to_superuser = true;
|
||||
|
||||
superuser_oid = get_role_oid("cloud_admin", true /*missing_ok*/);
|
||||
if (superuser_oid == InvalidOid)
|
||||
superuser_oid = BOOTSTRAP_SUPERUSERID;
|
||||
|
||||
GetUserIdAndSecContext(&prev_role_oid, &prev_role_sec_context);
|
||||
SetUserIdAndSecContext(superuser_oid, prev_role_sec_context |
|
||||
SECURITY_LOCAL_USERID_CHANGE |
|
||||
SECURITY_RESTRICTED_OPERATION);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
switch_to_original_role(void)
|
||||
{
|
||||
SetUserIdAndSecContext(prev_role_oid, prev_role_sec_context);
|
||||
switched_to_superuser = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* ALTER ROLE ... SUPERUSER;
|
||||
*
|
||||
* Used internally to give superuser to a non-privileged role to allow
|
||||
* ownership of superuser-only objects such as Event Trigger.
|
||||
*
|
||||
* ALTER ROLE foo SUPERUSER;
|
||||
* ALTER EVENT TRIGGER ... OWNED BY foo;
|
||||
* ALTER ROLE foo NOSUPERUSER;
|
||||
*
|
||||
* Now the EVENT TRIGGER is owned by foo, who can DROP it without having to be
|
||||
* superuser again.
|
||||
*/
|
||||
static void
|
||||
alter_role_super(const char* rolename, bool make_super)
|
||||
{
|
||||
AlterRoleStmt *alter_stmt = makeNode(AlterRoleStmt);
|
||||
|
||||
DefElem *defel_superuser =
|
||||
#if PG_MAJORVERSION_NUM <= 14
|
||||
makeDefElem("superuser", (Node *) makeInteger(make_super), -1);
|
||||
#else
|
||||
makeDefElem("superuser", (Node *) makeBoolean(make_super), -1);
|
||||
#endif
|
||||
|
||||
RoleSpec *rolespec = makeNode(RoleSpec);
|
||||
rolespec->roletype = ROLESPEC_CSTRING;
|
||||
rolespec->rolename = pstrdup(rolename);
|
||||
rolespec->location = -1;
|
||||
|
||||
alter_stmt->role = rolespec;
|
||||
alter_stmt->options = list_make1(defel_superuser);
|
||||
|
||||
#if PG_MAJORVERSION_NUM < 15
|
||||
AlterRole(alter_stmt);
|
||||
#else
|
||||
/* ParseState *pstate, AlterRoleStmt *stmt */
|
||||
AlterRole(NULL, alter_stmt);
|
||||
#endif
|
||||
|
||||
CommandCounterIncrement();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Changes the OWNER of an Event Trigger.
|
||||
*
|
||||
* Event Triggers can only be owned by superusers, so this ALTER ROLE with
|
||||
* SUPERUSER and then removes the property.
|
||||
*/
|
||||
static void
|
||||
alter_event_trigger_owner(const char *obj_name, Oid role_oid)
|
||||
{
|
||||
char* role_name = GetUserNameFromId(role_oid, false);
|
||||
|
||||
alter_role_super(role_name, true);
|
||||
|
||||
AlterEventTriggerOwner(obj_name, role_oid);
|
||||
CommandCounterIncrement();
|
||||
|
||||
alter_role_super(role_name, false);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Neon processing of the CREATE EVENT TRIGGER requires special attention and
|
||||
* is worth having its own ProcessUtility_hook for that.
|
||||
*/
|
||||
static void
|
||||
ProcessCreateEventTrigger(
|
||||
PlannedStmt *pstmt,
|
||||
const char *queryString,
|
||||
bool readOnlyTree,
|
||||
ProcessUtilityContext context,
|
||||
ParamListInfo params,
|
||||
QueryEnvironment *queryEnv,
|
||||
DestReceiver *dest,
|
||||
QueryCompletion *qc)
|
||||
{
|
||||
Node *parseTree = pstmt->utilityStmt;
|
||||
bool sudo = false;
|
||||
|
||||
/* We double-check that after local variable declaration block */
|
||||
CreateEventTrigStmt *stmt = (CreateEventTrigStmt *) parseTree;
|
||||
|
||||
/*
|
||||
* We are going to change the current user privileges (sudo) and might
|
||||
* need after execution cleanup. For that we want to capture the UserId
|
||||
* before changing it for our sudo implementation.
|
||||
*/
|
||||
const Oid current_user_id = GetUserId();
|
||||
bool current_user_is_super = superuser_arg(current_user_id);
|
||||
|
||||
if (nodeTag(parseTree) != T_CreateEventTrigStmt)
|
||||
{
|
||||
ereport(ERROR,
|
||||
errcode(ERRCODE_INTERNAL_ERROR),
|
||||
errmsg("ProcessCreateEventTrigger called for the wrong command"));
|
||||
}
|
||||
|
||||
/*
|
||||
* Allow neon_superuser to create Event Trigger, while keeping the
|
||||
* ownership of the object.
|
||||
*
|
||||
* For that we give superuser membership to the role for the execution of
|
||||
* the command.
|
||||
*/
|
||||
if (IsTransactionState() && is_neon_superuser())
|
||||
{
|
||||
/* Find the Event Trigger function Oid */
|
||||
Oid func_oid = LookupFuncName(stmt->funcname, 0, NULL, false);
|
||||
|
||||
/* Find the Function Owner Oid */
|
||||
Oid func_owner = InvalidOid;
|
||||
bool is_secdef = false;
|
||||
bool function_is_owned_by_super = false;
|
||||
|
||||
LookupFuncOwnerSecDef(func_oid, &func_owner, &is_secdef);
|
||||
|
||||
function_is_owned_by_super = superuser_arg(func_owner);
|
||||
|
||||
if(!current_user_is_super && function_is_owned_by_super)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
|
||||
errmsg("Permission denied to execute "
|
||||
"a function owned by a superuser role"),
|
||||
errdetail("current user \"%s\" is not a superuser "
|
||||
"and Event Trigger function \"%s\" "
|
||||
"is owned by a superuser",
|
||||
GetUserNameFromId(current_user_id, false),
|
||||
NameListToString(stmt->funcname))));
|
||||
}
|
||||
|
||||
if(current_user_is_super && !function_is_owned_by_super)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
|
||||
errmsg("Permission denied to execute "
|
||||
"a function owned by a non-superuser role"),
|
||||
errdetail("current user \"%s\" is a superuser "
|
||||
"and function \"%s\" is "
|
||||
"owned by a non-superuser",
|
||||
GetUserNameFromId(current_user_id, false),
|
||||
NameListToString(stmt->funcname))));
|
||||
}
|
||||
|
||||
sudo = switch_to_superuser();
|
||||
}
|
||||
|
||||
PG_TRY();
|
||||
{
|
||||
if (PreviousProcessUtilityHook)
|
||||
{
|
||||
PreviousProcessUtilityHook(
|
||||
pstmt,
|
||||
queryString,
|
||||
readOnlyTree,
|
||||
context,
|
||||
params,
|
||||
queryEnv,
|
||||
dest,
|
||||
qc);
|
||||
}
|
||||
else
|
||||
{
|
||||
standard_ProcessUtility(
|
||||
pstmt,
|
||||
queryString,
|
||||
readOnlyTree,
|
||||
context,
|
||||
params,
|
||||
queryEnv,
|
||||
dest,
|
||||
qc);
|
||||
}
|
||||
|
||||
/*
|
||||
* Now that the Event Trigger has been installed via our sudo
|
||||
* mechanism, if the original role was not a superuser then change
|
||||
* the event trigger ownership back to the original role.
|
||||
*
|
||||
* That way [ ALTER | DROP ] EVENT TRIGGER commands just work.
|
||||
*/
|
||||
if (IsTransactionState() && is_neon_superuser())
|
||||
{
|
||||
if (!current_user_is_super)
|
||||
{
|
||||
/*
|
||||
* Change event trigger owner to the current role (making
|
||||
* it a privileged role during the ALTER OWNER command).
|
||||
*/
|
||||
alter_event_trigger_owner(stmt->trigname, current_user_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
PG_FINALLY();
|
||||
{
|
||||
if (sudo)
|
||||
switch_to_original_role();
|
||||
}
|
||||
PG_END_TRY();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Neon hooks for DDLs (handling privileges, limiting features, etc).
|
||||
*/
|
||||
static void
|
||||
NeonProcessUtility(
|
||||
PlannedStmt *pstmt,
|
||||
@@ -795,6 +1267,27 @@ NeonProcessUtility(
|
||||
{
|
||||
Node *parseTree = pstmt->utilityStmt;
|
||||
|
||||
/*
|
||||
* The process utility hook for CREATE EVENT TRIGGER is its own
|
||||
* implementation and warrant being addressed separately from here.
|
||||
*/
|
||||
if (nodeTag(parseTree) == T_CreateEventTrigStmt)
|
||||
{
|
||||
ProcessCreateEventTrigger(
|
||||
pstmt,
|
||||
queryString,
|
||||
readOnlyTree,
|
||||
context,
|
||||
params,
|
||||
queryEnv,
|
||||
dest,
|
||||
qc);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Other commands that need Neon specific implementations are handled here:
|
||||
*/
|
||||
switch (nodeTag(parseTree))
|
||||
{
|
||||
case T_CreatedbStmt:
|
||||
@@ -833,37 +1326,82 @@ NeonProcessUtility(
|
||||
if (PreviousProcessUtilityHook)
|
||||
{
|
||||
PreviousProcessUtilityHook(
|
||||
pstmt,
|
||||
queryString,
|
||||
readOnlyTree,
|
||||
context,
|
||||
params,
|
||||
queryEnv,
|
||||
dest,
|
||||
qc);
|
||||
pstmt,
|
||||
queryString,
|
||||
readOnlyTree,
|
||||
context,
|
||||
params,
|
||||
queryEnv,
|
||||
dest,
|
||||
qc);
|
||||
}
|
||||
else
|
||||
{
|
||||
standard_ProcessUtility(
|
||||
pstmt,
|
||||
queryString,
|
||||
readOnlyTree,
|
||||
context,
|
||||
params,
|
||||
queryEnv,
|
||||
dest,
|
||||
qc);
|
||||
pstmt,
|
||||
queryString,
|
||||
readOnlyTree,
|
||||
context,
|
||||
params,
|
||||
queryEnv,
|
||||
dest,
|
||||
qc);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Only neon_superuser is granted privilege to edit neon.event_triggers GUC.
|
||||
*/
|
||||
static void
|
||||
neon_event_triggers_assign_hook(bool newval, void *extra)
|
||||
{
|
||||
/* MyDatabaseId == InvalidOid || !OidIsValid(GetUserId()) */
|
||||
|
||||
if (IsTransactionState() && !is_neon_superuser())
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
|
||||
errmsg("permission denied to set neon.event_triggers"),
|
||||
errdetail("Only \"neon_superuser\" is allowed to set the GUC")));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
InitControlPlaneConnector()
|
||||
InitDDLHandler()
|
||||
{
|
||||
PreviousProcessUtilityHook = ProcessUtility_hook;
|
||||
ProcessUtility_hook = NeonProcessUtility;
|
||||
|
||||
next_needs_fmgr_hook = needs_fmgr_hook;
|
||||
needs_fmgr_hook = neon_needs_fmgr_hook;
|
||||
|
||||
next_fmgr_hook = fmgr_hook;
|
||||
fmgr_hook = neon_fmgr_hook;
|
||||
|
||||
RegisterXactCallback(NeonXactCallback, NULL);
|
||||
RegisterSubXactCallback(NeonSubXactCallback, NULL);
|
||||
|
||||
/*
|
||||
* The GUC neon.event_triggers should provide the same effect as the
|
||||
* Postgres GUC event_triggers, but the neon one is PGC_USERSET.
|
||||
*
|
||||
* This allows using the GUC in the connection string and work out of a
|
||||
* LOGIN Event Trigger that would break database access, all without
|
||||
* having to edit and reload the Postgres configuration file.
|
||||
*/
|
||||
DefineCustomBoolVariable(
|
||||
"neon.event_triggers",
|
||||
"Enable firing of event triggers",
|
||||
NULL,
|
||||
&neon_event_triggers,
|
||||
true,
|
||||
PGC_USERSET,
|
||||
0,
|
||||
NULL,
|
||||
neon_event_triggers_assign_hook,
|
||||
NULL);
|
||||
|
||||
DefineCustomStringVariable(
|
||||
"neon.console_url",
|
||||
"URL of the Neon Console, which will be forwarded changes to dbs and roles",
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user