Merge branch 'communicator-rewrite' of https://github.com/neondatabase/neon into communicator-rewrite

Commit by David Freifeld, 2025-07-21 15:35:49 -07:00
331 changed files with 17910 additions and 5329 deletions


@@ -33,6 +33,7 @@ workspace-members = [
     "compute_api",
     "consumption_metrics",
     "desim",
+    "json",
     "metrics",
     "pageserver_api",
     "postgres_backend",


@@ -27,4 +27,4 @@
 !storage_controller/
 !vendor/postgres-*/
 !workspace_hack/
-!build_tools/patches
+!build-tools/patches


@@ -7,6 +7,7 @@ self-hosted-runner:
   - small-metal
   - small-arm64
   - unit-perf
+  - unit-perf-aws-arm
   - us-east-2
 config-variables:
   - AWS_ECR_REGION
@@ -30,6 +31,7 @@ config-variables:
   - NEON_PROD_AWS_ACCOUNT_ID
   - PGREGRESS_PG16_PROJECT_ID
   - PGREGRESS_PG17_PROJECT_ID
+  - PREWARM_PGBENCH_SIZE
   - REMOTE_STORAGE_AZURE_CONTAINER
   - REMOTE_STORAGE_AZURE_REGION
   - SLACK_CICD_CHANNEL_ID


@@ -176,7 +176,13 @@ runs:
       fi
       if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
-        cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+        # We don't use code coverage for regression tests (the step is disabled),
+        # so there's no need to collect it.
+        # Ref https://github.com/neondatabase/neon/issues/4540
+        # cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+        cov_prefix=()
+        # Explicitly set LLVM_PROFILE_FILE to /dev/null to avoid writing *.profraw files
+        export LLVM_PROFILE_FILE=/dev/null
       else
         cov_prefix=()
       fi


@@ -150,7 +150,7 @@ jobs:
           secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
           use-fallback: false
           path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
       - name: Cache postgres v15 build
         id: cache_pg_15
@@ -162,7 +162,7 @@ jobs:
           secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
           use-fallback: false
           path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
       - name: Cache postgres v16 build
         id: cache_pg_16
@@ -174,7 +174,7 @@ jobs:
           secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
           use-fallback: false
           path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
       - name: Cache postgres v17 build
         id: cache_pg_17
@@ -186,7 +186,7 @@ jobs:
           secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
           use-fallback: false
           path: pg_install/v17
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
       - name: Build all
         # Note: the Makefile picks up BUILD_TYPE and CARGO_PROFILE from the env variables


@@ -219,6 +219,7 @@ jobs:
           --ignore test_runner/performance/test_cumulative_statistics_persistence.py
           --ignore test_runner/performance/test_perf_many_relations.py
           --ignore test_runner/performance/test_perf_oltp_large_tenant.py
+          --ignore test_runner/performance/test_lfc_prewarm.py
         env:
           BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -410,6 +411,77 @@
         env:
           SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
+  prewarm-test:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
+    env:
+      PGBENCH_SIZE: ${{ vars.PREWARM_PGBENCH_SIZE }}
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 17
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+      PLATFORM: "neon-staging"
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 18000 # 5 hours
+
+      - name: Download Neon artifact
+        uses: ./.github/actions/download
+        with:
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+          path: /tmp/neon/
+          prefix: latest
+          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+
+      - name: Run prewarm benchmark
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ env.BUILD_TYPE }}
+          test_selection: performance/test_lfc_prewarm.py
+          run_in_parallel: false
+          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+          extra_params: -m remote_cluster --timeout 5400
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
+          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+          NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+
+      - name: Create Allure report
+        id: create-allure-report
+        if: ${{ !cancelled() }}
+        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+
   generate-matrices:
     if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
     # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)


@@ -72,7 +72,7 @@ jobs:
       ARCHS: ${{ inputs.archs || '["x64","arm64"]' }}
       DEBIANS: ${{ inputs.debians || '["bullseye","bookworm"]' }}
       IMAGE_TAG: |
-        ${{ hashFiles('build-tools.Dockerfile',
+        ${{ hashFiles('build-tools/Dockerfile',
                       '.github/workflows/build-build-tools-image.yml') }}
     run: |
       echo "archs=${ARCHS}" | tee -a ${GITHUB_OUTPUT}
@@ -144,7 +144,7 @@ jobs:
       - uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0
         with:
-          file: build-tools.Dockerfile
+          file: build-tools/Dockerfile
           context: .
           provenance: false
           push: true


@@ -87,6 +87,29 @@ jobs:
     uses: ./.github/workflows/build-build-tools-image.yml
     secrets: inherit
 
+  lint-yamls:
+    needs: [ meta, check-permissions, build-build-tools-image ]
+    # We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
+    runs-on: [ self-hosted, small ]
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - run: make -C compute manifest-schema-validation
+      - run: make lint-openapi-spec
+
   check-codestyle-python:
     needs: [ meta, check-permissions, build-build-tools-image ]
     # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
@@ -199,28 +222,6 @@
       build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
     secrets: inherit
 
-  validate-compute-manifest:
-    runs-on: ubuntu-22.04
-    needs: [ meta, check-permissions ]
-    # We do need to run this in `.*-rc-pr` because of hotfixes.
-    if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
-    steps:
-      - name: Harden the runner (Audit all outbound calls)
-        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
-        with:
-          egress-policy: audit
-
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Set up Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
-        with:
-          node-version: '24'
-
-      - name: Validate manifest against schema
-        run: |
-          make -C compute manifest-schema-validation
-
   build-and-test-locally:
     needs: [ meta, build-build-tools-image ]
     # We do need to run this in `.*-rc-pr` because of hotfixes.
@@ -306,14 +307,14 @@
       statuses: write
       contents: write
       pull-requests: write
-    runs-on: [ self-hosted, unit-perf ]
+    runs-on: [ self-hosted, unit-perf-aws-arm ]
     container:
       image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
       credentials:
         username: ${{ github.actor }}
         password: ${{ secrets.GITHUB_TOKEN }}
       # for changed limits, see comments on `options:` earlier in this file
-      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 --ulimit nofile=65536:65536 --security-opt seccomp=unconfined
     strategy:
       fail-fast: false
       matrix:
@@ -986,6 +987,7 @@
       - name: Verify docker-compose example and test extensions
         timeout-minutes: 60
         env:
+          PARALLEL_COMPUTES: 3
          TAG: >-
            ${{
              needs.meta.outputs.run-kind == 'compute-rc-pr'


@@ -1,4 +1,4 @@
-name: Periodic pagebench performance test on unit-perf hetzner runner
+name: Periodic pagebench performance test on unit-perf-aws-arm runners
 
 on:
   schedule:
@@ -40,7 +40,7 @@ jobs:
       statuses: write
       contents: write
      pull-requests: write
-    runs-on: [ self-hosted, unit-perf ]
+    runs-on: [ self-hosted, unit-perf-aws-arm ]
     container:
       image: ghcr.io/neondatabase/build-tools:pinned-bookworm
       credentials:


@@ -1,4 +1,4 @@
-name: Periodic proxy performance test on unit-perf hetzner runner
+name: Periodic proxy performance test on unit-perf-aws-arm runners
 
 on:
   push: # TODO: remove after testing
@@ -32,7 +32,7 @@ jobs:
       statuses: write
       contents: write
       pull-requests: write
-    runs-on: [self-hosted, unit-perf]
+    runs-on: [self-hosted, unit-perf-aws-arm]
     timeout-minutes: 60 # 1h timeout
     container:
       image: ghcr.io/neondatabase/build-tools:pinned-bookworm

.gitignore

@@ -16,6 +16,7 @@ neon.iml
 /integration_tests/.neon
 compaction-suite-results.*
 pgxn/neon/communicator/communicator_bindings.h
+docker-compose/docker-compose-parallel.yml
 
 # Coverage
 *.profraw
@@ -29,3 +30,6 @@ pgxn/neon/communicator/communicator_bindings.h
 
 # pgindent typedef lists
 *.list
+
+# Node
+**/node_modules/

.gitmodules

@@ -1,16 +1,16 @@
 [submodule "vendor/postgres-v14"]
 	path = vendor/postgres-v14
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_14_STABLE_neon
 [submodule "vendor/postgres-v15"]
 	path = vendor/postgres-v15
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_15_STABLE_neon
 [submodule "vendor/postgres-v16"]
 	path = vendor/postgres-v16
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_16_STABLE_neon
 [submodule "vendor/postgres-v17"]
 	path = vendor/postgres-v17
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_17_STABLE_neon

Cargo.lock (generated)

@@ -1427,6 +1427,7 @@ dependencies = [
 "p256 0.13.2",
 "pageserver_page_api",
 "postgres",
+ "postgres-types",
 "postgres_initdb",
 "postgres_versioninfo",
 "regex",
@@ -1950,6 +1951,7 @@ dependencies = [
 "diesel_derives",
 "itoa",
 "serde_json",
+ "uuid",
 ]
 
 [[package]]
@@ -3581,6 +3583,15 @@ dependencies = [
 "wasm-bindgen",
 ]
 
+[[package]]
+name = "json"
+version = "0.1.0"
+dependencies = [
+ "futures",
+ "itoa",
+ "ryu",
+]
+
 [[package]]
 name = "json-structural-diff"
 version = "0.2.0"
@@ -4403,6 +4414,7 @@ dependencies = [
 "pageserver_client",
 "pageserver_client_grpc",
 "pageserver_page_api",
+ "pprof",
 "rand 0.8.5",
 "reqwest",
 "serde",
@@ -4431,6 +4443,7 @@ dependencies = [
 "pageserver_api",
 "postgres_ffi",
 "remote_storage",
+ "serde",
 "serde_json",
 "svg_fmt",
 "thiserror 1.0.69",
@@ -4448,6 +4461,7 @@ dependencies = [
 "arc-swap",
 "async-compression",
 "async-stream",
+ "base64 0.22.1",
 "bincode",
 "bit_field",
 "byteorder",
@@ -4609,30 +4623,18 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "arc-swap",
- "async-trait",
 "bytes",
- "chrono",
 "compute_api",
- "dashmap 5.5.0",
 "futures",
- "http 1.1.0",
- "hyper 1.6.0",
- "hyper-util",
- "metrics",
 "pageserver_api",
 "pageserver_page_api",
- "priority-queue",
- "rand 0.8.5",
- "scopeguard",
- "thiserror 1.0.69",
 "tokio",
 "tokio-stream",
 "tokio-util",
 "tonic 0.13.1",
- "tower 0.4.13",
 "tracing",
 "utils",
- "uuid",
+ "workspace_hack",
 ]
 
 [[package]]
@@ -5248,17 +5250,6 @@ dependencies = [
 "elliptic-curve 0.13.8",
 ]
 
-[[package]]
-name = "priority-queue"
-version = "2.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5676d703dda103cbb035b653a9f11448c0a7216c7926bd35fcb5865475d0c970"
-dependencies = [
- "autocfg",
- "equivalent",
- "indexmap 2.9.0",
-]
-
 [[package]]
 name = "proc-macro2"
 version = "1.0.94"
@@ -5428,6 +5419,7 @@ dependencies = [
 "async-trait",
 "atomic-take",
 "aws-config",
+ "aws-credential-types",
 "aws-sdk-iam",
 "aws-sigv4",
 "base64 0.22.1",
@@ -5467,6 +5459,7 @@ dependencies = [
 "itoa",
 "jose-jwa",
 "jose-jwk",
+ "json",
 "lasso",
 "measured",
 "metrics",
@@ -5892,6 +5885,8 @@ dependencies = [
 "azure_identity",
 "azure_storage",
 "azure_storage_blobs",
+ "base64 0.22.1",
+ "byteorder",
 "bytes",
 "camino",
 "camino-tempfile",
@@ -6383,6 +6378,7 @@ dependencies = [
 "itertools 0.10.5",
 "jsonwebtoken",
 "metrics",
+ "nix 0.30.1",
 "once_cell",
 "pageserver_api",
 "parking_lot 0.12.1",
@@ -6390,6 +6386,7 @@ dependencies = [
 "postgres-protocol",
 "postgres_backend",
 "postgres_ffi",
+ "postgres_ffi_types",
 "postgres_versioninfo",
 "pprof",
 "pq_proto",
@@ -6434,7 +6431,7 @@ dependencies = [
 "anyhow",
 "const_format",
 "pageserver_api",
- "postgres_ffi",
+ "postgres_ffi_types",
 "postgres_versioninfo",
 "pq_proto",
 "serde",
@@ -7113,6 +7110,7 @@ dependencies = [
 "tokio-util",
 "tracing",
 "utils",
+ "uuid",
 "workspace_hack",
 ]
@@ -7176,6 +7174,7 @@ dependencies = [
 "pageserver_api",
 "pageserver_client",
 "reqwest",
+ "safekeeper_api",
 "serde_json",
 "storage_controller_client",
 "tokio",
@@ -7755,6 +7754,7 @@ dependencies = [
 "futures-core",
 "pin-project-lite",
 "tokio",
+ "tokio-util",
 ]
 
 [[package]]
@@ -8408,6 +8408,7 @@ dependencies = [
 "tracing-error",
 "tracing-subscriber",
 "tracing-utils",
+ "uuid",
 "walkdir",
 ]
@@ -8955,8 +8956,10 @@ dependencies = [
 "fail",
 "form_urlencoded",
 "futures-channel",
+ "futures-core",
 "futures-executor",
 "futures-io",
+ "futures-sink",
 "futures-util",
 "generic-array",
 "getrandom 0.2.11",
@@ -9025,7 +9028,6 @@ dependencies = [
 "tracing-log",
 "tracing-subscriber",
 "url",
- "uuid",
 "zeroize",
 "zstd",
 "zstd-safe",


@@ -44,6 +44,7 @@ members = [
     "libs/walproposer",
     "libs/wal_decoder",
     "libs/postgres_initdb",
+    "libs/proxy/json",
     "libs/proxy/postgres-protocol2",
     "libs/proxy/postgres-types2",
     "libs/proxy/tokio-postgres2",
@@ -204,7 +205,7 @@ tokio = { version = "1.43.1", features = ["macros"] }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
 tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
-tokio-stream = "0.1"
+tokio-stream = { version = "0.1", features = ["sync"] }
 tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
 toml = "0.8"
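The "sync" feature added to tokio-stream in the hunk above enables the stream wrappers around tokio::sync primitives, such as WatchStream over a watch channel. A minimal, hypothetical sketch of what that feature provides (not code from this commit; it assumes tokio with its runtime and macros features enabled):

```rust
// Sketch: tokio-stream's "sync" feature gives us WatchStream, a Stream adapter
// over tokio::sync::watch::Receiver.
use tokio_stream::{StreamExt, wrappers::WatchStream};

#[tokio::main]
async fn main() {
    let (tx, rx) = tokio::sync::watch::channel("init");
    let mut stream = WatchStream::new(rx);

    // A WatchStream first yields the value current at the time it is polled...
    assert_eq!(stream.next().await, Some("init"));

    // ...and then yields each subsequent value sent through the watch channel.
    tx.send("updated").unwrap();
    assert_eq!(stream.next().await, Some("updated"));
}
```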


@@ -2,7 +2,7 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
 # Where to install Postgres, default is ./pg_install, maybe useful for package
 # managers.
-POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/
+POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install
 
 # Supported PostgreSQL versions
 POSTGRES_VERSIONS = v17 v16 v15 v14
@@ -14,7 +14,7 @@ POSTGRES_VERSIONS = v17 v16 v15 v14
 # it is derived from BUILD_TYPE.
 # All intermediate build artifacts are stored here.
-BUILD_DIR := build
+BUILD_DIR := $(ROOT_PROJECT_DIR)/build
 
 ICU_PREFIX_DIR := /usr/local/icu
@@ -212,7 +212,7 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
 	FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \
 	INDENT=$(BUILD_DIR)/v17/src/tools/pg_bsd_indent/pg_bsd_indent \
 	PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \
-	-C $(BUILD_DIR)/neon-v17 \
+	-C $(BUILD_DIR)/pgxn-v17/neon \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
@@ -220,6 +220,19 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
 setup-pre-commit-hook:
 	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
 
+build-tools/node_modules: build-tools/package.json
+	cd build-tools && $(if $(CI),npm ci,npm install)
+	touch build-tools/node_modules
+
+.PHONY: lint-openapi-spec
+lint-openapi-spec: build-tools/node_modules
+	# operation-2xx-response: pageserver timeline delete returns 404 on success
+	find . -iname "openapi_spec.y*ml" -exec\
+	 npx --prefix=build-tools/ redocly\
+	 --skip-rule=operation-operationId --skip-rule=operation-summary --extends=minimal\
+	 --skip-rule=no-server-example.com --skip-rule=operation-2xx-response\
+	 lint {} \+
+
 # Targets for building PostgreSQL are defined in postgres.mk.
 #
 # But if the caller has indicated that PostgreSQL is already


@@ -35,7 +35,7 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
     echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
     echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
 
-COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
+COPY build-tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
 
 RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
     set -e && \
@@ -188,6 +188,12 @@ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
     && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
     && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 
+# Install node
+ENV NODE_VERSION=24
+RUN curl -fsSL https://deb.nodesource.com/setup_${NODE_VERSION}.x | bash - \
+    && apt install -y nodejs \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
 # Install docker
 RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
     && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION} stable" > /etc/apt/sources.list.d/docker.list \
@@ -311,14 +317,14 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
     . "$HOME/.cargo/env" && \
     cargo --version && rustup --version && \
     rustup component add llvm-tools rustfmt clippy && \
-    cargo install rustfilt --version ${RUSTFILT_VERSION} --locked && \
-    cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} --locked && \
-    cargo install cargo-deny --version ${CARGO_DENY_VERSION} --locked && \
-    cargo install cargo-hack --version ${CARGO_HACK_VERSION} --locked && \
-    cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} --locked && \
-    cargo install cargo-chef --version ${CARGO_CHEF_VERSION} --locked && \
-    cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} --locked \
+    cargo install rustfilt --locked --version ${RUSTFILT_VERSION} && \
+    cargo install cargo-hakari --locked --version ${CARGO_HAKARI_VERSION} && \
+    cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
+    cargo install cargo-hack --locked --version ${CARGO_HACK_VERSION} && \
+    cargo install cargo-nextest --locked --version ${CARGO_NEXTEST_VERSION} && \
+    cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \
+    cargo install diesel_cli --locked --version ${CARGO_DIESEL_CLI_VERSION} \
     --features postgres-bundled --no-default-features && \
     rm -rf /home/nonroot/.cargo/registry && \
     rm -rf /home/nonroot/.cargo/git

build-tools/package-lock.json (new generated file; diff suppressed because it is too large)

build-tools/package.json (new file)

@@ -0,0 +1,8 @@
{
"name": "build-tools",
"private": true,
"devDependencies": {
"@redocly/cli": "1.34.4",
"@sourcemeta/jsonschema": "10.0.0"
}
}


@@ -1,9 +1,12 @@
 disallowed-methods = [
     "tokio::task::block_in_place",
     # Allow this for now, to deny it later once we stop using Handle::block_on completely
     # "tokio::runtime::Handle::block_on",
-    # use tokio_epoll_uring_ext instead
-    "tokio_epoll_uring::thread_local_system",
+
+    # tokio-epoll-uring:
+    # - allow-invalid because the method doesn't exist on macOS
+    { path = "tokio_epoll_uring::thread_local_system", replacement = "tokio_epoll_uring_ext module inside pageserver crate", allow-invalid = true }
 ]
 
 disallowed-macros = [


@@ -50,9 +50,9 @@ jsonnetfmt-format:
 	jsonnetfmt --in-place $(jsonnet_files)
 
 .PHONY: manifest-schema-validation
-manifest-schema-validation: node_modules
-	node_modules/.bin/jsonschema validate -d https://json-schema.org/draft/2020-12/schema manifest.schema.json manifest.yaml
+manifest-schema-validation: ../build-tools/node_modules
+	npx --prefix=../build-tools/ jsonschema validate -d https://json-schema.org/draft/2020-12/schema manifest.schema.json manifest.yaml
 
-node_modules: package.json
-	npm install
-	touch node_modules
+../build-tools/node_modules: ../build-tools/package.json
+	cd ../build-tools && $(if $(CI),npm ci,npm install)
+	touch ../build-tools/node_modules


@@ -9,7 +9,7 @@
 #
 # build-tools:  This contains Rust compiler toolchain and other tools needed at compile
 #               time. This is also used for the storage builds. This image is defined in
-#               build-tools.Dockerfile.
+#               build-tools/Dockerfile.
 #
 # build-deps:   Contains C compiler, other build tools, and compile-time dependencies
 #               needed to compile PostgreSQL and most extensions. (Some extensions need
@@ -115,7 +115,7 @@ ARG EXTENSIONS=all
 FROM $BASE_IMAGE_SHA AS build-deps
 ARG DEBIAN_VERSION
 
-# Keep in sync with build-tools.Dockerfile
+# Keep in sync with build-tools/Dockerfile
 ENV PROTOC_VERSION=25.1
 
 # Use strict mode for bash to catch errors early
@@ -170,7 +170,29 @@ RUN case $DEBIAN_VERSION in \
 FROM build-deps AS pg-build
 ARG PG_VERSION
 COPY vendor/postgres-${PG_VERSION:?} postgres
+COPY compute/patches/postgres_fdw.patch .
+COPY compute/patches/pg_stat_statements_pg14-16.patch .
+COPY compute/patches/pg_stat_statements_pg17.patch .
 RUN cd postgres && \
+    # Apply patches to some contrib extensions
+    # For example, we need to grant EXECUTE on pg_stat_statements_reset() to {privileged_role_name}.
+    # In vanilla Postgres this function is limited to Postgres role superuser.
+    # In Neon we have {privileged_role_name} role that is not a superuser but replaces superuser in some cases.
+    # We could add the additional grant statements to the Postgres repository but it would be hard to maintain,
+    # whenever we need to pick up a new Postgres version and we want to limit the changes in our Postgres fork,
+    # so we do it here.
+    case "${PG_VERSION}" in \
+        "v14" | "v15" | "v16") \
+            patch -p1 < /pg_stat_statements_pg14-16.patch; \
+        ;; \
+        "v17") \
+            patch -p1 < /pg_stat_statements_pg17.patch; \
+        ;; \
+        *) \
+            # To do not forget to migrate patches to the next major version
+            echo "No contrib patches for this PostgreSQL version" && exit 1;; \
+    esac && \
+    patch -p1 < /postgres_fdw.patch && \
     export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \
         --with-icu --with-libxml --with-libxslt --with-lz4" && \
     if [ "${PG_VERSION:?}" != "v14" ]; then \
@@ -184,8 +206,6 @@ RUN cd postgres && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgres_fdw.control && \
-    file=/usr/local/pgsql/share/extension/postgres_fdw--1.0.sql && [ -e $file ] && \
-    echo 'GRANT USAGE ON FOREIGN DATA WRAPPER postgres_fdw TO neon_superuser;' >> $file && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
@@ -195,34 +215,7 @@ RUN cd postgres && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
-    # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
-    # In vanilla postgres this function is limited to Postgres role superuser.
-    # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
-    # We could add the additional grant statements to the postgres repository but it would be hard to maintain,
-    # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
-    # so we do it here.
-    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
-        filename=$(basename "$file"); \
-        # Note that there are no downgrade scripts for pg_stat_statements, so we \
-        # don't have to modify any downgrade paths or (much) older versions: we only \
-        # have to make sure every creation of the pg_stat_statements_reset function \
-        # also adds execute permissions to the neon_superuser. \
-        case $filename in \
-            pg_stat_statements--1.4.sql) \
-                # pg_stat_statements_reset is first created with 1.4
-                echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
-                ;; \
-            pg_stat_statements--1.6--1.7.sql) \
-                # Then with the 1.6-1.7 migration it is re-created with a new signature, thus add the permissions back
-                echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
-                ;; \
-            pg_stat_statements--1.10--1.11.sql) \
-                # Then with the 1.10-1.11 migration it is re-created with a new signature again, thus add the permissions back
-                echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO neon_superuser;' >> $file; \
-                ;; \
-        esac; \
-    done;
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
 
 # Set PATH for all the subsequent build steps
 ENV PATH="/usr/local/pgsql/bin:$PATH"
@@ -1524,7 +1517,7 @@ WORKDIR /ext-src
 COPY compute/patches/pg_duckdb_v031.patch .
 COPY compute/patches/duckdb_v120.patch .
 # pg_duckdb build requires source dir to be a git repo to get submodules
-# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
+# allow {privileged_role_name} to execute some functions that in pg_duckdb are available to superuser only:
 # - extension management function duckdb.install_extension()
 # - access to duckdb.extensions table and its sequence
 RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
@@ -1790,7 +1783,7 @@ RUN set -e \
 #########################################################################################
 FROM build-deps AS exporters
 ARG TARGETARCH
-# Keep sql_exporter version same as in build-tools.Dockerfile and
+# Keep sql_exporter version same as in build-tools/Dockerfile and
 # test_runner/regress/test_compute_metrics.py
 # See comment on the top of the file regading `echo`, `-e` and `\n`
 RUN if [ "$TARGETARCH" = "amd64" ]; then\
@@ -1915,10 +1908,10 @@ RUN cd /ext-src/pg_repack-src && patch -p1 </ext-src/pg_repack.patch && rm -f /e
 COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
 RUN echo /usr/local/pgsql/lib > /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig
-RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq \
+RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq parallel \
     && apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/*
 ENV PATH=/usr/local/pgsql/bin:$PATH
-ENV PGHOST=compute
+ENV PGHOST=compute1
 ENV PGPORT=55433
 ENV PGUSER=cloud_admin
 ENV PGDATABASE=postgres


@@ -1,7 +0,0 @@
{
"name": "neon-compute",
"private": true,
"dependencies": {
"@sourcemeta/jsonschema": "9.3.4"
}
}


@@ -1,22 +1,26 @@
 diff --git a/sql/anon.sql b/sql/anon.sql
-index 0cdc769..b450327 100644
+index 0cdc769..5eab1d6 100644
 --- a/sql/anon.sql
 +++ b/sql/anon.sql
-@@ -1141,3 +1141,15 @@ $$
+@@ -1141,3 +1141,19 @@ $$
  -- TODO : https://en.wikipedia.org/wiki/L-diversity
  -- TODO : https://en.wikipedia.org/wiki/T-closeness
 +
 +-- NEON Patches
 +
-+GRANT ALL ON SCHEMA anon to neon_superuser;
-+GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser;
-+
 +DO $$
++DECLARE
++  privileged_role_name text;
 +BEGIN
-+  IF current_setting('server_version_num')::int >= 150000 THEN
-+    GRANT SET ON PARAMETER anon.transparent_dynamic_masking TO neon_superuser;
-+  END IF;
++  privileged_role_name := current_setting('neon.privileged_role_name');
++
++  EXECUTE format('GRANT ALL ON SCHEMA anon to %I', privileged_role_name);
++  EXECUTE format('GRANT ALL ON ALL TABLES IN SCHEMA anon TO %I', privileged_role_name);
++
++  IF current_setting('server_version_num')::int >= 150000 THEN
++    EXECUTE format('GRANT SET ON PARAMETER anon.transparent_dynamic_masking TO %I', privileged_role_name);
++  END IF;
 +END $$;
 diff --git a/sql/init.sql b/sql/init.sql
 index 7da6553..9b6164b 100644


@@ -21,13 +21,21 @@ index 3235cc8..6b892bc 100644
  include Makefile.global
 
 diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql
-index d777d76..af60106 100644
+index d777d76..3b54396 100644
 --- a/sql/pg_duckdb--0.2.0--0.3.0.sql
 +++ b/sql/pg_duckdb--0.2.0--0.3.0.sql
-@@ -1056,3 +1056,6 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC;
+@@ -1056,3 +1056,14 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC;
  GRANT ALL ON FUNCTION duckdb.cache_info() TO PUBLIC;
  GRANT ALL ON FUNCTION duckdb.cache_delete(TEXT) TO PUBLIC;
  GRANT ALL ON PROCEDURE duckdb.recycle_ddb() TO PUBLIC;
-+GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser;
-+GRANT ALL ON TABLE duckdb.extensions TO neon_superuser;
-+GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO neon_superuser;
++
++DO $$
++DECLARE
++  privileged_role_name text;
++BEGIN
++  privileged_role_name := current_setting('neon.privileged_role_name');
++
++  EXECUTE format('GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO %I', privileged_role_name);
++  EXECUTE format('GRANT ALL ON TABLE duckdb.extensions TO %I', privileged_role_name);
++  EXECUTE format('GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO %I', privileged_role_name);
++END $$;


@@ -0,0 +1,34 @@
diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
index 58cdf600fce..8be57a996f6 100644
--- a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
+++ b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
@@ -46,3 +46,12 @@ GRANT SELECT ON pg_stat_statements TO PUBLIC;
-- Don't want this to be available to non-superusers.
REVOKE ALL ON FUNCTION pg_stat_statements_reset() FROM PUBLIC;
+
+DO $$
+DECLARE
+ privileged_role_name text;
+BEGIN
+ privileged_role_name := current_setting('neon.privileged_role_name');
+
+ EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO %I', privileged_role_name);
+END $$;
diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
index 6fc3fed4c93..256345a8f79 100644
--- a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
+++ b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
@@ -20,3 +20,12 @@ LANGUAGE C STRICT PARALLEL SAFE;
-- Don't want this to be available to non-superusers.
REVOKE ALL ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) FROM PUBLIC;
+
+DO $$
+DECLARE
+ privileged_role_name text;
+BEGIN
+ privileged_role_name := current_setting('neon.privileged_role_name');
+
+ EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO %I', privileged_role_name);
+END $$;


@@ -0,0 +1,52 @@
diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql b/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql
index 0bb2c397711..32764db1d8b 100644
--- a/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql
+++ b/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql
@@ -80,3 +80,12 @@ LANGUAGE C STRICT PARALLEL SAFE;
-- Don't want this to be available to non-superusers.
REVOKE ALL ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) FROM PUBLIC;
+
+DO $$
+DECLARE
+ privileged_role_name text;
+BEGIN
+ privileged_role_name := current_setting('neon.privileged_role_name');
+
+ EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO %I', privileged_role_name);
+END $$;
\ No newline at end of file
diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
index 58cdf600fce..8be57a996f6 100644
--- a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
+++ b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
@@ -46,3 +46,12 @@ GRANT SELECT ON pg_stat_statements TO PUBLIC;
-- Don't want this to be available to non-superusers.
REVOKE ALL ON FUNCTION pg_stat_statements_reset() FROM PUBLIC;
+
+DO $$
+DECLARE
+ privileged_role_name text;
+BEGIN
+ privileged_role_name := current_setting('neon.privileged_role_name');
+
+ EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO %I', privileged_role_name);
+END $$;
diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
index 6fc3fed4c93..256345a8f79 100644
--- a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
+++ b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
@@ -20,3 +20,12 @@ LANGUAGE C STRICT PARALLEL SAFE;
-- Don't want this to be available to non-superusers.
REVOKE ALL ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) FROM PUBLIC;
+
+DO $$
+DECLARE
+ privileged_role_name text;
+BEGIN
+ privileged_role_name := current_setting('neon.privileged_role_name');
+
+ EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO %I', privileged_role_name);
+END $$;


@@ -0,0 +1,17 @@
diff --git a/contrib/postgres_fdw/postgres_fdw--1.0.sql b/contrib/postgres_fdw/postgres_fdw--1.0.sql
index a0f0fc1bf45..ee077f2eea6 100644
--- a/contrib/postgres_fdw/postgres_fdw--1.0.sql
+++ b/contrib/postgres_fdw/postgres_fdw--1.0.sql
@@ -16,3 +16,12 @@ LANGUAGE C STRICT;
CREATE FOREIGN DATA WRAPPER postgres_fdw
HANDLER postgres_fdw_handler
VALIDATOR postgres_fdw_validator;
+
+DO $$
+DECLARE
+ privileged_role_name text;
+BEGIN
+ privileged_role_name := current_setting('neon.privileged_role_name');
+
+ EXECUTE format('GRANT USAGE ON FOREIGN DATA WRAPPER postgres_fdw TO %I', privileged_role_name);
+END $$;


@@ -66,7 +66,7 @@ url.workspace = true
 uuid.workspace = true
 walkdir.workspace = true
 x509-cert.workspace = true
+postgres-types.workspace = true
 postgres_versioninfo.workspace = true
 postgres_initdb.workspace = true
 compute_api.workspace = true


@@ -46,11 +46,14 @@ stateDiagram-v2
     Configuration --> Failed : Failed to configure the compute
     Configuration --> Running : Compute has been configured
     Empty --> Init : Compute spec is immediately available
-    Empty --> TerminationPending : Requested termination
+    Empty --> TerminationPendingFast : Requested termination
+    Empty --> TerminationPendingImmediate : Requested termination
     Init --> Failed : Failed to start Postgres
     Init --> Running : Started Postgres
-    Running --> TerminationPending : Requested termination
-    TerminationPending --> Terminated : Terminated compute
+    Running --> TerminationPendingFast : Requested termination
+    Running --> TerminationPendingImmediate : Requested termination
+    TerminationPendingFast --> Terminated : Terminated compute with 30s delay for cplane to inspect status
+    TerminationPendingImmediate --> Terminated : Terminated compute immediately
     Failed --> [*] : Compute exited
     Terminated --> [*] : Compute exited
 ```


@@ -87,6 +87,14 @@ struct Cli {
     #[arg(short = 'C', long, value_name = "DATABASE_URL")]
     pub connstr: String,
 
+    #[arg(
+        long,
+        default_value = "neon_superuser",
+        value_name = "PRIVILEGED_ROLE_NAME",
+        value_parser = Self::parse_privileged_role_name
+    )]
+    pub privileged_role_name: String,
+
     #[cfg(target_os = "linux")]
     #[arg(long, default_value = "neon-postgres")]
     pub cgroup: String,
@@ -149,6 +157,21 @@ impl Cli {
         Ok(url)
     }
+
+    /// For simplicity, we do not escape `privileged_role_name` anywhere in the code.
+    /// Since it's a system role, which we fully control, that's fine. Still, let's
+    /// validate it to avoid any surprises.
+    fn parse_privileged_role_name(value: &str) -> Result<String> {
+        use regex::Regex;
+
+        let pattern = Regex::new(r"^[a-z_]+$").unwrap();
+
+        if !pattern.is_match(value) {
+            bail!("--privileged-role-name can only contain lowercase letters and underscores")
+        }
+
+        Ok(value.to_string())
+    }
 }
 
 fn main() -> Result<()> {
@@ -178,6 +201,7 @@ fn main() -> Result<()> {
         ComputeNodeParams {
             compute_id: cli.compute_id,
             connstr,
+            privileged_role_name: cli.privileged_role_name.clone(),
             pgdata: cli.pgdata.clone(),
             pgbin: cli.pgbin.clone(),
             pgversion: get_pg_version_string(&cli.pgbin),
@@ -327,4 +351,49 @@
         ])
         .expect_err("URL parameters are not allowed");
     }
+
+    #[test]
+    fn verify_privileged_role_name() {
+        // Valid name
+        let cli = Cli::parse_from([
+            "compute_ctl",
+            "--pgdata=test",
+            "--connstr=test",
+            "--compute-id=test",
+            "--privileged-role-name",
+            "my_superuser",
+        ]);
+        assert_eq!(cli.privileged_role_name, "my_superuser");
+
+        // Invalid names
+        Cli::try_parse_from([
+            "compute_ctl",
+            "--pgdata=test",
+            "--connstr=test",
+            "--compute-id=test",
+            "--privileged-role-name",
+            "NeonSuperuser",
+        ])
+        .expect_err("uppercase letters are not allowed");
+
+        Cli::try_parse_from([
+            "compute_ctl",
+            "--pgdata=test",
+            "--connstr=test",
+            "--compute-id=test",
+            "--privileged-role-name",
+            "$'neon_superuser",
+        ])
+        .expect_err("special characters are not allowed");
+
+        Cli::try_parse_from([
+            "compute_ctl",
+            "--pgdata=test",
+            "--connstr=test",
+            "--compute-id=test",
+            "--privileged-role-name",
+            "",
+        ])
+        .expect_err("empty name is not allowed");
+    }
 }
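As a standalone illustration of the validation rule introduced above for --privileged-role-name, the following sketch isolates the same regex check outside of clap; the function and test values here are hypothetical and only mirror the behaviour shown in the diff (it assumes the regex crate):

```rust
// Sketch: the role name may contain only lowercase ASCII letters and underscores.
use regex::Regex;

fn is_valid_privileged_role_name(value: &str) -> bool {
    // `^[a-z_]+$` also rejects the empty string, since `+` requires at least one character.
    Regex::new(r"^[a-z_]+$").unwrap().is_match(value)
}

fn main() {
    assert!(is_valid_privileged_role_name("neon_superuser"));
    assert!(!is_valid_privileged_role_name("NeonSuperuser")); // uppercase rejected
    assert!(!is_valid_privileged_role_name("$'neon_superuser")); // special characters rejected
    assert!(!is_valid_privileged_role_name("")); // empty rejected
}
```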


@@ -1,13 +1,13 @@
-use anyhow::{Context, Result, anyhow};
+use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
 use compute_api::responses::{
     ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
-    LfcPrewarmState, TlsConfig,
+    LfcPrewarmState, PromoteState, TlsConfig,
 };
 use compute_api::spec::{
     ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverConnectionInfo,
-    PageserverShardConnectionInfo, PgIdent,
+    PageserverProtocol, PageserverShardConnectionInfo, PageserverShardInfo, PgIdent,
 };
 use futures::StreamExt;
 use futures::future::join_all;
@@ -30,8 +30,7 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
 use std::sync::{Arc, Condvar, Mutex, RwLock};
 use std::time::{Duration, Instant};
 use std::{env, fs};
-use tokio::task::JoinHandle;
-use tokio::{spawn, time};
+use tokio::{spawn, sync::watch, task::JoinHandle, time};
 use tracing::{Instrument, debug, error, info, instrument, warn};
 use url::Url;
 use utils::id::{TenantId, TimelineId};
@@ -76,12 +75,20 @@ const DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL: u64 = 3600;
 /// Static configuration params that don't change after startup. These mostly
 /// come from the CLI args, or are derived from them.
+#[derive(Clone, Debug)]
 pub struct ComputeNodeParams {
     /// The ID of the compute
     pub compute_id: String,
 
-    // Url type maintains proper escaping
+    /// Url type maintains proper escaping
     pub connstr: url::Url,
 
+    /// The name of the 'weak' superuser role, which we give to the users.
+    /// It follows the allow list approach, i.e., we take a standard role
+    /// and grant it extra permissions with explicit GRANTs here and there,
+    /// and core patches.
+    pub privileged_role_name: String,
+
     pub resize_swap_on_bind: bool,
     pub set_disk_quota_for_fs: Option<String>,
@@ -176,6 +183,7 @@ pub struct ComputeState {
     /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
     /// mode == ComputeMode::Primary. None otherwise
     pub terminate_flush_lsn: Option<Lsn>,
+    pub promote_state: Option<watch::Receiver<PromoteState>>,
     pub metrics: ComputeMetrics,
 }
@@ -193,6 +201,7 @@ impl ComputeState {
             lfc_prewarm_state: LfcPrewarmState::default(),
             lfc_offload_state: LfcOffloadState::default(),
             terminate_flush_lsn: None,
+            promote_state: None,
         }
     }
@@ -272,53 +281,114 @@ impl ParsedSpec {
} }
} }
fn extract_pageserver_conninfo_from_guc( /// Extract PageserverConnectionInfo from a comma-separated list of libpq connection strings.
pageserver_connstring_guc: &str, ///
) -> PageserverConnectionInfo { /// This is used for backwards-compatilibity, to parse the legacye `pageserver_connstr`
PageserverConnectionInfo { /// field in the compute spec, or the 'neon.pageserver_connstring' GUC. Nowadays, the
shards: pageserver_connstring_guc /// 'pageserver_connection_info' field should be used instead.
.split(',') fn extract_pageserver_conninfo_from_connstr(
.enumerate() connstr: &str,
.map(|(i, connstr)| { stripe_size: Option<u32>,
( ) -> Result<PageserverConnectionInfo, anyhow::Error> {
i as u32, let shard_infos: Vec<_> = connstr
PageserverShardConnectionInfo { .split(',')
libpq_url: Some(connstr.to_string()), .map(|connstr| PageserverShardInfo {
grpc_url: None, pageservers: vec![PageserverShardConnectionInfo {
}, id: None,
) libpq_url: Some(connstr.to_string()),
grpc_url: None,
}],
})
.collect();
match shard_infos.len() {
0 => anyhow::bail!("empty connection string"),
1 => {
// We assume that if there's only connection string, it means "unsharded",
// rather than a sharded system with just a single shard. The latter is
// possible in principle, but we never do it.
let shard_count = ShardCount::unsharded();
let only_shard = shard_infos.first().unwrap().clone();
let shards = vec![(ShardIndex::unsharded(), only_shard)];
Ok(PageserverConnectionInfo {
shard_count,
stripe_size: None,
shards: shards.into_iter().collect(),
prefer_protocol: PageserverProtocol::Libpq,
}) })
.collect(), }
prefer_grpc: false, n => {
if stripe_size.is_none() {
anyhow::bail!("{n} shards but no stripe_size");
}
let shard_count = ShardCount(n.try_into()?);
let shards = shard_infos
.into_iter()
.enumerate()
.map(|(idx, shard_info)| {
(
ShardIndex {
shard_count,
shard_number: ShardNumber(
idx.try_into().expect("shard number fits in u8"),
),
},
shard_info,
)
})
.collect();
Ok(PageserverConnectionInfo {
shard_count,
stripe_size,
shards,
prefer_protocol: PageserverProtocol::Libpq,
})
}
} }
} }
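
For illustration, a standalone sketch of the mapping implemented above, using simplified stand-in types (a (shard_number, shard_count) tuple instead of the real ShardIndex): a single legacy connection string is treated as unsharded, while a comma-separated list becomes one entry per shard.

use std::collections::HashMap;

// Simplified model of the shard map above; the real code builds
// PageserverConnectionInfo keyed by ShardIndex.
fn legacy_connstr_to_shards(connstr: &str) -> Result<HashMap<(u8, u8), String>, String> {
    let urls: Vec<&str> = connstr.split(',').collect();
    match urls.len() {
        0 => Err("empty connection string".to_string()),
        // A single string means "unsharded" (shard_count 0), not a 1-shard system.
        1 => Ok(HashMap::from([((0u8, 0u8), urls[0].to_string())])),
        n => Ok(urls
            .iter()
            .enumerate()
            .map(|(i, url)| ((i as u8, n as u8), url.to_string()))
            .collect()),
    }
}

fn main() {
    let shards = legacy_connstr_to_shards("postgresql://ps1:6400,postgresql://ps2:6400").unwrap();
    assert_eq!(shards.len(), 2);
    assert!(shards.contains_key(&(1, 2))); // shard 1 of 2
}
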
impl TryFrom<ComputeSpec> for ParsedSpec { impl TryFrom<ComputeSpec> for ParsedSpec {
type Error = String; type Error = anyhow::Error;
fn try_from(spec: ComputeSpec) -> Result<Self, String> { fn try_from(spec: ComputeSpec) -> Result<Self, anyhow::Error> {
// Extract the options from the spec file that are needed to connect to // Extract the options from the spec file that are needed to connect to
// the storage system. // the storage system.
// //
// For backwards-compatibility, the top-level fields in the spec file // In compute specs generated by old control plane versions, the spec file might
// may be empty. In that case, we need to dig them from the GUCs in the // be missing the `pageserver_connection_info` field. In that case, we need to dig
// cluster.settings field. // the pageserver connection info from the `pageserver_connstr` field instead, or
let pageserver_conninfo = match &spec.pageserver_connection_info { // if that's missing too, from the GUC in the cluster.settings field.
Some(x) => x.clone(), let mut pageserver_conninfo = spec.pageserver_connection_info.clone();
None => { if pageserver_conninfo.is_none() {
if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") { if let Some(pageserver_connstr_field) = &spec.pageserver_connstring {
extract_pageserver_conninfo_from_guc(&guc) pageserver_conninfo = Some(extract_pageserver_conninfo_from_connstr(
} else { pageserver_connstr_field,
return Err("pageserver connstr should be provided".to_string()); spec.shard_stripe_size,
} )?);
} }
}; }
if pageserver_conninfo.is_none() {
if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") {
let stripe_size = if let Some(guc) = spec.cluster.settings.find("neon.stripe_size")
{
Some(u32::from_str(&guc)?)
} else {
None
};
pageserver_conninfo =
Some(extract_pageserver_conninfo_from_connstr(&guc, stripe_size)?);
}
}
let pageserver_conninfo = pageserver_conninfo.ok_or(anyhow::anyhow!(
"pageserver connection information should be provided"
))?;
// Similarly for safekeeper connection strings
let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() { let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
if matches!(spec.mode, ComputeMode::Primary) { if matches!(spec.mode, ComputeMode::Primary) {
spec.cluster spec.cluster
.settings .settings
.find("neon.safekeepers") .find("neon.safekeepers")
.ok_or("safekeeper connstrings should be provided")? .ok_or(anyhow::anyhow!("safekeeper connstrings should be provided"))?
.split(',') .split(',')
.map(|str| str.to_string()) .map(|str| str.to_string())
.collect() .collect()
@@ -333,22 +403,22 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id { let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
tenant_id tenant_id
} else { } else {
spec.cluster let guc = spec
.cluster
.settings .settings
.find("neon.tenant_id") .find("neon.tenant_id")
.ok_or("tenant id should be provided") .ok_or(anyhow::anyhow!("tenant id should be provided"))?;
.map(|s| TenantId::from_str(&s))? TenantId::from_str(&guc).context("invalid tenant id")?
.or(Err("invalid tenant id"))?
}; };
let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id { let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id {
timeline_id timeline_id
} else { } else {
spec.cluster let guc = spec
.cluster
.settings .settings
.find("neon.timeline_id") .find("neon.timeline_id")
.ok_or("timeline id should be provided") .ok_or(anyhow::anyhow!("timeline id should be provided"))?;
.map(|s| TimelineId::from_str(&s))? TimelineId::from_str(&guc).context(anyhow::anyhow!("invalid timeline id"))?
.or(Err("invalid timeline id"))?
}; };
let endpoint_storage_addr: Option<String> = spec let endpoint_storage_addr: Option<String> = spec
@@ -372,7 +442,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
}; };
// Now check validity of the parsed specification // Now check validity of the parsed specification
res.validate()?; res.validate().map_err(anyhow::Error::msg)?;
Ok(res) Ok(res)
} }
} }
@@ -452,7 +522,7 @@ impl ComputeNode {
let mut new_state = ComputeState::new(); let mut new_state = ComputeState::new();
if let Some(spec) = config.spec { if let Some(spec) = config.spec {
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow!(msg))?; let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
new_state.pspec = Some(pspec); new_state.pspec = Some(pspec);
} }
@@ -983,14 +1053,20 @@ impl ComputeNode {
None None
}; };
let mut delay_exit = false;
let mut state = self.state.lock().unwrap(); let mut state = self.state.lock().unwrap();
state.terminate_flush_lsn = lsn; state.terminate_flush_lsn = lsn;
if let ComputeStatus::TerminationPending { mode } = state.status {
let delay_exit = state.status == ComputeStatus::TerminationPendingFast;
if state.status == ComputeStatus::TerminationPendingFast
|| state.status == ComputeStatus::TerminationPendingImmediate
{
info!(
"Changing compute status from {} to {}",
state.status,
ComputeStatus::Terminated
);
state.status = ComputeStatus::Terminated; state.status = ComputeStatus::Terminated;
self.state_changed.notify_all(); self.state_changed.notify_all();
// we were asked to terminate gracefully, don't exit to avoid restart
delay_exit = mode == compute_api::responses::TerminateMode::Fast
} }
drop(state); drop(state);
@@ -1054,12 +1130,13 @@ impl ComputeNode {
let spec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = compute_state.pspec.as_ref().expect("spec must be set");
let started = Instant::now(); let started = Instant::now();
let (connected, size) = if spec.pageserver_conninfo.prefer_grpc { let (connected, size) = match spec.pageserver_conninfo.prefer_protocol {
self.try_get_basebackup_grpc(spec, lsn)? PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
} else { PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
self.try_get_basebackup_libpq(spec, lsn)?
}; };
self.fix_zenith_signal_neon_signal()?;
let mut state = self.state.lock().unwrap(); let mut state = self.state.lock().unwrap();
state.metrics.pageserver_connect_micros = state.metrics.pageserver_connect_micros =
connected.duration_since(started).as_micros() as u64; connected.duration_since(started).as_micros() as u64;
@@ -1069,27 +1146,56 @@ impl ComputeNode {
Ok(()) Ok(())
} }
/// Copy the Zenith signal file to the Neon signal file location.
/// This makes Compute compatible with older PageServers that don't yet
/// know about the Zenith->Neon rename.
fn fix_zenith_signal_neon_signal(&self) -> Result<()> {
let datadir = Path::new(&self.params.pgdata);
let neonsig = datadir.join("neon.signal");
if neonsig.is_file() {
return Ok(());
}
let zenithsig = datadir.join("zenith.signal");
if zenithsig.is_file() {
fs::copy(zenithsig, neonsig)?;
}
Ok(())
}
/// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when /// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
/// the connection was established, and the (compressed) size of the basebackup. /// the connection was established, and the (compressed) size of the basebackup.
fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> { fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
let shard0_index = ShardIndex {
shard_number: ShardNumber(0),
shard_count: spec.pageserver_conninfo.shard_count,
};
let shard0 = spec let shard0 = spec
.pageserver_conninfo .pageserver_conninfo
.shards .shards
.get(&0) .get(&shard0_index)
.expect("shard 0 connection info missing"); .ok_or_else(|| {
let shard0_url = shard0.grpc_url.clone().expect("no grpc_url for shard 0"); anyhow::anyhow!("shard connection info missing for shard {}", shard0_index)
})?;
let shard_index = match spec.pageserver_conninfo.shards.len() as u8 { let pageserver = shard0
0 | 1 => ShardIndex::unsharded(), .pageservers
count => ShardIndex::new(ShardNumber(0), ShardCount(count)), .first()
}; .expect("must have at least one pageserver");
let shard0_url = pageserver
.grpc_url
.clone()
.expect("no grpc_url for shard 0");
let (reader, connected) = tokio::runtime::Handle::current().block_on(async move { let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
let mut client = page_api::Client::connect( let mut client = page_api::Client::connect(
shard0_url, shard0_url,
spec.tenant_id, spec.tenant_id,
spec.timeline_id, spec.timeline_id,
shard_index, shard0_index,
spec.storage_auth_token.clone(), spec.storage_auth_token.clone(),
None, // NB: base backups use payload compression None, // NB: base backups use payload compression
) )
@@ -1121,12 +1227,25 @@ impl ComputeNode {
/// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp /// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp
/// when the connection was established, and the (compressed) size of the basebackup. /// when the connection was established, and the (compressed) size of the basebackup.
fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> { fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
let shard0_index = ShardIndex {
shard_number: ShardNumber(0),
shard_count: spec.pageserver_conninfo.shard_count,
};
let shard0 = spec let shard0 = spec
.pageserver_conninfo .pageserver_conninfo
.shards .shards
.get(&0) .get(&shard0_index)
.expect("shard 0 connection info missing"); .ok_or_else(|| {
let shard0_connstr = shard0.libpq_url.clone().expect("no libpq_url for shard 0"); anyhow::anyhow!("shard connection info missing for shard {}", shard0_index)
})?;
let pageserver = shard0
.pageservers
.first()
.expect("must have at least one pageserver");
let shard0_connstr = pageserver
.libpq_url
.clone()
.expect("no libpq_url for shard 0");
let mut config = postgres::Config::from_str(&shard0_connstr)?; let mut config = postgres::Config::from_str(&shard0_connstr)?;
// Use the storage auth token from the config file, if given. // Use the storage auth token from the config file, if given.
@@ -1286,9 +1405,7 @@ impl ComputeNode {
// In case of error, log and fail the check, but don't crash. // In case of error, log and fail the check, but don't crash.
// We're playing it safe because these errors could be transient // We're playing it safe because these errors could be transient
// and we don't yet retry. Also being careful here allows us to // and we don't yet retry.
// be backwards compatible with safekeepers that don't have the
// TIMELINE_STATUS API yet.
if responses.len() < quorum { if responses.len() < quorum {
error!( error!(
"failed sync safekeepers check {:?} {:?} {:?}", "failed sync safekeepers check {:?} {:?} {:?}",
@@ -1391,6 +1508,7 @@ impl ComputeNode {
self.create_pgdata()?; self.create_pgdata()?;
config::write_postgres_conf( config::write_postgres_conf(
pgdata_path, pgdata_path,
&self.params,
&pspec.spec, &pspec.spec,
self.params.internal_http_port, self.params.internal_http_port,
tls_config, tls_config,
@@ -1731,6 +1849,7 @@ impl ComputeNode {
} }
// Run migrations separately to not hold up cold starts // Run migrations separately to not hold up cold starts
let params = self.params.clone();
tokio::spawn(async move { tokio::spawn(async move {
let mut conf = conf.as_ref().clone(); let mut conf = conf.as_ref().clone();
conf.application_name("compute_ctl:migrations"); conf.application_name("compute_ctl:migrations");
@@ -1742,7 +1861,7 @@ impl ComputeNode {
eprintln!("connection error: {e}"); eprintln!("connection error: {e}");
} }
}); });
if let Err(e) = handle_migrations(&mut client).await { if let Err(e) = handle_migrations(params, &mut client).await {
error!("Failed to run migrations: {}", e); error!("Failed to run migrations: {}", e);
} }
} }
@@ -1821,11 +1940,14 @@ impl ComputeNode {
let pgdata_path = Path::new(&self.params.pgdata); let pgdata_path = Path::new(&self.params.pgdata);
config::write_postgres_conf( config::write_postgres_conf(
pgdata_path, pgdata_path,
&self.params,
&spec, &spec,
self.params.internal_http_port, self.params.internal_http_port,
tls_config, tls_config,
)?; )?;
self.pg_reload_conf()?;
if !spec.skip_pg_catalog_updates { if !spec.skip_pg_catalog_updates {
let max_concurrent_connections = spec.reconfigure_concurrency; let max_concurrent_connections = spec.reconfigure_concurrency;
// Temporarily reset max_cluster_size in config // Temporarily reset max_cluster_size in config
@@ -1845,10 +1967,9 @@ impl ComputeNode {
Ok(()) Ok(())
})?; })?;
self.pg_reload_conf()?;
} }
self.pg_reload_conf()?;
let unknown_op = "unknown".to_string(); let unknown_op = "unknown".to_string();
let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op); let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
info!( info!(
@@ -1921,7 +2042,8 @@ impl ComputeNode {
// exit loop // exit loop
ComputeStatus::Failed ComputeStatus::Failed
| ComputeStatus::TerminationPending { .. } | ComputeStatus::TerminationPendingFast
| ComputeStatus::TerminationPendingImmediate
| ComputeStatus::Terminated => break 'cert_update, | ComputeStatus::Terminated => break 'cert_update,
// wait // wait
@@ -2087,7 +2209,7 @@ LIMIT 100",
self.params self.params
.remote_ext_base_url .remote_ext_base_url
.as_ref() .as_ref()
.ok_or(DownloadError::BadInput(anyhow!( .ok_or(DownloadError::BadInput(anyhow::anyhow!(
"Remote extensions storage is not configured", "Remote extensions storage is not configured",
)))?; )))?;
@@ -2283,7 +2405,7 @@ LIMIT 100",
let remote_extensions = spec let remote_extensions = spec
.remote_extensions .remote_extensions
.as_ref() .as_ref()
.ok_or(anyhow!("Remote extensions are not configured"))?; .ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;
info!("parse shared_preload_libraries from spec.cluster.settings"); info!("parse shared_preload_libraries from spec.cluster.settings");
let mut libs_vec = Vec::new(); let mut libs_vec = Vec::new();
@@ -2431,14 +2553,31 @@ LIMIT 100",
pub fn spawn_lfc_offload_task(self: &Arc<Self>, interval: Duration) { pub fn spawn_lfc_offload_task(self: &Arc<Self>, interval: Duration) {
self.terminate_lfc_offload_task(); self.terminate_lfc_offload_task();
let secs = interval.as_secs(); let secs = interval.as_secs();
info!("spawning lfc offload worker with {secs}s interval");
let this = self.clone(); let this = self.clone();
info!("spawning LFC offload worker with {secs}s interval");
let handle = spawn(async move { let handle = spawn(async move {
let mut interval = time::interval(interval); let mut interval = time::interval(interval);
interval.tick().await; // returns immediately interval.tick().await; // returns immediately
loop { loop {
interval.tick().await; interval.tick().await;
this.offload_lfc_async().await;
let prewarm_state = this.state.lock().unwrap().lfc_prewarm_state.clone();
// Do not offload the LFC state if we are currently prewarming or if any issue occurred.
// If we did, we might overwrite the LFC state in endpoint storage with an
// incomplete one. Imagine this situation:
// 1. Endpoint started with `autoprewarm: true`
// 2. While prewarming is not yet complete, we upload the new, incomplete state
// 3. Compute gets interrupted and restarts
// 4. We start again and try to prewarm with the state from step 2 instead of the previous complete state
if matches!(
prewarm_state,
LfcPrewarmState::Completed
| LfcPrewarmState::NotPrewarmed
| LfcPrewarmState::Skipped
) {
this.offload_lfc_async().await;
}
} }
}); });
*self.lfc_offload_task.lock().unwrap() = Some(handle); *self.lfc_offload_task.lock().unwrap() = Some(handle);
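
A minimal standalone sketch of the offload gate above, using a local stand-in enum (the real LfcPrewarmState lives in compute_api): offloading is allowed only from the Completed, NotPrewarmed, or Skipped states, so an in-flight or failed prewarm can never cause an incomplete state to be uploaded.

// Local stand-in for compute_api's LfcPrewarmState, for illustration only.
#[derive(Clone, Debug, PartialEq)]
enum PrewarmState {
    NotPrewarmed,
    Prewarming,
    Completed,
    Skipped,
    Failed { error: String },
}

// Mirrors the gate in the offload loop above.
fn may_offload(state: &PrewarmState) -> bool {
    matches!(
        state,
        PrewarmState::Completed | PrewarmState::NotPrewarmed | PrewarmState::Skipped
    )
}

fn main() {
    assert!(may_offload(&PrewarmState::Completed));
    assert!(!may_offload(&PrewarmState::Prewarming));
    assert!(!may_offload(&PrewarmState::Failed { error: "download failed".into() }));
}
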
@@ -2455,19 +2594,11 @@ LIMIT 100",
// If the value is -1, we never suspend so set the value to default collection. // If the value is -1, we never suspend so set the value to default collection.
// If the value is 0, it means default, we will just continue to use the default. // If the value is 0, it means default, we will just continue to use the default.
if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 { if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 {
info!(
"[NEON_EXT_INT_UPD] Spec Timeout: {}, New Timeout: {}",
spec.suspend_timeout_seconds, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL
);
self.params.installed_extensions_collection_interval.store( self.params.installed_extensions_collection_interval.store(
DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL,
std::sync::atomic::Ordering::SeqCst, std::sync::atomic::Ordering::SeqCst,
); );
} else { } else {
info!(
"[NEON_EXT_INT_UPD] Spec Timeout: {}",
spec.suspend_timeout_seconds
);
self.params.installed_extensions_collection_interval.store( self.params.installed_extensions_collection_interval.store(
spec.suspend_timeout_seconds as u64, spec.suspend_timeout_seconds as u64,
std::sync::atomic::Ordering::SeqCst, std::sync::atomic::Ordering::SeqCst,
@@ -2485,7 +2616,7 @@ pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> {
serde_json::to_string(&extensions).expect("failed to serialize extensions list") serde_json::to_string(&extensions).expect("failed to serialize extensions list")
); );
} }
Err(err) => error!("could not get installed extensions: {err:?}"), Err(err) => error!("could not get installed extensions: {err}"),
} }
Ok(()) Ok(())
} }
@@ -2598,7 +2729,10 @@ mod tests {
match ParsedSpec::try_from(spec.clone()) { match ParsedSpec::try_from(spec.clone()) {
Ok(_p) => panic!("Failed to detect duplicate entry"), Ok(_p) => panic!("Failed to detect duplicate entry"),
Err(e) => assert!(e.starts_with("duplicate entry in safekeeper_connstrings:")), Err(e) => assert!(
e.to_string()
.starts_with("duplicate entry in safekeeper_connstrings:")
),
}; };
} }
} }

View File

@@ -70,7 +70,7 @@ impl ComputeNode {
} }
}; };
let row = match client let row = match client
.query_one("select * from get_prewarm_info()", &[]) .query_one("select * from neon.get_prewarm_info()", &[])
.await .await
{ {
Ok(row) => row, Ok(row) => row,
@@ -89,7 +89,7 @@ impl ComputeNode {
self.state.lock().unwrap().lfc_offload_state.clone() self.state.lock().unwrap().lfc_offload_state.clone()
} }
/// If there is a prewarm request ongoing, return false, true otherwise /// If there is a prewarm request ongoing, return `false`, `true` otherwise.
pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool { pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
{ {
let state = &mut self.state.lock().unwrap().lfc_prewarm_state; let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
@@ -101,14 +101,25 @@ impl ComputeNode {
let cloned = self.clone(); let cloned = self.clone();
spawn(async move { spawn(async move {
let Err(err) = cloned.prewarm_impl(from_endpoint).await else { let state = match cloned.prewarm_impl(from_endpoint).await {
cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed; Ok(true) => LfcPrewarmState::Completed,
return; Ok(false) => {
}; info!(
error!(%err); "skipping LFC prewarm because LFC state is not found in endpoint storage"
cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed { );
error: err.to_string(), LfcPrewarmState::Skipped
}
Err(err) => {
crate::metrics::LFC_PREWARM_ERRORS.inc();
error!(%err, "could not prewarm LFC");
LfcPrewarmState::Failed {
error: err.to_string(),
}
}
}; };
cloned.state.lock().unwrap().lfc_prewarm_state = state;
}); });
true true
} }
@@ -119,15 +130,21 @@ impl ComputeNode {
EndpointStoragePair::from_spec_and_endpoint(state.pspec.as_ref().unwrap(), from_endpoint) EndpointStoragePair::from_spec_and_endpoint(state.pspec.as_ref().unwrap(), from_endpoint)
} }
async fn prewarm_impl(&self, from_endpoint: Option<String>) -> Result<()> { /// Request LFC state from endpoint storage and load corresponding pages into Postgres.
/// Returns a result with `false` if the LFC state is not found in endpoint storage.
async fn prewarm_impl(&self, from_endpoint: Option<String>) -> Result<bool> {
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?; let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?;
info!(%url, "requesting LFC state from endpoint storage");
info!(%url, "requesting LFC state from endpoint storage");
let request = Client::new().get(&url).bearer_auth(token); let request = Client::new().get(&url).bearer_auth(token);
let res = request.send().await.context("querying endpoint storage")?; let res = request.send().await.context("querying endpoint storage")?;
let status = res.status(); let status = res.status();
if status != StatusCode::OK { match status {
bail!("{status} querying endpoint storage") StatusCode::OK => (),
StatusCode::NOT_FOUND => {
return Ok(false);
}
_ => bail!("{status} querying endpoint storage"),
} }
let mut uncompressed = Vec::new(); let mut uncompressed = Vec::new();
@@ -140,15 +157,18 @@ impl ComputeNode {
.await .await
.context("decoding LFC state")?; .context("decoding LFC state")?;
let uncompressed_len = uncompressed.len(); let uncompressed_len = uncompressed.len();
info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into postgres");
info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into Postgres");
ComputeNode::get_maintenance_client(&self.tokio_conn_conf) ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await .await
.context("connecting to postgres")? .context("connecting to postgres")?
.query_one("select prewarm_local_cache($1)", &[&uncompressed]) .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed])
.await .await
.context("loading LFC state into postgres") .context("loading LFC state into postgres")
.map(|_| ()) .map(|_| ())?;
Ok(true)
} }
/// If offload request is ongoing, return false, true otherwise /// If offload request is ongoing, return false, true otherwise
@@ -176,11 +196,14 @@ impl ComputeNode {
async fn offload_lfc_with_state_update(&self) { async fn offload_lfc_with_state_update(&self) {
crate::metrics::LFC_OFFLOADS.inc(); crate::metrics::LFC_OFFLOADS.inc();
let Err(err) = self.offload_lfc_impl().await else { let Err(err) = self.offload_lfc_impl().await else {
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed; self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
return; return;
}; };
error!(%err);
crate::metrics::LFC_OFFLOAD_ERRORS.inc();
error!(%err, "could not offload LFC state to endpoint storage");
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed { self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
error: err.to_string(), error: err.to_string(),
}; };
@@ -188,13 +211,13 @@ impl ComputeNode {
async fn offload_lfc_impl(&self) -> Result<()> { async fn offload_lfc_impl(&self) -> Result<()> {
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?; let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
info!(%url, "requesting LFC state from postgres"); info!(%url, "requesting LFC state from Postgres");
let mut compressed = Vec::new(); let mut compressed = Vec::new();
ComputeNode::get_maintenance_client(&self.tokio_conn_conf) ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await .await
.context("connecting to postgres")? .context("connecting to postgres")?
.query_one("select get_local_cache_state()", &[]) .query_one("select neon.get_local_cache_state()", &[])
.await .await
.context("querying LFC state")? .context("querying LFC state")?
.try_get::<usize, &[u8]>(0) .try_get::<usize, &[u8]>(0)
@@ -203,13 +226,17 @@ impl ComputeNode {
.read_to_end(&mut compressed) .read_to_end(&mut compressed)
.await .await
.context("compressing LFC state")?; .context("compressing LFC state")?;
let compressed_len = compressed.len(); let compressed_len = compressed.len();
info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage"); info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage");
let request = Client::new().put(url).bearer_auth(token).body(compressed); let request = Client::new().put(url).bearer_auth(token).body(compressed);
match request.send().await { match request.send().await {
Ok(res) if res.status() == StatusCode::OK => Ok(()), Ok(res) if res.status() == StatusCode::OK => Ok(()),
Ok(res) => bail!("Error writing to endpoint storage: {}", res.status()), Ok(res) => bail!(
"Request to endpoint storage failed with status: {}",
res.status()
),
Err(err) => Err(err).context("writing to endpoint storage"), Err(err) => Err(err).context("writing to endpoint storage"),
} }
} }
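
For reference, a minimal sketch of the endpoint-storage round trip used by prewarm and offload above, with the URL and bearer token taken as assumptions (the real code derives them from the compute spec via EndpointStoragePair). A 404 on download is not an error; it simply means prewarm is skipped.

use anyhow::{Context, Result, bail};
use reqwest::{Client, StatusCode};

// Upload a compressed LFC state blob to endpoint storage.
async fn upload_lfc_state(url: &str, token: &str, compressed: Vec<u8>) -> Result<()> {
    let res = Client::new()
        .put(url)
        .bearer_auth(token)
        .body(compressed)
        .send()
        .await
        .context("writing to endpoint storage")?;
    if res.status() != StatusCode::OK {
        bail!("request to endpoint storage failed with status: {}", res.status());
    }
    Ok(())
}

// Download the LFC state, returning None when no state has been offloaded yet.
async fn download_lfc_state(url: &str, token: &str) -> Result<Option<Vec<u8>>> {
    let res = Client::new().get(url).bearer_auth(token).send().await?;
    match res.status() {
        StatusCode::OK => Ok(Some(res.bytes().await?.to_vec())),
        StatusCode::NOT_FOUND => Ok(None),
        status => bail!("{status} querying endpoint storage"),
    }
}
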

View File

@@ -0,0 +1,132 @@
use crate::compute::ComputeNode;
use anyhow::{Context, Result, bail};
use compute_api::{
responses::{LfcPrewarmState, PromoteState, SafekeepersLsn},
spec::ComputeMode,
};
use std::{sync::Arc, time::Duration};
use tokio::time::sleep;
use utils::lsn::Lsn;
impl ComputeNode {
/// Returns only when promote fails or succeeds. If a network error occurs
/// and http client disconnects, this does not stop promotion, and subsequent
/// calls block until promote finishes.
/// Called by control plane on secondary after primary endpoint is terminated
pub async fn promote(self: &Arc<Self>, safekeepers_lsn: SafekeepersLsn) -> PromoteState {
let cloned = self.clone();
let start_promotion = || {
let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted);
tokio::spawn(async move {
tx.send(match cloned.promote_impl(safekeepers_lsn).await {
Ok(_) => PromoteState::Completed,
Err(err) => {
tracing::error!(%err, "promoting");
PromoteState::Failed {
error: err.to_string(),
}
}
})
});
rx
};
let mut task;
// self.state is unlocked once this block ends, so that promote_impl can lock it
// and task.changed() below can be reached
{
task = self
.state
.lock()
.unwrap()
.promote_state
.get_or_insert_with(start_promotion)
.clone()
}
task.changed().await.expect("promote sender dropped");
task.borrow().clone()
}
// Why do we have to supply safekeepers?
// For the secondary we use primary_connection_conninfo, so the safekeepers field is empty
async fn promote_impl(&self, safekeepers_lsn: SafekeepersLsn) -> Result<()> {
{
let state = self.state.lock().unwrap();
let mode = &state.pspec.as_ref().unwrap().spec.mode;
if *mode != ComputeMode::Replica {
bail!("{} is not replica", mode.to_type_str());
}
// we don't need to query Postgres here, so we don't use self.lfc_prewarm_state()
match &state.lfc_prewarm_state {
LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => {
bail!("prewarm not requested or pending")
}
LfcPrewarmState::Failed { error } => {
tracing::warn!(%error, "replica prewarm failed")
}
_ => {}
}
}
let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await
.context("connecting to postgres")?;
let primary_lsn = safekeepers_lsn.wal_flush_lsn;
let mut last_wal_replay_lsn: Lsn = Lsn::INVALID;
const RETRIES: i32 = 20;
for i in 0..=RETRIES {
let row = client
.query_one("SELECT pg_last_wal_replay_lsn()", &[])
.await
.context("getting last replay lsn")?;
let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
last_wal_replay_lsn = lsn.into();
if last_wal_replay_lsn >= primary_lsn {
break;
}
tracing::info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}");
sleep(Duration::from_secs(1)).await;
}
if last_wal_replay_lsn < primary_lsn {
bail!("didn't catch up with primary in {RETRIES} retries");
}
// using $1 doesn't work with ALTER SYSTEM SET
let safekeepers_sql = format!(
"ALTER SYSTEM SET neon.safekeepers='{}'",
safekeepers_lsn.safekeepers
);
client
.query(&safekeepers_sql, &[])
.await
.context("setting safekeepers")?;
client
.query("SELECT pg_reload_conf()", &[])
.await
.context("reloading postgres config")?;
let row = client
.query_one("SELECT * FROM pg_promote()", &[])
.await
.context("pg_promote")?;
if !row.get::<usize, bool>(0) {
bail!("pg_promote() returned false");
}
let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await
.context("connecting to postgres")?;
let row = client
.query_one("SHOW transaction_read_only", &[])
.await
.context("getting transaction_read_only")?;
if row.get::<usize, &str>(0) == "on" {
bail!("replica in read only mode after promotion");
}
let mut state = self.state.lock().unwrap();
state.pspec.as_mut().unwrap().spec.mode = ComputeMode::Primary;
Ok(())
}
}
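
The promote() entry point above stores a watch receiver in shared state, so only one promotion task ever runs and later callers just await its result. A minimal standalone sketch of that single-flight pattern, with a placeholder task body instead of the real promotion steps:

use std::sync::{Arc, Mutex};
use tokio::sync::watch;

#[derive(Clone, Debug, PartialEq)]
enum State {
    NotDone,
    Completed,
}

#[derive(Default)]
struct Shared {
    // Set by the first caller; later callers clone the receiver and wait on it.
    task: Mutex<Option<watch::Receiver<State>>>,
}

async fn run_once(shared: Arc<Shared>) -> State {
    let mut rx = {
        let mut guard = shared.task.lock().unwrap();
        guard
            .get_or_insert_with(|| {
                let (tx, rx) = watch::channel(State::NotDone);
                tokio::spawn(async move {
                    // Placeholder for the real work (WAL catch-up, pg_promote(), ...).
                    let _ = tx.send(State::Completed);
                });
                rx
            })
            .clone()
    };
    rx.changed().await.expect("sender dropped");
    rx.borrow().clone()
}

#[tokio::main]
async fn main() {
    let shared = Arc::new(Shared::default());
    // The first call spawns the task; the second only awaits the same result.
    assert_eq!(run_once(shared.clone()).await, State::Completed);
    assert_eq!(run_once(shared).await, State::Completed);
}
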

View File

@@ -9,11 +9,14 @@ use std::path::Path;
use compute_api::responses::TlsConfig; use compute_api::responses::TlsConfig;
use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption}; use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
use crate::compute::ComputeNodeParams;
use crate::pg_helpers::{ use crate::pg_helpers::{
GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value, GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
}; };
use crate::tls::{self, SERVER_CRT, SERVER_KEY}; use crate::tls::{self, SERVER_CRT, SERVER_KEY};
use utils::shard::{ShardIndex, ShardNumber};
/// Check that `line` is inside a text file and put it there if it is not. /// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist. /// Create file if it doesn't exist.
pub fn line_in_file(path: &Path, line: &str) -> Result<bool> { pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
@@ -41,6 +44,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
/// Create or completely rewrite configuration file specified by `path` /// Create or completely rewrite configuration file specified by `path`
pub fn write_postgres_conf( pub fn write_postgres_conf(
pgdata_path: &Path, pgdata_path: &Path,
params: &ComputeNodeParams,
spec: &ComputeSpec, spec: &ComputeSpec,
extension_server_port: u16, extension_server_port: u16,
tls_config: &Option<TlsConfig>, tls_config: &Option<TlsConfig>,
@@ -56,24 +60,53 @@ pub fn write_postgres_conf(
// Add options for connecting to storage // Add options for connecting to storage
writeln!(file, "# Neon storage settings")?; writeln!(file, "# Neon storage settings")?;
writeln!(file)?;
if let Some(conninfo) = &spec.pageserver_connection_info { if let Some(conninfo) = &spec.pageserver_connection_info {
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = conninfo.stripe_size {
writeln!(
file,
"# from compute spec's pageserver_conninfo.stripe_size field"
)?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
let mut libpq_urls: Option<Vec<String>> = Some(Vec::new()); let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
let mut grpc_urls: Option<Vec<String>> = Some(Vec::new()); let mut grpc_urls: Option<Vec<String>> = Some(Vec::new());
let num_shards = if conninfo.shard_count.0 == 0 {
1 // unsharded, treat it as a single shard
} else {
conninfo.shard_count.0
};
for shardno in 0..conninfo.shards.len() { for shard_number in 0..num_shards {
let info = conninfo.shards.get(&(shardno as u32)).ok_or_else(|| { let shard_index = ShardIndex {
anyhow::anyhow!("shard {shardno} missing from pageserver_connection_info shard map") shard_number: ShardNumber(shard_number),
shard_count: conninfo.shard_count,
};
let info = conninfo.shards.get(&shard_index).ok_or_else(|| {
anyhow::anyhow!(
"shard {shard_index} missing from pageserver_connection_info shard map"
)
})?; })?;
if let Some(url) = &info.libpq_url { let first_pageserver = info
.pageservers
.first()
.expect("must have at least one pageserver");
// Add the libpq URL to the array, or if the URL is missing, reset the array
// forgetting any previous entries. All servers must have a libpq URL, or none
// at all.
if let Some(url) = &first_pageserver.libpq_url {
if let Some(ref mut urls) = libpq_urls { if let Some(ref mut urls) = libpq_urls {
urls.push(url.clone()); urls.push(url.clone());
} }
} else { } else {
libpq_urls = None libpq_urls = None
} }
if let Some(url) = &info.grpc_url { // Similarly for gRPC URLs
if let Some(url) = &first_pageserver.grpc_url {
if let Some(ref mut urls) = grpc_urls { if let Some(ref mut urls) = grpc_urls {
urls.push(url.clone()); urls.push(url.clone());
} }
@@ -82,6 +115,10 @@ pub fn write_postgres_conf(
} }
} }
if let Some(libpq_urls) = libpq_urls { if let Some(libpq_urls) = libpq_urls {
writeln!(
file,
"# derived from compute spec's pageserver_conninfo field"
)?;
writeln!( writeln!(
file, file,
"neon.pageserver_connstring={}", "neon.pageserver_connstring={}",
@@ -91,6 +128,10 @@ pub fn write_postgres_conf(
writeln!(file, "# no neon.pageserver_connstring")?; writeln!(file, "# no neon.pageserver_connstring")?;
} }
if let Some(grpc_urls) = grpc_urls { if let Some(grpc_urls) = grpc_urls {
writeln!(
file,
"# derived from compute spec's pageserver_conninfo field"
)?;
writeln!( writeln!(
file, file,
"neon.pageserver_grpc_urls={}", "neon.pageserver_grpc_urls={}",
@@ -99,11 +140,19 @@ pub fn write_postgres_conf(
} else { } else {
writeln!(file, "# no neon.pageserver_grpc_urls")?; writeln!(file, "# no neon.pageserver_grpc_urls")?;
} }
} else {
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "# from compute spec's shard_stripe_size field")?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "# from compute spec's pageserver_connstring field")?;
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
}
} }
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
if !spec.safekeeper_connstrings.is_empty() { if !spec.safekeeper_connstrings.is_empty() {
let mut neon_safekeepers_value = String::new(); let mut neon_safekeepers_value = String::new();
tracing::info!( tracing::info!(
@@ -203,6 +252,12 @@ pub fn write_postgres_conf(
} }
} }
writeln!(
file,
"neon.privileged_role_name={}",
escape_conf_value(params.privileged_role_name.as_str())
)?;
// If there are any extra options in the 'settings' field, append those // If there are any extra options in the 'settings' field, append those
if spec.cluster.settings.is_some() { if spec.cluster.settings.is_some() {
writeln!(file, "# Managed by compute_ctl: begin")?; writeln!(file, "# Managed by compute_ctl: begin")?;
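
A standalone sketch of the URL aggregation above (illustrative names): per-shard URLs are joined into a comma-separated GUC value, and if any shard lacks that kind of URL the whole GUC is omitted, matching the all-or-nothing behaviour of the libpq and gRPC loops.

// Join per-shard URLs into one GUC value, or None if any shard lacks a URL.
fn join_urls(per_shard: &[Option<&str>]) -> Option<String> {
    let mut urls = Vec::with_capacity(per_shard.len());
    for url in per_shard {
        urls.push((*url)?.to_string());
    }
    Some(urls.join(","))
}

fn main() {
    // Both shards expose a libpq URL, so neon.pageserver_connstring gets a value.
    assert_eq!(
        join_urls(&[Some("postgresql://ps1:6400"), Some("postgresql://ps2:6400")]),
        Some("postgresql://ps1:6400,postgresql://ps2:6400".to_string())
    );
    // One shard has no gRPC URL, so neon.pageserver_grpc_urls would be omitted.
    assert_eq!(join_urls(&[Some("grpc://ps1:51051"), None]), None);
}
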

View File

@@ -83,6 +83,87 @@ paths:
schema: schema:
$ref: "#/components/schemas/DbsAndRoles" $ref: "#/components/schemas/DbsAndRoles"
/promote:
post:
tags:
- Promotion
summary: Promote secondary replica to primary
description: ""
operationId: promoteReplica
requestBody:
description: Promote requests data
required: true
content:
application/json:
schema:
$ref: "#/components/schemas/SafekeepersLsn"
responses:
200:
description: Promote succeeded or wasn't started
content:
application/json:
schema:
$ref: "#/components/schemas/PromoteState"
500:
description: Promote failed
content:
application/json:
schema:
$ref: "#/components/schemas/PromoteState"
/lfc/prewarm:
post:
summary: Request LFC Prewarm
parameters:
- name: from_endpoint
in: query
schema:
type: string
description: ""
operationId: lfcPrewarm
responses:
202:
description: LFC prewarm started
429:
description: LFC prewarm ongoing
get:
tags:
- Prewarm
summary: Get LFC prewarm state
description: ""
operationId: getLfcPrewarmState
responses:
200:
description: Prewarm state
content:
application/json:
schema:
$ref: "#/components/schemas/LfcPrewarmState"
/lfc/offload:
post:
summary: Request LFC offload
description: ""
operationId: lfcOffload
responses:
202:
description: LFC offload started
429:
description: LFC offload ongoing
get:
tags:
- Prewarm
summary: Get LFC offloading state
description: ""
operationId: getLfcOffloadState
responses:
200:
description: Offload state
content:
application/json:
schema:
$ref: "#/components/schemas/LfcOffloadState"
/database_schema: /database_schema:
get: get:
tags: tags:
@@ -290,9 +371,28 @@ paths:
summary: Terminate Postgres and wait for it to exit summary: Terminate Postgres and wait for it to exit
description: "" description: ""
operationId: terminate operationId: terminate
parameters:
- name: mode
in: query
description: "Terminate mode: fast (wait 30s before returning) and immediate"
required: false
schema:
type: string
enum: ["fast", "immediate"]
default: fast
responses: responses:
200: 200:
description: Result description: Result
content:
application/json:
schema:
$ref: "#/components/schemas/TerminateResponse"
201:
description: Result if compute is already terminated
content:
application/json:
schema:
$ref: "#/components/schemas/TerminateResponse"
412: 412:
description: "wrong state" description: "wrong state"
content: content:
@@ -335,15 +435,6 @@ components:
total_startup_ms: total_startup_ms:
type: integer type: integer
Info:
type: object
description: Information about VM/Pod.
required:
- num_cpus
properties:
num_cpus:
type: integer
DbsAndRoles: DbsAndRoles:
type: object type: object
description: Databases and Roles description: Databases and Roles
@@ -458,11 +549,14 @@ components:
type: string type: string
enum: enum:
- empty - empty
- init
- failed
- running
- configuration_pending - configuration_pending
- init
- running
- configuration - configuration
- failed
- termination_pending_fast
- termination_pending_immediate
- terminated
example: running example: running
ExtensionInstallRequest: ExtensionInstallRequest:
@@ -497,25 +591,69 @@ components:
type: string type: string
example: "1.0.0" example: "1.0.0"
InstalledExtensions: SafekeepersLsn:
type: object type: object
required:
- safekeepers
- wal_flush_lsn
properties: properties:
extensions: safekeepers:
description: Contains list of installed extensions. description: Primary replica safekeepers
type: array type: string
items: wal_flush_lsn:
type: object description: Primary last WAL flush LSN
properties: type: string
extname:
type: string LfcPrewarmState:
version: type: object
type: string required:
items: - status
type: string - total
n_databases: - prewarmed
type: integer - skipped
owned_by_superuser: properties:
type: integer status:
description: LFC prewarm status
enum: [not_prewarmed, prewarming, completed, failed, skipped]
type: string
error:
description: LFC prewarm error, if any
type: string
total:
description: Total pages processed
type: integer
prewarmed:
description: Total pages prewarmed
type: integer
skipped:
description: Pages processed but not prewarmed
type: integer
LfcOffloadState:
type: object
required:
- status
properties:
status:
description: LFC offload status
enum: [not_offloaded, offloading, completed, failed]
type: string
error:
description: LFC offload error, if any
type: string
PromoteState:
type: object
required:
- status
properties:
status:
description: Promote result
enum: [not_promoted, completed, failed]
type: string
error:
description: Promote error, if any
type: string
SetRoleGrantsRequest: SetRoleGrantsRequest:
type: object type: object
@@ -544,6 +682,17 @@ components:
description: Role name. description: Role name.
example: "neon" example: "neon"
TerminateResponse:
type: object
required:
- lsn
properties:
lsn:
type: string
nullable: true
description: "last WAL flush LSN"
example: "0/028F10D8"
SetRoleGrantsResponse: SetRoleGrantsResponse:
type: object type: object
required: required:
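
As a usage illustration for the endpoints added above, a hedged client sketch that starts an LFC prewarm and polls its state. The base URL and bearer token are assumptions (the real port and auth scheme come from compute_ctl's configuration), and the struct below is a local mirror of the documented schema rather than the canonical compute_api type.

use serde::Deserialize;

// Local mirror of the LfcPrewarmState schema above, for illustration only.
#[derive(Debug, Deserialize)]
struct LfcPrewarmState {
    status: String, // not_prewarmed | prewarming | completed | failed | skipped
    error: Option<String>,
    total: Option<i64>,
    prewarmed: Option<i64>,
    skipped: Option<i64>,
}

async fn prewarm_and_wait(base: &str, token: &str) -> anyhow::Result<LfcPrewarmState> {
    let client = reqwest::Client::new();
    // 202 means the prewarm started; 429 means one is already running. Either way, poll.
    client
        .post(format!("{base}/lfc/prewarm"))
        .bearer_auth(token)
        .send()
        .await?;
    loop {
        let state: LfcPrewarmState = client
            .get(format!("{base}/lfc/prewarm"))
            .bearer_auth(token)
            .send()
            .await?
            .json()
            .await?;
        if state.status != "prewarming" {
            return Ok(state);
        }
        tokio::time::sleep(std::time::Duration::from_secs(1)).await;
    }
}
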

View File

@@ -14,6 +14,7 @@ pub(in crate::http) mod insights;
pub(in crate::http) mod lfc; pub(in crate::http) mod lfc;
pub(in crate::http) mod metrics; pub(in crate::http) mod metrics;
pub(in crate::http) mod metrics_json; pub(in crate::http) mod metrics_json;
pub(in crate::http) mod promote;
pub(in crate::http) mod status; pub(in crate::http) mod status;
pub(in crate::http) mod terminate; pub(in crate::http) mod terminate;

View File

@@ -0,0 +1,14 @@
use crate::http::JsonResponse;
use axum::Form;
use http::StatusCode;
pub(in crate::http) async fn promote(
compute: axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>,
Form(safekeepers_lsn): Form<compute_api::responses::SafekeepersLsn>,
) -> axum::response::Response {
let state = compute.promote(safekeepers_lsn).await;
if let compute_api::responses::PromoteState::Failed { error } = state {
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, error);
}
JsonResponse::success(StatusCode::OK, state)
}

View File

@@ -3,7 +3,7 @@ use crate::http::JsonResponse;
use axum::extract::State; use axum::extract::State;
use axum::response::Response; use axum::response::Response;
use axum_extra::extract::OptionalQuery; use axum_extra::extract::OptionalQuery;
use compute_api::responses::{ComputeStatus, TerminateResponse}; use compute_api::responses::{ComputeStatus, TerminateMode, TerminateResponse};
use http::StatusCode; use http::StatusCode;
use serde::Deserialize; use serde::Deserialize;
use std::sync::Arc; use std::sync::Arc;
@@ -12,7 +12,7 @@ use tracing::info;
#[derive(Deserialize, Default)] #[derive(Deserialize, Default)]
pub struct TerminateQuery { pub struct TerminateQuery {
mode: compute_api::responses::TerminateMode, mode: TerminateMode,
} }
/// Terminate the compute. /// Terminate the compute.
@@ -24,16 +24,16 @@ pub(in crate::http) async fn terminate(
{ {
let mut state = compute.state.lock().unwrap(); let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::Terminated { if state.status == ComputeStatus::Terminated {
return JsonResponse::success(StatusCode::CREATED, state.terminate_flush_lsn); let response = TerminateResponse {
lsn: state.terminate_flush_lsn,
};
return JsonResponse::success(StatusCode::CREATED, response);
} }
if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) { if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
return JsonResponse::invalid_status(state.status); return JsonResponse::invalid_status(state.status);
} }
state.set_status( state.set_status(mode.into(), &compute.state_changed);
ComputeStatus::TerminationPending { mode },
&compute.state_changed,
);
} }
forward_termination_signal(false); forward_termination_signal(false);
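
And a matching sketch for the extended terminate endpoint above, assuming it is mounted as POST /terminate and that the same base URL and bearer-token assumptions apply; the response struct is a local mirror of the TerminateResponse schema.

use serde::Deserialize;

// Local mirror of the TerminateResponse schema, for illustration only.
#[derive(Debug, Deserialize)]
struct TerminateResponse {
    lsn: Option<String>,
}

// 200 means Postgres terminated now; 201 means the compute was already terminated.
async fn terminate_immediate(base: &str, token: &str) -> anyhow::Result<Option<String>> {
    let res = reqwest::Client::new()
        .post(format!("{base}/terminate?mode=immediate"))
        .bearer_auth(token)
        .send()
        .await?;
    let status = res.status();
    let body: TerminateResponse = res.json().await?;
    println!("terminate returned {status}, flush lsn: {:?}", body.lsn);
    Ok(body.lsn)
}
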

View File

@@ -23,7 +23,7 @@ use super::{
middleware::authorize::Authorize, middleware::authorize::Authorize,
routes::{ routes::{
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
grants, insights, lfc, metrics, metrics_json, status, terminate, grants, insights, lfc, metrics, metrics_json, promote, status, terminate,
}, },
}; };
use crate::compute::ComputeNode; use crate::compute::ComputeNode;
@@ -87,6 +87,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
let authenticated_router = Router::<Arc<ComputeNode>>::new() let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm)) .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
.route("/lfc/offload", get(lfc::offload_state).post(lfc::offload)) .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
.route("/promote", post(promote::promote))
.route("/check_writability", post(check_writability::is_writable)) .route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure)) .route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump)) .route("/database_schema", get(database_schema::get_schema_dump))

View File

@@ -2,6 +2,7 @@ use std::collections::HashMap;
use anyhow::Result; use anyhow::Result;
use compute_api::responses::{InstalledExtension, InstalledExtensions}; use compute_api::responses::{InstalledExtension, InstalledExtensions};
use tokio_postgres::error::Error as PostgresError;
use tokio_postgres::{Client, Config, NoTls}; use tokio_postgres::{Client, Config, NoTls};
use crate::metrics::INSTALLED_EXTENSIONS; use crate::metrics::INSTALLED_EXTENSIONS;
@@ -10,7 +11,7 @@ use crate::metrics::INSTALLED_EXTENSIONS;
/// and to make database listing query here more explicit. /// and to make database listing query here more explicit.
/// ///
/// Limit the number of databases to 500 to avoid excessive load. /// Limit the number of databases to 500 to avoid excessive load.
async fn list_dbs(client: &mut Client) -> Result<Vec<String>> { async fn list_dbs(client: &mut Client) -> Result<Vec<String>, PostgresError> {
// `pg_database.datconnlimit = -2` means that the database is in the // `pg_database.datconnlimit = -2` means that the database is in the
// invalid state // invalid state
let databases = client let databases = client
@@ -37,7 +38,9 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
/// Same extension can be installed in multiple databases with different versions, /// Same extension can be installed in multiple databases with different versions,
/// so we report a separate metric (number of databases where it is installed) /// so we report a separate metric (number of databases where it is installed)
/// for each extension version. /// for each extension version.
pub async fn get_installed_extensions(mut conf: Config) -> Result<InstalledExtensions> { pub async fn get_installed_extensions(
mut conf: Config,
) -> Result<InstalledExtensions, PostgresError> {
conf.application_name("compute_ctl:get_installed_extensions"); conf.application_name("compute_ctl:get_installed_extensions");
let databases: Vec<String> = { let databases: Vec<String> = {
let (mut client, connection) = conf.connect(NoTls).await?; let (mut client, connection) = conf.connect(NoTls).await?;

View File

@@ -12,6 +12,7 @@ pub mod logger;
pub mod catalog; pub mod catalog;
pub mod compute; pub mod compute;
pub mod compute_prewarm; pub mod compute_prewarm;
pub mod compute_promote;
pub mod disk_quota; pub mod disk_quota;
pub mod extension_server; pub mod extension_server;
pub mod installed_extensions; pub mod installed_extensions;

View File

@@ -4,13 +4,13 @@ use std::thread;
use std::time::{Duration, SystemTime}; use std::time::{Duration, SystemTime};
use anyhow::{Result, bail}; use anyhow::{Result, bail};
use compute_api::spec::{ComputeMode, PageserverConnectionInfo}; use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverProtocol};
use pageserver_page_api as page_api; use pageserver_page_api as page_api;
use postgres::{NoTls, SimpleQueryMessage}; use postgres::{NoTls, SimpleQueryMessage};
use tracing::{info, warn}; use tracing::{info, warn};
use utils::id::{TenantId, TimelineId}; use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn; use utils::lsn::Lsn;
use utils::shard::{ShardCount, ShardNumber, TenantShardId}; use utils::shard::TenantShardId;
use crate::compute::ComputeNode; use crate::compute::ComputeNode;
@@ -116,37 +116,38 @@ fn try_acquire_lsn_lease(
timeline_id: TimelineId, timeline_id: TimelineId,
lsn: Lsn, lsn: Lsn,
) -> Result<Option<SystemTime>> { ) -> Result<Option<SystemTime>> {
let shard_count = conninfo.shards.len();
let mut leases = Vec::new(); let mut leases = Vec::new();
for (shard_number, shard) in conninfo.shards.into_iter() { for (shard_index, shard) in conninfo.shards.into_iter() {
let tenant_shard_id = match shard_count { let tenant_shard_id = TenantShardId {
0 | 1 => TenantShardId::unsharded(tenant_id), tenant_id,
shard_count => TenantShardId { shard_number: shard_index.shard_number,
tenant_id, shard_count: shard_index.shard_count,
shard_number: ShardNumber(shard_number as u8),
shard_count: ShardCount::new(shard_count as u8),
},
}; };
let lease = if conninfo.prefer_grpc { // XXX: If there is more than one pageserver for a shard, do we need to get a
acquire_lsn_lease_grpc( // lease on all of them? Currently, that's what we assume, but this is hypothetical
&shard.grpc_url.unwrap(), // as of this writing, as we never pass the info for more than one pageserver per
auth, // shard.
tenant_shard_id, for pageserver in shard.pageservers {
timeline_id, let lease = match conninfo.prefer_protocol {
lsn, PageserverProtocol::Grpc => acquire_lsn_lease_grpc(
)? &pageserver.grpc_url.unwrap(),
} else { auth,
acquire_lsn_lease_libpq( tenant_shard_id,
&shard.libpq_url.unwrap(), timeline_id,
auth, lsn,
tenant_shard_id, )?,
timeline_id, PageserverProtocol::Libpq => acquire_lsn_lease_libpq(
lsn, &pageserver.libpq_url.unwrap(),
)? auth,
}; tenant_shard_id,
leases.push(lease); timeline_id,
lsn,
)?,
};
leases.push(lease);
}
} }
Ok(leases.into_iter().min().flatten()) Ok(leases.into_iter().min().flatten())
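
The final `leases.into_iter().min().flatten()` relies on Option's ordering (None sorts before Some), so if any shard fails to return a lease the combined result is None, and otherwise it is the earliest expiration. A tiny sketch of that behaviour:

fn main() {
    // None sorts before Some, so min() returns None if any lease is missing...
    let leases: Vec<Option<u64>> = vec![Some(30), None, Some(10)];
    assert_eq!(leases.into_iter().min().flatten(), None);

    // ...and otherwise the smallest (earliest) lease wins.
    let leases: Vec<Option<u64>> = vec![Some(30), Some(10)];
    assert_eq!(leases.into_iter().min().flatten(), Some(10));
}
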

View File

@@ -105,6 +105,14 @@ pub(crate) static LFC_PREWARMS: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric") .expect("failed to define a metric")
}); });
pub(crate) static LFC_PREWARM_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"compute_ctl_lfc_prewarm_errors_total",
"Total number of LFC prewarm errors",
)
.expect("failed to define a metric")
});
pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| { pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!( register_int_counter!(
"compute_ctl_lfc_offloads_total", "compute_ctl_lfc_offloads_total",
@@ -113,6 +121,14 @@ pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric") .expect("failed to define a metric")
}); });
pub(crate) static LFC_OFFLOAD_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"compute_ctl_lfc_offload_errors_total",
"Total number of LFC offload errors",
)
.expect("failed to define a metric")
});
pub fn collect() -> Vec<MetricFamily> { pub fn collect() -> Vec<MetricFamily> {
let mut metrics = COMPUTE_CTL_UP.collect(); let mut metrics = COMPUTE_CTL_UP.collect();
metrics.extend(INSTALLED_EXTENSIONS.collect()); metrics.extend(INSTALLED_EXTENSIONS.collect());
@@ -123,6 +139,8 @@ pub fn collect() -> Vec<MetricFamily> {
metrics.extend(PG_CURR_DOWNTIME_MS.collect()); metrics.extend(PG_CURR_DOWNTIME_MS.collect());
metrics.extend(PG_TOTAL_DOWNTIME_MS.collect()); metrics.extend(PG_TOTAL_DOWNTIME_MS.collect());
metrics.extend(LFC_PREWARMS.collect()); metrics.extend(LFC_PREWARMS.collect());
metrics.extend(LFC_PREWARM_ERRORS.collect());
metrics.extend(LFC_OFFLOADS.collect()); metrics.extend(LFC_OFFLOADS.collect());
metrics.extend(LFC_OFFLOAD_ERRORS.collect());
metrics metrics
} }

View File

@@ -0,0 +1 @@
ALTER ROLE {privileged_role_name} BYPASSRLS;

View File

@@ -1 +0,0 @@
ALTER ROLE neon_superuser BYPASSRLS;

View File

@@ -1,8 +1,21 @@
-- On December 8th, 2023, an engineering escalation (INC-110) was opened after
-- it was found that BYPASSRLS was being applied to all roles.
--
-- PR that introduced the issue: https://github.com/neondatabase/neon/pull/5657
-- Subsequent commit on main: https://github.com/neondatabase/neon/commit/ad99fa5f0393e2679e5323df653c508ffa0ac072
--
-- NOBYPASSRLS and INHERIT are the defaults for a Postgres role, but because it
-- isn't easy to know if a Postgres cluster is affected by the issue, we need to
-- keep the migration around for a long time, if not indefinitely, so any
-- cluster can be fixed.
--
-- Branching is the gift that keeps on giving...
DO $$ DO $$
DECLARE DECLARE
role_name text; role_name text;
BEGIN BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member') FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, '{privileged_role_name}', 'member')
LOOP LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name); RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT'; EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
@@ -10,7 +23,7 @@ BEGIN
FOR role_name IN SELECT rolname FROM pg_roles FOR role_name IN SELECT rolname FROM pg_roles
WHERE WHERE
NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_') NOT pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT starts_with(rolname, 'pg_')
LOOP LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name); RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS'; EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';

View File

@@ -1,6 +1,6 @@
DO $$ DO $$
BEGIN BEGIN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT pg_create_subscription TO neon_superuser'; EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}';
END IF; END IF;
END $$; END $$;

View File

@@ -1 +0,0 @@
GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION;

View File

@@ -0,0 +1 @@
GRANT pg_monitor TO {privileged_role_name} WITH ADMIN OPTION;

View File

@@ -1,4 +1,4 @@
-- SKIP: Deemed insufficient for allowing relations created by extensions to be -- SKIP: Deemed insufficient for allowing relations created by extensions to be
-- interacted with by neon_superuser without permission issues. -- interacted with by {privileged_role_name} without permission issues.
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser; ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO {privileged_role_name};

View File

@@ -1,4 +1,4 @@
-- SKIP: Deemed insufficient for allowing relations created by extensions to be -- SKIP: Deemed insufficient for allowing relations created by extensions to be
-- interacted with by neon_superuser without permission issues. -- interacted with by {privileged_role_name} without permission issues.
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser; ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO {privileged_role_name};

View File

@@ -1,3 +1,3 @@
-- SKIP: Moved inline to the handle_grants() functions. -- SKIP: Moved inline to the handle_grants() functions.
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION; ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO {privileged_role_name} WITH GRANT OPTION;

View File

@@ -1,3 +1,3 @@
-- SKIP: Moved inline to the handle_grants() functions. -- SKIP: Moved inline to the handle_grants() functions.
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION; ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO {privileged_role_name} WITH GRANT OPTION;

View File

@@ -1,7 +1,7 @@
DO $$ DO $$
BEGIN BEGIN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser'; EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}';
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser'; EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}';
END IF; END IF;
END $$; END $$;

View File

@@ -1 +0,0 @@
GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser;

View File

@@ -0,0 +1 @@
GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO {privileged_role_name};

View File

@@ -0,0 +1 @@
GRANT pg_signal_backend TO {privileged_role_name} WITH ADMIN OPTION;
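
These migration files now carry a `{privileged_role_name}` placeholder that handle_migrations() fills in with format! before executing them. A minimal sketch of that substitution, with an illustrative role name:

fn main() {
    let privileged_role_name = "neon_superuser"; // illustrative value
    // Same substitution handle_migrations() applies to the migration templates above.
    let sql = format!("GRANT pg_signal_backend TO {privileged_role_name} WITH ADMIN OPTION;");
    assert_eq!(sql, "GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION;");
}
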

View File

@@ -7,13 +7,17 @@ BEGIN
INTO monitor INTO monitor
FROM pg_auth_members FROM pg_auth_members
WHERE roleid = 'pg_monitor'::regrole WHERE roleid = 'pg_monitor'::regrole
AND member = 'pg_monitor'::regrole; AND member = 'neon_superuser'::regrole;
IF NOT monitor.member THEN IF monitor IS NULL THEN
RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';
END IF;
IF monitor.admin IS NULL OR NOT monitor.member THEN
RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor'; RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor';
END IF; END IF;
IF NOT monitor.admin THEN IF monitor.admin IS NULL OR NOT monitor.admin THEN
RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor'; RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor';
END IF; END IF;
END $$; END $$;

View File

@@ -0,0 +1,23 @@
DO $$
DECLARE
signal_backend record;
BEGIN
SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
admin_option AS admin
INTO signal_backend
FROM pg_auth_members
WHERE roleid = 'pg_signal_backend'::regrole
AND member = 'neon_superuser'::regrole;
IF signal_backend IS NULL THEN
RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_signal_backend';
END IF;
IF signal_backend.member IS NULL OR NOT signal_backend.member THEN
RAISE EXCEPTION 'neon_superuser is not a member of pg_signal_backend';
END IF;
IF signal_backend.admin IS NULL OR NOT signal_backend.admin THEN
RAISE EXCEPTION 'neon_superuser cannot grant pg_signal_backend';
END IF;
END $$;

View File

@@ -84,7 +84,8 @@ impl ComputeMonitor {
if matches!( if matches!(
compute_status, compute_status,
ComputeStatus::Terminated ComputeStatus::Terminated
| ComputeStatus::TerminationPending { .. } | ComputeStatus::TerminationPendingFast
| ComputeStatus::TerminationPendingImmediate
| ComputeStatus::Failed | ComputeStatus::Failed
) { ) {
info!( info!(

View File

@@ -9,6 +9,7 @@ use reqwest::StatusCode;
use tokio_postgres::Client; use tokio_postgres::Client;
use tracing::{error, info, instrument}; use tracing::{error, info, instrument};
use crate::compute::ComputeNodeParams;
use crate::config; use crate::config;
use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS}; use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS};
use crate::migration::MigrationRunner; use crate::migration::MigrationRunner;
@@ -169,7 +170,7 @@ pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
} }
#[instrument(skip_all)] #[instrument(skip_all)]
pub async fn handle_migrations(client: &mut Client) -> Result<()> { pub async fn handle_migrations(params: ComputeNodeParams, client: &mut Client) -> Result<()> {
info!("handle migrations"); info!("handle migrations");
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@@ -178,24 +179,58 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> {
// Add new migrations in numerical order. // Add new migrations in numerical order.
let migrations = [ let migrations = [
include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"), &format!(
include_str!("./migrations/0002-alter_roles.sql"), include_str!("./migrations/0001-add_bypass_rls_to_privileged_role.sql"),
include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"), privileged_role_name = params.privileged_role_name
include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"),
include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"),
include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"),
include_str!(
"./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
), ),
include_str!( &format!(
"./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" include_str!("./migrations/0002-alter_roles.sql"),
privileged_role_name = params.privileged_role_name
),
&format!(
include_str!("./migrations/0003-grant_pg_create_subscription_to_privileged_role.sql"),
privileged_role_name = params.privileged_role_name
),
&format!(
include_str!("./migrations/0004-grant_pg_monitor_to_privileged_role.sql"),
privileged_role_name = params.privileged_role_name
),
&format!(
include_str!("./migrations/0005-grant_all_on_tables_to_privileged_role.sql"),
privileged_role_name = params.privileged_role_name
),
&format!(
include_str!("./migrations/0006-grant_all_on_sequences_to_privileged_role.sql"),
privileged_role_name = params.privileged_role_name
),
&format!(
include_str!(
"./migrations/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql"
),
privileged_role_name = params.privileged_role_name
),
&format!(
include_str!(
"./migrations/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql"
),
privileged_role_name = params.privileged_role_name
), ),
include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"), include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"),
include_str!( &format!(
"./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql" include_str!(
"./migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql"
),
privileged_role_name = params.privileged_role_name
), ),
include_str!( &format!(
"./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql" include_str!(
"./migrations/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql"
),
privileged_role_name = params.privileged_role_name
),
&format!(
include_str!("./migrations/0012-grant_pg_signal_backend_to_privileged_role.sql"),
privileged_role_name = params.privileged_role_name
), ),
]; ];
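The migration files above are now plain SQL templates with a `{privileged_role_name}` placeholder that compute_ctl fills in at startup via `format!(include_str!(...), privileged_role_name = ...)`. A minimal, self-contained sketch of the same substitution (the template string and the runtime `replace` are illustrative stand-ins, not the real embedded files or code path):

```rust
// Sketch only: render a SQL template with the configured privileged role name.
// compute_ctl does this with `format!` over files embedded via `include_str!`;
// a runtime replace is used here just to keep the example self-contained.
fn render_migration(template: &str, privileged_role_name: &str) -> String {
    template.replace("{privileged_role_name}", privileged_role_name)
}

fn main() {
    let template = "GRANT pg_signal_backend TO {privileged_role_name} WITH ADMIN OPTION;";
    // With the default role name this reproduces the statement migration 0012 ran before.
    println!("{}", render_migration(template, "neon_superuser"));
}
```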

View File

@@ -13,14 +13,14 @@ use tokio_postgres::Client;
use tokio_postgres::error::SqlState; use tokio_postgres::error::SqlState;
use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
use crate::compute::{ComputeNode, ComputeState}; use crate::compute::{ComputeNode, ComputeNodeParams, ComputeState};
use crate::pg_helpers::{ use crate::pg_helpers::{
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async, DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
get_existing_roles_async, get_existing_roles_async,
}; };
use crate::spec_apply::ApplySpecPhase::{ use crate::spec_apply::ApplySpecPhase::{
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser, CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreatePgauditExtension,
CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon, CreatePgauditlogtofileExtension, CreatePrivilegedRole, CreateSchemaNeon,
DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
RunInEachDatabase, RunInEachDatabase,
@@ -49,6 +49,7 @@ impl ComputeNode {
// Proceed with post-startup configuration. Note, that order of operations is important. // Proceed with post-startup configuration. Note, that order of operations is important.
let client = Self::get_maintenance_client(&conf).await?; let client = Self::get_maintenance_client(&conf).await?;
let spec = spec.clone(); let spec = spec.clone();
let params = Arc::new(self.params.clone());
let databases = get_existing_dbs_async(&client).await?; let databases = get_existing_dbs_async(&client).await?;
let roles = get_existing_roles_async(&client) let roles = get_existing_roles_async(&client)
@@ -157,6 +158,7 @@ impl ComputeNode {
let conf = Arc::new(conf); let conf = Arc::new(conf);
let fut = Self::apply_spec_sql_db( let fut = Self::apply_spec_sql_db(
params.clone(),
spec.clone(), spec.clone(),
conf, conf,
ctx.clone(), ctx.clone(),
@@ -185,7 +187,7 @@ impl ComputeNode {
} }
for phase in [ for phase in [
CreateNeonSuperuser, CreatePrivilegedRole,
DropInvalidDatabases, DropInvalidDatabases,
RenameRoles, RenameRoles,
CreateAndAlterRoles, CreateAndAlterRoles,
@@ -195,6 +197,7 @@ impl ComputeNode {
] { ] {
info!("Applying phase {:?}", &phase); info!("Applying phase {:?}", &phase);
apply_operations( apply_operations(
params.clone(),
spec.clone(), spec.clone(),
ctx.clone(), ctx.clone(),
jwks_roles.clone(), jwks_roles.clone(),
@@ -243,6 +246,7 @@ impl ComputeNode {
} }
let fut = Self::apply_spec_sql_db( let fut = Self::apply_spec_sql_db(
params.clone(),
spec.clone(), spec.clone(),
conf, conf,
ctx.clone(), ctx.clone(),
@@ -293,6 +297,7 @@ impl ComputeNode {
for phase in phases { for phase in phases {
debug!("Applying phase {:?}", &phase); debug!("Applying phase {:?}", &phase);
apply_operations( apply_operations(
params.clone(),
spec.clone(), spec.clone(),
ctx.clone(), ctx.clone(),
jwks_roles.clone(), jwks_roles.clone(),
@@ -313,7 +318,9 @@ impl ComputeNode {
/// May opt to not connect to databases that don't have any scheduled /// May opt to not connect to databases that don't have any scheduled
/// operations. The function is concurrency-controlled with the provided /// operations. The function is concurrency-controlled with the provided
/// semaphore. The caller has to make sure the semaphore isn't exhausted. /// semaphore. The caller has to make sure the semaphore isn't exhausted.
#[allow(clippy::too_many_arguments)] // TODO: needs bigger refactoring
async fn apply_spec_sql_db( async fn apply_spec_sql_db(
params: Arc<ComputeNodeParams>,
spec: Arc<ComputeSpec>, spec: Arc<ComputeSpec>,
conf: Arc<tokio_postgres::Config>, conf: Arc<tokio_postgres::Config>,
ctx: Arc<tokio::sync::RwLock<MutableApplyContext>>, ctx: Arc<tokio::sync::RwLock<MutableApplyContext>>,
@@ -328,6 +335,7 @@ impl ComputeNode {
for subphase in subphases { for subphase in subphases {
apply_operations( apply_operations(
params.clone(),
spec.clone(), spec.clone(),
ctx.clone(), ctx.clone(),
jwks_roles.clone(), jwks_roles.clone(),
@@ -467,7 +475,7 @@ pub enum PerDatabasePhase {
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub enum ApplySpecPhase { pub enum ApplySpecPhase {
CreateNeonSuperuser, CreatePrivilegedRole,
DropInvalidDatabases, DropInvalidDatabases,
RenameRoles, RenameRoles,
CreateAndAlterRoles, CreateAndAlterRoles,
@@ -510,6 +518,7 @@ pub struct MutableApplyContext {
/// - No timeouts have (yet) been implemented. /// - No timeouts have (yet) been implemented.
/// - The caller is responsible for limiting and/or applying concurrency. /// - The caller is responsible for limiting and/or applying concurrency.
pub async fn apply_operations<'a, Fut, F>( pub async fn apply_operations<'a, Fut, F>(
params: Arc<ComputeNodeParams>,
spec: Arc<ComputeSpec>, spec: Arc<ComputeSpec>,
ctx: Arc<RwLock<MutableApplyContext>>, ctx: Arc<RwLock<MutableApplyContext>>,
jwks_roles: Arc<HashSet<String>>, jwks_roles: Arc<HashSet<String>>,
@@ -527,7 +536,7 @@ where
debug!("Processing phase {:?}", &apply_spec_phase); debug!("Processing phase {:?}", &apply_spec_phase);
let ctx = ctx; let ctx = ctx;
let mut ops = get_operations(&spec, &ctx, &jwks_roles, &apply_spec_phase) let mut ops = get_operations(&params, &spec, &ctx, &jwks_roles, &apply_spec_phase)
.await? .await?
.peekable(); .peekable();
@@ -588,14 +597,18 @@ where
/// sort/merge/batch execution, but for now this is a nice way to improve /// sort/merge/batch execution, but for now this is a nice way to improve
/// batching behavior of the commands. /// batching behavior of the commands.
async fn get_operations<'a>( async fn get_operations<'a>(
params: &'a ComputeNodeParams,
spec: &'a ComputeSpec, spec: &'a ComputeSpec,
ctx: &'a RwLock<MutableApplyContext>, ctx: &'a RwLock<MutableApplyContext>,
jwks_roles: &'a HashSet<String>, jwks_roles: &'a HashSet<String>,
apply_spec_phase: &'a ApplySpecPhase, apply_spec_phase: &'a ApplySpecPhase,
) -> Result<Box<dyn Iterator<Item = Operation> + 'a + Send>> { ) -> Result<Box<dyn Iterator<Item = Operation> + 'a + Send>> {
match apply_spec_phase { match apply_spec_phase {
ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation { ApplySpecPhase::CreatePrivilegedRole => Ok(Box::new(once(Operation {
query: include_str!("sql/create_neon_superuser.sql").to_string(), query: format!(
include_str!("sql/create_privileged_role.sql"),
privileged_role_name = params.privileged_role_name
),
comment: None, comment: None,
}))), }))),
ApplySpecPhase::DropInvalidDatabases => { ApplySpecPhase::DropInvalidDatabases => {
@@ -697,8 +710,9 @@ async fn get_operations<'a>(
None => { None => {
let query = if !jwks_roles.contains(role.name.as_str()) { let query = if !jwks_roles.contains(role.name.as_str()) {
format!( format!(
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser {}", "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE {} {}",
role.name.pg_quote(), role.name.pg_quote(),
params.privileged_role_name,
role.to_pg_options(), role.to_pg_options(),
) )
} else { } else {
@@ -849,8 +863,9 @@ async fn get_operations<'a>(
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database
// (see https://www.postgresql.org/docs/current/ddl-priv.html) // (see https://www.postgresql.org/docs/current/ddl-priv.html)
query: format!( query: format!(
"GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser", "GRANT ALL PRIVILEGES ON DATABASE {} TO {}",
db.name.pg_quote() db.name.pg_quote(),
params.privileged_role_name
), ),
comment: None, comment: None,
}, },

View File

@@ -1,8 +0,0 @@
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
THEN
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
END IF;
END
$$;

View File

@@ -0,0 +1,8 @@
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}')
THEN
CREATE ROLE {privileged_role_name} CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
END IF;
END
$$;

View File

@@ -8,10 +8,10 @@ code changes locally, but not suitable for running production systems.
## Example: Start with Postgres 16 ## Example: Start with Postgres 16
To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands. To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 2 of the start-up commands.
```shell ```shell
cargo neon init --pg-version 16 cargo neon init
cargo neon start cargo neon start
cargo neon tenant create --set-default --pg-version 16 cargo neon tenant create --set-default --pg-version 16
cargo neon endpoint create main --pg-version 16 cargo neon endpoint create main --pg-version 16

View File

@@ -16,9 +16,14 @@ use std::time::Duration;
use anyhow::{Context, Result, anyhow, bail}; use anyhow::{Context, Result, anyhow, bail};
use clap::Parser; use clap::Parser;
use compute_api::requests::ComputeClaimsScope; use compute_api::requests::ComputeClaimsScope;
use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverShardConnectionInfo}; use compute_api::spec::{
ComputeMode, PageserverConnectionInfo, PageserverProtocol, PageserverShardInfo,
};
use control_plane::broker::StorageBroker; use control_plane::broker::StorageBroker;
use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode}; use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode};
use control_plane::endpoint::{
pageserver_conf_to_shard_conn_info, tenant_locate_response_to_conn_info,
};
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage}; use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
use control_plane::local_env; use control_plane::local_env;
use control_plane::local_env::{ use control_plane::local_env::{
@@ -44,7 +49,6 @@ use pageserver_api::models::{
}; };
use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId}; use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
use postgres_backend::AuthType; use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId}; use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId};
use safekeeper_api::{ use safekeeper_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
@@ -52,11 +56,11 @@ use safekeeper_api::{
}; };
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR; use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
use tokio::task::JoinSet; use tokio::task::JoinSet;
use url::Host;
use utils::auth::{Claims, Scope}; use utils::auth::{Claims, Scope};
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
use utils::lsn::Lsn; use utils::lsn::Lsn;
use utils::project_git_version; use utils::project_git_version;
use utils::shard::ShardIndex;
// Default id of a safekeeper node, if not specified on the command line. // Default id of a safekeeper node, if not specified on the command line.
const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1); const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1);
@@ -631,6 +635,10 @@ struct EndpointCreateCmdArgs {
help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests." help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests."
)] )]
allow_multiple: bool, allow_multiple: bool,
/// Can only be set at endpoint creation.
#[clap(long, help = "Name of the privileged role for the endpoint")]
privileged_role_name: Option<String>,
} }
#[derive(clap::Args)] #[derive(clap::Args)]
@@ -1480,6 +1488,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
args.grpc, args.grpc,
!args.update_catalog, !args.update_catalog,
false, false,
args.privileged_role_name.clone(),
)?; )?;
} }
EndpointCmd::Start(args) => { EndpointCmd::Start(args) => {
@@ -1516,74 +1525,56 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
)?; )?;
} }
let (shards, stripe_size) = if let Some(ps_id) = pageserver_id { let prefer_protocol = if endpoint.grpc {
let conf = env.get_pageserver_conf(ps_id).unwrap(); PageserverProtocol::Grpc
let libpq_url = Some({ } else {
let (host, port) = parse_host_port(&conf.listen_pg_addr)?; PageserverProtocol::Libpq
let port = port.unwrap_or(5432); };
format!("postgres://no_user@{host}:{port}")
});
let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
let (host, port) = parse_host_port(grpc_addr)?;
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
Some(format!("grpc://no_user@{host}:{port}"))
} else {
None
};
let pageserver = PageserverShardConnectionInfo {
libpq_url,
grpc_url,
};
let mut pageserver_conninfo = if let Some(ps_id) = pageserver_id {
let conf = env.get_pageserver_conf(ps_id).unwrap();
let ps_conninfo = pageserver_conf_to_shard_conn_info(conf)?;
let shard_info = PageserverShardInfo {
pageservers: vec![ps_conninfo],
};
// If caller is telling us what pageserver to use, this is not a tenant which is // If caller is telling us what pageserver to use, this is not a tenant which is
// fully managed by storage controller, therefore not sharded. // fully managed by storage controller, therefore not sharded.
(vec![(0, pageserver)], DEFAULT_STRIPE_SIZE) let shards: HashMap<_, _> = vec![(ShardIndex::unsharded(), shard_info)]
.into_iter()
.collect();
PageserverConnectionInfo {
shard_count: ShardCount(0),
stripe_size: None,
shards,
prefer_protocol,
}
} else { } else {
// Look up the currently attached location of the tenant, and its striping metadata, // Look up the currently attached location of the tenant, and its striping metadata,
// to pass these on to postgres. // to pass these on to postgres.
let storage_controller = StorageController::from_env(env); let storage_controller = StorageController::from_env(env);
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?; let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
let shards = futures::future::try_join_all(locate_result.shards.into_iter().map( assert!(!locate_result.shards.is_empty());
|shard| async move {
if let ComputeMode::Static(lsn) = endpoint.mode { // Initialize LSN leases for static computes.
// Initialize LSN leases for static computes. if let ComputeMode::Static(lsn) = endpoint.mode {
futures::future::try_join_all(locate_result.shards.iter().map(
|shard| async move {
let conf = env.get_pageserver_conf(shard.node_id).unwrap(); let conf = env.get_pageserver_conf(shard.node_id).unwrap();
let pageserver = PageServerNode::from_env(env, conf); let pageserver = PageServerNode::from_env(env, conf);
pageserver pageserver
.http_client .http_client
.timeline_init_lsn_lease(shard.shard_id, endpoint.timeline_id, lsn) .timeline_init_lsn_lease(shard.shard_id, endpoint.timeline_id, lsn)
.await?; .await
} },
))
.await?;
}
let libpq_host = Host::parse(&shard.listen_pg_addr)?; tenant_locate_response_to_conn_info(&locate_result)?
let libpq_port = shard.listen_pg_port;
let libpq_url =
Some(format!("postgres://no_user@{libpq_host}:{libpq_port}"));
let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr {
let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
} else {
None
};
let pageserver = PageserverShardConnectionInfo {
libpq_url,
grpc_url,
};
anyhow::Ok((shard.shard_id.shard_number.0 as u32, pageserver))
},
))
.await?;
let stripe_size = locate_result.shard_params.stripe_size;
(shards, stripe_size)
};
assert!(!shards.is_empty());
let pageserver_conninfo = PageserverConnectionInfo {
shards: shards.into_iter().collect(),
prefer_grpc: endpoint.grpc,
}; };
pageserver_conninfo.prefer_protocol = prefer_protocol;
let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?; let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) { let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
@@ -1615,7 +1606,6 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
safekeepers, safekeepers,
pageserver_conninfo, pageserver_conninfo,
remote_ext_base_url: remote_ext_base_url.clone(), remote_ext_base_url: remote_ext_base_url.clone(),
shard_stripe_size: stripe_size.0 as usize,
create_test_user: args.create_test_user, create_test_user: args.create_test_user,
start_timeout: args.start_timeout, start_timeout: args.start_timeout,
autoprewarm: args.autoprewarm, autoprewarm: args.autoprewarm,
@@ -1632,66 +1622,45 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
.endpoints .endpoints
.get(endpoint_id.as_str()) .get(endpoint_id.as_str())
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
let shards = if let Some(ps_id) = args.endpoint_pageserver_id {
let prefer_protocol = if endpoint.grpc {
PageserverProtocol::Grpc
} else {
PageserverProtocol::Libpq
};
let mut pageserver_conninfo = if let Some(ps_id) = args.endpoint_pageserver_id {
let conf = env.get_pageserver_conf(ps_id)?; let conf = env.get_pageserver_conf(ps_id)?;
let libpq_url = Some({ let ps_conninfo = pageserver_conf_to_shard_conn_info(conf)?;
let (host, port) = parse_host_port(&conf.listen_pg_addr)?; let shard_info = PageserverShardInfo {
let port = port.unwrap_or(5432); pageservers: vec![ps_conninfo],
format!("postgres://no_user@{host}:{port}")
});
let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
let (host, port) = parse_host_port(grpc_addr)?;
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
Some(format!("grpc://no_user@{host}:{port}"))
} else {
None
};
let pageserver = PageserverShardConnectionInfo {
libpq_url,
grpc_url,
}; };
// If caller is telling us what pageserver to use, this is not a tenant which is // If caller is telling us what pageserver to use, this is not a tenant which is
// fully managed by storage controller, therefore not sharded. // fully managed by storage controller, therefore not sharded.
vec![(0, pageserver)] let shards: HashMap<_, _> = vec![(ShardIndex::unsharded(), shard_info)]
} else {
let storage_controller = StorageController::from_env(env);
storage_controller
.tenant_locate(endpoint.tenant_id)
.await?
.shards
.into_iter() .into_iter()
.map(|shard| { .collect();
// Use gRPC if requested. PageserverConnectionInfo {
let libpq_host = Host::parse(&shard.listen_pg_addr).expect("bad hostname"); shard_count: ShardCount::unsharded(),
let libpq_port = shard.listen_pg_port; stripe_size: None,
let libpq_url = shards,
Some(format!("postgres://no_user@{libpq_host}:{libpq_port}")); prefer_protocol,
}
} else {
// Look up the currently attached location of the tenant, and its striping metadata,
// to pass these on to postgres.
let storage_controller = StorageController::from_env(env);
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr { tenant_locate_response_to_conn_info(&locate_result)?
let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
} else {
None
};
(
shard.shard_id.shard_number.0 as u32,
PageserverShardConnectionInfo {
libpq_url,
grpc_url,
},
)
})
.collect::<Vec<_>>()
};
let pageserver_conninfo = PageserverConnectionInfo {
shards: shards.into_iter().collect(),
prefer_grpc: endpoint.grpc,
}; };
pageserver_conninfo.prefer_protocol = prefer_protocol;
// If --safekeepers argument is given, use only the listed // If --safekeepers argument is given, use only the listed
// safekeeper nodes; otherwise all from the env. // safekeeper nodes; otherwise all from the env.
let safekeepers = parse_safekeepers(&args.safekeepers)?; let safekeepers = parse_safekeepers(&args.safekeepers)?;
endpoint endpoint
.reconfigure(Some(pageserver_conninfo), None, safekeepers, None) .reconfigure(Some(&pageserver_conninfo), safekeepers, None)
.await?; .await?;
} }
EndpointCmd::Stop(args) => { EndpointCmd::Stop(args) => {
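The connection info handed to the endpoint now carries per-shard libpq and gRPC URLs plus a `prefer_protocol` hint instead of the old `prefer_grpc` flag. A hedged sketch of how a consumer could pick a URL from that shape, using simplified stand-in types rather than the real `compute_api::spec` structs (the fallback behavior shown is an assumption of the sketch):

```rust
// Simplified stand-ins for the compute_api::spec types, for illustration only.
#[derive(Clone, Copy)]
enum PageserverProtocol {
    Libpq,
    Grpc,
}

struct ShardConn {
    libpq_url: Option<String>,
    grpc_url: Option<String>,
}

// Pick the URL matching the preferred protocol; falling back to the other kind
// is an assumption of this sketch, not necessarily what the compute does.
fn pick_url(conn: &ShardConn, prefer: PageserverProtocol) -> Option<String> {
    match prefer {
        PageserverProtocol::Grpc => conn.grpc_url.clone().or_else(|| conn.libpq_url.clone()),
        PageserverProtocol::Libpq => conn.libpq_url.clone().or_else(|| conn.grpc_url.clone()),
    }
}

fn main() {
    let shard = ShardConn {
        libpq_url: Some("postgres://no_user@ps1:6400".to_string()),
        grpc_url: None,
    };
    // With gRPC preferred but unavailable, the sketch falls back to libpq.
    assert_eq!(pick_url(&shard, PageserverProtocol::Grpc), shard.libpq_url);
}
```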

View File

@@ -36,7 +36,7 @@ impl StorageBroker {
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> { pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
let broker = &self.env.broker; let broker = &self.env.broker;
print!("Starting neon broker at {}", broker.client_url()); println!("Starting neon broker at {}", broker.client_url());
let mut args = Vec::new(); let mut args = Vec::new();

View File

@@ -32,11 +32,12 @@
//! config.json - passed to `compute_ctl` //! config.json - passed to `compute_ctl`
//! pgdata/ //! pgdata/
//! postgresql.conf - copy of postgresql.conf created by `compute_ctl` //! postgresql.conf - copy of postgresql.conf created by `compute_ctl`
//! zenith.signal //! neon.signal
//! zenith.signal - copy of neon.signal, for backward compatibility
//! <other PostgreSQL files> //! <other PostgreSQL files>
//! ``` //! ```
//! //!
use std::collections::BTreeMap; use std::collections::{BTreeMap, HashMap};
use std::fmt::Display; use std::fmt::Display;
use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream}; use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream};
use std::path::PathBuf; use std::path::PathBuf;
@@ -56,8 +57,8 @@ use compute_api::responses::{
TlsConfig, TlsConfig,
}; };
use compute_api::spec::{ use compute_api::spec::{
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
RemoteExtSpec, Role, PageserverShardInfo, PgIdent, RemoteExtSpec, Role,
}; };
// re-export these, because they're used in the reconfigure() function // re-export these, because they're used in the reconfigure() function
@@ -68,7 +69,6 @@ use jsonwebtoken::jwk::{
OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse, OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
}; };
use nix::sys::signal::{Signal, kill}; use nix::sys::signal::{Signal, kill};
use pageserver_api::shard::ShardStripeSize;
use pem::Pem; use pem::Pem;
use reqwest::header::CONTENT_TYPE; use reqwest::header::CONTENT_TYPE;
use safekeeper_api::PgMajorVersion; use safekeeper_api::PgMajorVersion;
@@ -79,6 +79,10 @@ use spki::der::Decode;
use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef}; use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
use tracing::debug; use tracing::debug;
use utils::id::{NodeId, TenantId, TimelineId}; use utils::id::{NodeId, TenantId, TimelineId};
use utils::shard::{ShardIndex, ShardNumber};
use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT as DEFAULT_PAGESERVER_GRPC_PORT;
use postgres_connection::parse_host_port;
use crate::local_env::LocalEnv; use crate::local_env::LocalEnv;
use crate::postgresql_conf::PostgresConf; use crate::postgresql_conf::PostgresConf;
@@ -101,6 +105,7 @@ pub struct EndpointConf {
features: Vec<ComputeFeature>, features: Vec<ComputeFeature>,
cluster: Option<Cluster>, cluster: Option<Cluster>,
compute_ctl_config: ComputeCtlConfig, compute_ctl_config: ComputeCtlConfig,
privileged_role_name: Option<String>,
} }
// //
@@ -201,6 +206,7 @@ impl ComputeControlPlane {
grpc: bool, grpc: bool,
skip_pg_catalog_updates: bool, skip_pg_catalog_updates: bool,
drop_subscriptions_before_start: bool, drop_subscriptions_before_start: bool,
privileged_role_name: Option<String>,
) -> Result<Arc<Endpoint>> { ) -> Result<Arc<Endpoint>> {
let pg_port = pg_port.unwrap_or_else(|| self.get_port()); let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1); let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1);
@@ -238,6 +244,7 @@ impl ComputeControlPlane {
features: vec![], features: vec![],
cluster: None, cluster: None,
compute_ctl_config: compute_ctl_config.clone(), compute_ctl_config: compute_ctl_config.clone(),
privileged_role_name: privileged_role_name.clone(),
}); });
ep.create_endpoint_dir()?; ep.create_endpoint_dir()?;
@@ -259,6 +266,7 @@ impl ComputeControlPlane {
features: vec![], features: vec![],
cluster: None, cluster: None,
compute_ctl_config, compute_ctl_config,
privileged_role_name,
})?, })?,
)?; )?;
std::fs::write( std::fs::write(
@@ -334,6 +342,9 @@ pub struct Endpoint {
/// The compute_ctl config for the endpoint's compute. /// The compute_ctl config for the endpoint's compute.
compute_ctl_config: ComputeCtlConfig, compute_ctl_config: ComputeCtlConfig,
/// The name of the privileged role for the endpoint.
privileged_role_name: Option<String>,
} }
#[derive(PartialEq, Eq)] #[derive(PartialEq, Eq)]
@@ -384,7 +395,6 @@ pub struct EndpointStartArgs {
pub safekeepers: Vec<NodeId>, pub safekeepers: Vec<NodeId>,
pub pageserver_conninfo: PageserverConnectionInfo, pub pageserver_conninfo: PageserverConnectionInfo,
pub remote_ext_base_url: Option<String>, pub remote_ext_base_url: Option<String>,
pub shard_stripe_size: usize,
pub create_test_user: bool, pub create_test_user: bool,
pub start_timeout: Duration, pub start_timeout: Duration,
pub autoprewarm: bool, pub autoprewarm: bool,
@@ -434,6 +444,7 @@ impl Endpoint {
features: conf.features, features: conf.features,
cluster: conf.cluster, cluster: conf.cluster,
compute_ctl_config: conf.compute_ctl_config, compute_ctl_config: conf.compute_ctl_config,
privileged_role_name: conf.privileged_role_name,
}) })
} }
@@ -466,7 +477,7 @@ impl Endpoint {
conf.append("max_connections", "100"); conf.append("max_connections", "100");
conf.append("wal_level", "logical"); conf.append("wal_level", "logical");
// wal_sender_timeout is the maximum time to wait for WAL replication. // wal_sender_timeout is the maximum time to wait for WAL replication.
// It also defines how often the walreciever will send a feedback message to the wal sender. // It also defines how often the walreceiver will send a feedback message to the wal sender.
conf.append("wal_sender_timeout", "5s"); conf.append("wal_sender_timeout", "5s");
conf.append("listen_addresses", &self.pg_address.ip().to_string()); conf.append("listen_addresses", &self.pg_address.ip().to_string());
conf.append("port", &self.pg_address.port().to_string()); conf.append("port", &self.pg_address.port().to_string());
@@ -715,6 +726,46 @@ impl Endpoint {
remote_extensions = None; remote_extensions = None;
}; };
// For the sake of backwards-compatibility, also fill in 'pageserver_connstring'
//
// XXX: I believe this is not really needed, except to make
// test_forward_compatibility happy.
//
// Use a closure so that we can conveniently return None in the middle of the
// loop.
let pageserver_connstring = (|| {
let num_shards = if args.pageserver_conninfo.shard_count.is_unsharded() {
1
} else {
args.pageserver_conninfo.shard_count.0
};
let mut connstrings = Vec::new();
for shard_no in 0..num_shards {
let shard_index = ShardIndex {
shard_count: args.pageserver_conninfo.shard_count,
shard_number: ShardNumber(shard_no),
};
let shard = args
.pageserver_conninfo
.shards
.get(&shard_index)
.expect(&format!(
"shard {} not found in pageserver_connection_info",
shard_index
));
let pageserver = shard
.pageservers
.first()
.expect("must have at least one pageserver");
if let Some(libpq_url) = &pageserver.libpq_url {
connstrings.push(libpq_url.clone());
} else {
return None;
}
}
Some(connstrings.join(","))
})();
// Create config file // Create config file
let config = { let config = {
let mut spec = ComputeSpec { let mut spec = ComputeSpec {
@@ -759,13 +810,14 @@ impl Endpoint {
branch_id: None, branch_id: None,
endpoint_id: Some(self.endpoint_id.clone()), endpoint_id: Some(self.endpoint_id.clone()),
mode: self.mode, mode: self.mode,
pageserver_connection_info: Some(args.pageserver_conninfo), pageserver_connection_info: Some(args.pageserver_conninfo.clone()),
pageserver_connstring,
safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()), safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()),
safekeeper_connstrings, safekeeper_connstrings,
storage_auth_token: args.auth_token.clone(), storage_auth_token: args.auth_token.clone(),
remote_extensions, remote_extensions,
pgbouncer_settings: None, pgbouncer_settings: None,
shard_stripe_size: Some(args.shard_stripe_size), shard_stripe_size: args.pageserver_conninfo.stripe_size, // redundant with pageserver_connection_info.stripe_size
local_proxy_config: None, local_proxy_config: None,
reconfigure_concurrency: self.reconfigure_concurrency, reconfigure_concurrency: self.reconfigure_concurrency,
drop_subscriptions_before_start: self.drop_subscriptions_before_start, drop_subscriptions_before_start: self.drop_subscriptions_before_start,
@@ -861,6 +913,10 @@ impl Endpoint {
cmd.arg("--dev"); cmd.arg("--dev");
} }
if let Some(privileged_role_name) = self.privileged_role_name.clone() {
cmd.args(["--privileged-role-name", &privileged_role_name]);
}
let child = cmd.spawn()?; let child = cmd.spawn()?;
// set up a scopeguard to kill & wait for the child in case we panic or bail below // set up a scopeguard to kill & wait for the child in case we panic or bail below
let child = scopeguard::guard(child, |mut child| { let child = scopeguard::guard(child, |mut child| {
@@ -914,7 +970,8 @@ impl Endpoint {
ComputeStatus::Empty ComputeStatus::Empty
| ComputeStatus::ConfigurationPending | ComputeStatus::ConfigurationPending
| ComputeStatus::Configuration | ComputeStatus::Configuration
| ComputeStatus::TerminationPending { .. } | ComputeStatus::TerminationPendingFast
| ComputeStatus::TerminationPendingImmediate
| ComputeStatus::Terminated => { | ComputeStatus::Terminated => {
bail!("unexpected compute status: {:?}", state.status) bail!("unexpected compute status: {:?}", state.status)
} }
@@ -972,8 +1029,7 @@ impl Endpoint {
pub async fn reconfigure( pub async fn reconfigure(
&self, &self,
pageserver_conninfo: Option<PageserverConnectionInfo>, pageserver_conninfo: Option<&PageserverConnectionInfo>,
stripe_size: Option<ShardStripeSize>,
safekeepers: Option<Vec<NodeId>>, safekeepers: Option<Vec<NodeId>>,
safekeeper_generation: Option<SafekeeperGeneration>, safekeeper_generation: Option<SafekeeperGeneration>,
) -> Result<()> { ) -> Result<()> {
@@ -995,10 +1051,8 @@ impl Endpoint {
!pageserver_conninfo.shards.is_empty(), !pageserver_conninfo.shards.is_empty(),
"no pageservers provided" "no pageservers provided"
); );
spec.pageserver_connection_info = Some(pageserver_conninfo); spec.pageserver_connection_info = Some(pageserver_conninfo.clone());
} spec.shard_stripe_size = pageserver_conninfo.stripe_size;
if stripe_size.is_some() {
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
} }
// If safekeepers are not specified, don't change them. // If safekeepers are not specified, don't change them.
@@ -1047,11 +1101,9 @@ impl Endpoint {
pub async fn reconfigure_pageservers( pub async fn reconfigure_pageservers(
&self, &self,
pageservers: PageserverConnectionInfo, pageservers: &PageserverConnectionInfo,
stripe_size: Option<ShardStripeSize>,
) -> Result<()> { ) -> Result<()> {
self.reconfigure(Some(pageservers), stripe_size, None, None) self.reconfigure(Some(pageservers), None, None).await
.await
} }
pub async fn reconfigure_safekeepers( pub async fn reconfigure_safekeepers(
@@ -1059,7 +1111,7 @@ impl Endpoint {
safekeepers: Vec<NodeId>, safekeepers: Vec<NodeId>,
generation: SafekeeperGeneration, generation: SafekeeperGeneration,
) -> Result<()> { ) -> Result<()> {
self.reconfigure(None, None, Some(safekeepers), Some(generation)) self.reconfigure(None, Some(safekeepers), Some(generation))
.await .await
} }
@@ -1115,3 +1167,68 @@ impl Endpoint {
) )
} }
} }
pub fn pageserver_conf_to_shard_conn_info(
conf: &crate::local_env::PageServerConf,
) -> Result<PageserverShardConnectionInfo> {
let libpq_url = {
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
let port = port.unwrap_or(5432);
Some(format!("postgres://no_user@{host}:{port}"))
};
let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
let (host, port) = parse_host_port(grpc_addr)?;
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
Some(format!("grpc://no_user@{host}:{port}"))
} else {
None
};
Ok(PageserverShardConnectionInfo {
id: Some(conf.id.to_string()),
libpq_url,
grpc_url,
})
}
pub fn tenant_locate_response_to_conn_info(
response: &pageserver_api::controller_api::TenantLocateResponse,
) -> Result<PageserverConnectionInfo> {
let mut shards = HashMap::new();
for shard in response.shards.iter() {
tracing::info!("parsing {}", shard.listen_pg_addr);
let libpq_url = {
let host = &shard.listen_pg_addr;
let port = shard.listen_pg_port;
Some(format!("postgres://no_user@{host}:{port}"))
};
let grpc_url = if let Some(grpc_addr) = &shard.listen_grpc_addr {
let host = grpc_addr;
let port = shard.listen_grpc_port.expect("no gRPC port");
Some(format!("grpc://no_user@{host}:{port}"))
} else {
None
};
let shard_info = PageserverShardInfo {
pageservers: vec![PageserverShardConnectionInfo {
id: Some(shard.node_id.to_string()),
libpq_url,
grpc_url,
}],
};
shards.insert(shard.shard_id.to_index(), shard_info);
}
let stripe_size = if response.shard_params.count.is_unsharded() {
None
} else {
Some(response.shard_params.stripe_size.0)
};
Ok(PageserverConnectionInfo {
shard_count: response.shard_params.count,
stripe_size,
shards,
prefer_protocol: PageserverProtocol::default(),
})
}
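For backward compatibility the spec still carries a single comma-separated `pageserver_connstring`, derived from the per-shard info built by these helpers. A minimal sketch of that derivation with a simplified shard map keyed by shard number instead of the real `ShardIndex` (hostnames and ports are made up):

```rust
use std::collections::BTreeMap;

// Sketch only: every shard must expose a libpq URL, otherwise the legacy field is omitted.
fn legacy_connstring(shards: &BTreeMap<u32, Option<String>>) -> Option<String> {
    let urls: Option<Vec<String>> = shards.values().cloned().collect();
    urls.map(|u| u.join(","))
}

fn main() {
    let mut shards = BTreeMap::new();
    shards.insert(0u32, Some("postgres://no_user@ps1:6400".to_string()));
    shards.insert(1u32, Some("postgres://no_user@ps2:6400".to_string()));
    assert_eq!(
        legacy_connstring(&shards).as_deref(),
        Some("postgres://no_user@ps1:6400,postgres://no_user@ps2:6400")
    );
}
```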

View File

@@ -217,6 +217,9 @@ pub struct NeonStorageControllerConf {
pub posthog_config: Option<PostHogConfig>, pub posthog_config: Option<PostHogConfig>,
pub kick_secondary_downloads: Option<bool>, pub kick_secondary_downloads: Option<bool>,
#[serde(with = "humantime_serde")]
pub shard_split_request_timeout: Option<Duration>,
} }
impl NeonStorageControllerConf { impl NeonStorageControllerConf {
@@ -250,6 +253,7 @@ impl Default for NeonStorageControllerConf {
timeline_safekeeper_count: None, timeline_safekeeper_count: None,
posthog_config: None, posthog_config: None,
kick_secondary_downloads: None, kick_secondary_downloads: None,
shard_split_request_timeout: None,
} }
} }
} }
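The new `shard_split_request_timeout` field is an optional duration parsed with `humantime_serde`, so config values like `"2m 30s"` work directly, and `humantime::Duration` is used when forwarding it as a CLI flag further down. A small sketch under the assumption that the field sits in a TOML section shaped like the struct below (the key name follows this diff; the surrounding layout and the `default` attribute are illustrative):

```rust
use std::time::Duration;

use serde::Deserialize;

#[derive(Deserialize)]
struct StorageControllerConf {
    // Same attribute as on NeonStorageControllerConf; `default` lets the key be omitted.
    #[serde(with = "humantime_serde", default)]
    shard_split_request_timeout: Option<Duration>,
}

fn main() {
    let conf: StorageControllerConf =
        toml::from_str(r#"shard_split_request_timeout = "2m 30s""#).unwrap();
    assert_eq!(conf.shard_split_request_timeout, Some(Duration::from_secs(150)));

    // Formatting it back the way the start-up code does when building the CLI argument.
    let flag = format!(
        "--shard-split-request-timeout={}",
        humantime::Duration::from(conf.shard_split_request_timeout.unwrap())
    );
    println!("{flag}"); // --shard-split-request-timeout=2m 30s
}
```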

View File

@@ -303,7 +303,7 @@ impl PageServerNode {
async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> { async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
// TODO: using a thread here because start_process() is not async but we need to call check_status() // TODO: using a thread here because start_process() is not async but we need to call check_status()
let datadir = self.repo_path(); let datadir = self.repo_path();
print!( println!(
"Starting pageserver node {} at '{}' in {:?}, retrying for {:?}", "Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
self.conf.id, self.conf.id,
self.pg_connection_config.raw_address(), self.pg_connection_config.raw_address(),
@@ -452,6 +452,12 @@ impl PageServerNode {
.map(|x| x.parse::<usize>()) .map(|x| x.parse::<usize>())
.transpose() .transpose()
.context("Failed to parse 'image_creation_threshold' as non zero integer")?, .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
// HADRON
image_layer_force_creation_period: settings
.remove("image_layer_force_creation_period")
.map(humantime::parse_duration)
.transpose()
.context("Failed to parse 'image_layer_force_creation_period' as duration")?,
image_layer_creation_check_threshold: settings image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold") .remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>()) .map(|x| x.parse::<u8>())

View File

@@ -127,7 +127,7 @@ impl SafekeeperNode {
extra_opts: &[String], extra_opts: &[String],
retry_timeout: &Duration, retry_timeout: &Duration,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
print!( println!(
"Starting safekeeper at '{}' in '{}', retrying for {:?}", "Starting safekeeper at '{}' in '{}', retrying for {:?}",
self.pg_connection_config.raw_address(), self.pg_connection_config.raw_address(),
self.datadir_path().display(), self.datadir_path().display(),

View File

@@ -648,6 +648,13 @@ impl StorageController {
args.push(format!("--timeline-safekeeper-count={sk_cnt}")); args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
} }
if let Some(duration) = self.config.shard_split_request_timeout {
args.push(format!(
"--shard-split-request-timeout={}",
humantime::Duration::from(duration)
));
}
let mut envs = vec![ let mut envs = vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
@@ -660,7 +667,7 @@ impl StorageController {
)); ));
} }
println!("Starting storage controller"); println!("Starting storage controller at {scheme}://{host}:{listen_port}");
background_process::start_process( background_process::start_process(
COMMAND, COMMAND,

View File

@@ -14,6 +14,7 @@ humantime.workspace = true
pageserver_api.workspace = true pageserver_api.workspace = true
pageserver_client.workspace = true pageserver_client.workspace = true
reqwest.workspace = true reqwest.workspace = true
safekeeper_api.workspace=true
serde_json = { workspace = true, features = ["raw_value"] } serde_json = { workspace = true, features = ["raw_value"] }
storage_controller_client.workspace = true storage_controller_client.workspace = true
tokio.workspace = true tokio.workspace = true

View File

@@ -11,7 +11,7 @@ use pageserver_api::controller_api::{
PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest,
ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
TenantShardMigrateRequest, TenantShardMigrateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse, TimelineSafekeeperMigrateRequest,
}; };
use pageserver_api::models::{ use pageserver_api::models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig, EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig,
@@ -21,6 +21,7 @@ use pageserver_api::models::{
use pageserver_api::shard::{ShardStripeSize, TenantShardId}; use pageserver_api::shard::{ShardStripeSize, TenantShardId};
use pageserver_client::mgmt_api::{self}; use pageserver_client::mgmt_api::{self};
use reqwest::{Certificate, Method, StatusCode, Url}; use reqwest::{Certificate, Method, StatusCode, Url};
use safekeeper_api::models::TimelineLocateResponse;
use storage_controller_client::control_api::Client; use storage_controller_client::control_api::Client;
use utils::id::{NodeId, TenantId, TimelineId}; use utils::id::{NodeId, TenantId, TimelineId};
@@ -75,6 +76,12 @@ enum Command {
NodeStartDelete { NodeStartDelete {
#[arg(long)] #[arg(long)]
node_id: NodeId, node_id: NodeId,
/// When `force` is true, skip waiting for shards to prewarm during migration.
/// This can significantly speed up node deletion since prewarming all shards
/// can take considerable time, but may result in slower initial access to
/// migrated shards until they warm up naturally.
#[arg(long)]
force: bool,
}, },
/// Cancel deletion of the specified pageserver and wait for `timeout` /// Cancel deletion of the specified pageserver and wait for `timeout`
/// for the operation to be canceled. May be retried. /// for the operation to be canceled. May be retried.
@@ -279,6 +286,23 @@ enum Command {
#[arg(long)] #[arg(long)]
concurrency: Option<usize>, concurrency: Option<usize>,
}, },
/// Locate safekeepers for a timeline from the storcon DB.
TimelineLocate {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
timeline_id: TimelineId,
},
/// Migrate a timeline to a new set of safekeepers
TimelineSafekeeperMigrate {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
timeline_id: TimelineId,
/// Example: --new-sk-set 1,2,3
#[arg(long, required = true, value_delimiter = ',')]
new_sk_set: Vec<NodeId>,
},
} }
#[derive(Parser)] #[derive(Parser)]
@@ -458,6 +482,7 @@ async fn main() -> anyhow::Result<()> {
listen_http_port, listen_http_port,
listen_https_port, listen_https_port,
availability_zone_id: AvailabilityZone(availability_zone_id), availability_zone_id: AvailabilityZone(availability_zone_id),
node_ip_addr: None,
}), }),
) )
.await?; .await?;
@@ -933,13 +958,14 @@ async fn main() -> anyhow::Result<()> {
.dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None) .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
.await?; .await?;
} }
Command::NodeStartDelete { node_id } => { Command::NodeStartDelete { node_id, force } => {
let query = if force {
format!("control/v1/node/{node_id}/delete?force=true")
} else {
format!("control/v1/node/{node_id}/delete")
};
storcon_client storcon_client
.dispatch::<(), ()>( .dispatch::<(), ()>(Method::PUT, query, None)
Method::PUT,
format!("control/v1/node/{node_id}/delete"),
None,
)
.await?; .await?;
println!("Delete started for {node_id}"); println!("Delete started for {node_id}");
} }
@@ -1324,7 +1350,7 @@ async fn main() -> anyhow::Result<()> {
concurrency, concurrency,
} => { } => {
let mut path = format!( let mut path = format!(
"/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", "v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
); );
if let Some(c) = concurrency { if let Some(c) = concurrency {
@@ -1335,6 +1361,41 @@ async fn main() -> anyhow::Result<()> {
.dispatch::<(), ()>(Method::POST, path, None) .dispatch::<(), ()>(Method::POST, path, None)
.await?; .await?;
} }
Command::TimelineLocate {
tenant_id,
timeline_id,
} => {
let path = format!("debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate");
let resp = storcon_client
.dispatch::<(), TimelineLocateResponse>(Method::GET, path, None)
.await?;
let sk_set = resp.sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
let new_sk_set = resp
.new_sk_set
.as_ref()
.map(|ids| ids.iter().map(|id| id.0 as i64).collect::<Vec<_>>());
println!("generation = {}", resp.generation);
println!("sk_set = {sk_set:?}");
println!("new_sk_set = {new_sk_set:?}");
}
Command::TimelineSafekeeperMigrate {
tenant_id,
timeline_id,
new_sk_set,
} => {
let path = format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate");
storcon_client
.dispatch::<_, ()>(
Method::POST,
path,
Some(TimelineSafekeeperMigrateRequest { new_sk_set }),
)
.await?;
}
} }
Ok(()) Ok(())

View File

@@ -54,14 +54,16 @@ else
printf '%s\n' "${result}" | jq . printf '%s\n' "${result}" | jq .
fi fi
echo "Check if a timeline present" if [[ "${RUN_PARALLEL:-false}" != "true" ]]; then
PARAMS=( echo "Check if a timeline present"
-X GET PARAMS=(
-H "Content-Type: application/json" -X GET
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline" -H "Content-Type: application/json"
) "http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id) )
if [[ -z "${timeline_id}" || "${timeline_id}" = null ]]; then timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
fi
if [[ -z "${timeline_id:-}" || "${timeline_id:-}" = null ]]; then
generate_id timeline_id generate_id timeline_id
PARAMS=( PARAMS=(
-sbf -sbf

View File

@@ -142,7 +142,7 @@ services:
- "storage_broker" - "storage_broker"
- "--listen-addr=0.0.0.0:50051" - "--listen-addr=0.0.0.0:50051"
compute: compute1:
restart: always restart: always
build: build:
context: ./compute_wrapper/ context: ./compute_wrapper/
@@ -152,6 +152,7 @@ services:
- TAG=${COMPUTE_TAG:-${TAG:-latest}} - TAG=${COMPUTE_TAG:-${TAG:-latest}}
- http_proxy=${http_proxy:-} - http_proxy=${http_proxy:-}
- https_proxy=${https_proxy:-} - https_proxy=${https_proxy:-}
image: built-compute
environment: environment:
- PG_VERSION=${PG_VERSION:-16} - PG_VERSION=${PG_VERSION:-16}
- TENANT_ID=${TENANT_ID:-} - TENANT_ID=${TENANT_ID:-}
@@ -166,6 +167,11 @@ services:
- 3080:3080 # http endpoints - 3080:3080 # http endpoints
entrypoint: entrypoint:
- "/shell/compute.sh" - "/shell/compute.sh"
# Add a "compute" alias for compute1 for backward compatibility
networks:
default:
aliases:
- compute
depends_on: depends_on:
- safekeeper1 - safekeeper1
- safekeeper2 - safekeeper2
@@ -174,15 +180,20 @@ services:
compute_is_ready: compute_is_ready:
image: postgres:latest image: postgres:latest
environment:
- PARALLEL_COMPUTES=1
entrypoint: entrypoint:
- "/bin/bash" - "/bin/sh"
- "-c" - "-c"
command: command:
- "until pg_isready -h compute -p 55433 -U cloud_admin ; do - "for i in $(seq 1 $${PARALLEL_COMPUTES}); do
echo 'Waiting to start compute...' && sleep 1; until pg_isready -h compute$$i -p 55433 -U cloud_admin ; do
done" sleep 1;
done;
done;
echo All computes are started"
depends_on: depends_on:
- compute - compute1
neon-test-extensions: neon-test-extensions:
profiles: ["test-extensions"] profiles: ["test-extensions"]
@@ -196,4 +207,4 @@ services:
command: command:
- sleep 3600 - sleep 3600
depends_on: depends_on:
- compute - compute1

View File

@@ -1,4 +1,4 @@
#!/bin/bash #!/usr/bin/env bash
# A basic test to ensure Docker images are built correctly. # A basic test to ensure Docker images are built correctly.
# Build a wrapper around the compute, start all services and runs a simple SQL query. # Build a wrapper around the compute, start all services and runs a simple SQL query.
@@ -13,9 +13,36 @@
# #
set -eux -o pipefail set -eux -o pipefail
cd "$(dirname "${0}")"
export COMPOSE_FILE='docker-compose.yml' export COMPOSE_FILE='docker-compose.yml'
export COMPOSE_PROFILES=test-extensions export COMPOSE_PROFILES=test-extensions
cd "$(dirname "${0}")" export PARALLEL_COMPUTES=${PARALLEL_COMPUTES:-1}
READY_MESSAGE="All computes are started"
COMPUTES=()
for i in $(seq 1 "${PARALLEL_COMPUTES}"); do
COMPUTES+=("compute${i}")
done
CURRENT_TMPDIR=$(mktemp -d)
trap 'rm -rf ${CURRENT_TMPDIR} docker-compose-parallel.yml' EXIT
if [[ ${PARALLEL_COMPUTES} -gt 1 ]]; then
export COMPOSE_FILE=docker-compose-parallel.yml
cp docker-compose.yml docker-compose-parallel.yml
# Replace the environment variable PARALLEL_COMPUTES with the actual value
yq eval -i ".services.compute_is_ready.environment |= map(select(. | test(\"^PARALLEL_COMPUTES=\") | not)) + [\"PARALLEL_COMPUTES=${PARALLEL_COMPUTES}\"]" ${COMPOSE_FILE}
for i in $(seq 2 "${PARALLEL_COMPUTES}"); do
# Duplicate compute1 as compute${i} for parallel execution
yq eval -i ".services.compute${i} = .services.compute1" ${COMPOSE_FILE}
# We don't need these sections, so delete them
yq eval -i "(del .services.compute${i}.build) | (del .services.compute${i}.ports) | (del .services.compute${i}.networks)" ${COMPOSE_FILE}
# Make compute1 the only dependency
yq eval -i ".services.compute${i}.depends_on = [\"compute1\"]" ${COMPOSE_FILE}
# Set RUN_PARALLEL=true for the generated computes so they create their own tenant_id and timeline_id instead of reusing another compute's
yq eval -i ".services.compute${i}.environment += [\"RUN_PARALLEL=true\"]" ${COMPOSE_FILE}
# Remove TENANT_ID and TIMELINE_ID from the environment variables of the generated computes
# They will create new TENANT_ID and TIMELINE_ID anyway.
yq eval -i ".services.compute${i}.environment |= map(select(. | (test(\"^TENANT_ID=\") or test(\"^TIMELINE_ID=\")) | not))" ${COMPOSE_FILE}
done
fi
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres" PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"
function cleanup() { function cleanup() {
@@ -27,11 +54,11 @@ function cleanup() {
for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
pg_version=${pg_version/v/} pg_version=${pg_version/v/}
echo "clean up containers if exists" echo "clean up containers if exist"
cleanup cleanup
PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version)) PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version))
PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull --build -d PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose build compute1
PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull -d
echo "wait until the compute is ready. timeout after 60s. " echo "wait until the compute is ready. timeout after 60s. "
cnt=0 cnt=0
while sleep 3; do while sleep 3; do
@@ -41,45 +68,50 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
echo "timeout before the compute is ready." echo "timeout before the compute is ready."
exit 1 exit 1
fi fi
if docker compose logs "compute_is_ready" | grep -q "accepting connections"; then if docker compose logs compute_is_ready | grep -q "${READY_MESSAGE}"; then
echo "OK. The compute is ready to connect." echo "OK. The compute is ready to connect."
echo "execute simple queries." echo "execute simple queries."
docker compose exec compute /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'" for compute in "${COMPUTES[@]}"; do
docker compose exec "${compute}" /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'"
done
break break
fi fi
done done
if [[ ${pg_version} -ge 16 ]]; then if [[ ${pg_version} -ge 16 ]]; then
# This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail mkdir "${CURRENT_TMPDIR}"/{pg_hint_plan-src,file_fdw,postgis-src}
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${CURRENT_TMPDIR}/postgis-src/test"
echo Adding dummy config docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${CURRENT_TMPDIR}/postgis-src/00-regress-install"
docker compose exec compute touch /var/db/postgres/compute/compute_ctl_temp_override.conf docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${CURRENT_TMPDIR}/pg_hint_plan-src/data"
# Prepare for the PostGIS test docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${CURRENT_TMPDIR}/file_fdw/data"
docker compose exec compute mkdir -p /tmp/pgis_reg/pgis_reg_tmp
TMPDIR=$(mktemp -d) for compute in "${COMPUTES[@]}"; do
docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${TMPDIR}" # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${TMPDIR}" # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
docker compose exec compute mkdir -p /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install echo Adding dummy config on "${compute}"
docker compose cp "${TMPDIR}/test" compute:/ext-src/postgis-src/raster/test docker compose exec "${compute}" touch /var/db/postgres/compute/compute_ctl_temp_override.conf
docker compose cp "${TMPDIR}/00-regress-install" compute:/ext-src/postgis-src/regress # Prepare for the PostGIS test
rm -rf "${TMPDIR}" docker compose exec "${compute}" mkdir -p /tmp/pgis_reg/pgis_reg_tmp /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install
# The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment docker compose cp "${CURRENT_TMPDIR}/postgis-src/test" "${compute}":/ext-src/postgis-src/raster/test
TMPDIR=$(mktemp -d) docker compose cp "${CURRENT_TMPDIR}/postgis-src/00-regress-install" "${compute}":/ext-src/postgis-src/regress
docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${TMPDIR}/data" # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
docker compose cp "${TMPDIR}/data" compute:/ext-src/pg_hint_plan-src/ docker compose cp "${CURRENT_TMPDIR}/pg_hint_plan-src/data" "${compute}":/ext-src/pg_hint_plan-src/
rm -rf "${TMPDIR}" # The following block does the same for the contrib/file_fdw test
# The following block does the same for the contrib/file_fdw test docker compose cp "${CURRENT_TMPDIR}/file_fdw/data" "${compute}":/postgres/contrib/file_fdw/data
TMPDIR=$(mktemp -d) done
docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${TMPDIR}/data"
docker compose cp "${TMPDIR}/data" compute:/postgres/contrib/file_fdw/data
rm -rf "${TMPDIR}"
# Apply patches # Apply patches
docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch" docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch"
# We are running tests now # We are running tests now
rm -f testout.txt testout_contrib.txt rm -f testout.txt testout_contrib.txt
# We want to run the longest tests first to better utilize parallelization and reduce overall test time.
# Tests listed in the RUN_FIRST variable will be run before others.
# If parallelization is not used, this environment variable will be ignored.
docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \ docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
-e RUN_FIRST=hll-src,postgis-src,pgtap-src -e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \
neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0 neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \ docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
-e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \
neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0 neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
if [[ ${EXT_SUCCESS} -eq 0 || ${CONTRIB_SUCCESS} -eq 0 ]]; then if [[ ${EXT_SUCCESS} -eq 0 || ${CONTRIB_SUCCESS} -eq 0 ]]; then
CONTRIB_FAILED= CONTRIB_FAILED=


@@ -1,4 +1,4 @@
#!/bin/bash #!/usr/bin/env bash
set -x set -x
if [[ -v BENCHMARK_CONNSTR ]]; then if [[ -v BENCHMARK_CONNSTR ]]; then
@@ -26,8 +26,9 @@ if [[ -v BENCHMARK_CONNSTR ]]; then
fi fi
fi fi
REGULAR_USER=false REGULAR_USER=false
while getopts r arg; do PARALLEL_COMPUTES=${PARALLEL_COMPUTES:-1}
case $arg in while getopts pr arg; do
case ${arg} in
r) r)
REGULAR_USER=true REGULAR_USER=true
shift $((OPTIND-1)) shift $((OPTIND-1))
@@ -41,26 +42,49 @@ extdir=${1}
cd "${extdir}" || exit 2 cd "${extdir}" || exit 2
FAILED= FAILED=
LIST=$( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u) export FAILED_FILE=/tmp/failed
for d in ${LIST}; do rm -f ${FAILED_FILE}
[ -d "${d}" ] || continue mapfile -t LIST < <( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u)
if ! psql -w -c "select 1" >/dev/null; then if [[ ${PARALLEL_COMPUTES} -gt 1 ]]; then
FAILED="${d} ${FAILED}" # Avoid errors if RUN_FIRST is not defined
break RUN_FIRST=${RUN_FIRST:-}
fi # Move entries listed in the RUN_FIRST variable to the beginning
if [[ ${REGULAR_USER} = true ]] && [ -f "${d}"/regular-test.sh ]; then ORDERED_LIST=$(printf "%s\n" "${LIST[@]}" | grep -x -Ff <(echo -e "${RUN_FIRST//,/$'\n'}"); printf "%s\n" "${LIST[@]}" | grep -vx -Ff <(echo -e "${RUN_FIRST//,/$'\n'}"))
"${d}/regular-test.sh" || FAILED="${d} ${FAILED}" parallel -j"${PARALLEL_COMPUTES}" "[[ -d {} ]] || exit 0
continue export PGHOST=compute{%}
fi if ! psql -c 'select 1'>/dev/null; then
exit 1
fi
echo Running on \${PGHOST}
if [[ -f ${extdir}/{}/neon-test.sh ]]; then
echo Running from script
${extdir}/{}/neon-test.sh || echo {} >> ${FAILED_FILE};
else
echo Running using make;
USE_PGXS=1 make -C {} installcheck || echo {} >> ${FAILED_FILE};
fi" ::: ${ORDERED_LIST}
[[ ! -f ${FAILED_FILE} ]] && exit 0
else
for d in "${LIST[@]}"; do
[ -d "${d}" ] || continue
if ! psql -w -c "select 1" >/dev/null; then
FAILED="${d} ${FAILED}"
break
fi
if [[ ${REGULAR_USER} = true ]] && [ -f "${d}"/regular-test.sh ]; then
"${d}/regular-test.sh" || FAILED="${d} ${FAILED}"
continue
fi
if [ -f "${d}/neon-test.sh" ]; then if [ -f "${d}/neon-test.sh" ]; then
"${d}/neon-test.sh" || FAILED="${d} ${FAILED}" "${d}/neon-test.sh" || FAILED="${d} ${FAILED}"
else else
USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
fi fi
done done
[ -z "${FAILED}" ] && exit 0 [[ -z ${FAILED} ]] && exit 0
for d in ${FAILED}; do fi
for d in ${FAILED} $([[ ! -f ${FAILED_FILE} ]] || cat ${FAILED_FILE}); do
cat "$(find $d -name regression.diffs)" cat "$(find $d -name regression.diffs)"
done done
for postgis_diff in /tmp/pgis_reg/*_diff; do for postgis_diff in /tmp/pgis_reg/*_diff; do
@@ -68,4 +92,5 @@ for postgis_diff in /tmp/pgis_reg/*_diff; do
cat "${postgis_diff}" cat "${postgis_diff}"
done done
echo "${FAILED}" echo "${FAILED}"
cat ${FAILED_FILE}
exit 1 exit 1


@@ -1,4 +1,4 @@
#!/bin/bash #!/usr/bin/env bash
set -eux -o pipefail set -eux -o pipefail
cd "$(dirname "${0}")" cd "$(dirname "${0}")"
# Takes a variable name as argument. The result is stored in that variable. # Takes a variable name as argument. The result is stored in that variable.
@@ -60,8 +60,8 @@ function check_timeline() {
# Restarts the compute node with the required compute tag and timeline. # Restarts the compute node with the required compute tag and timeline.
# Accepts the tag for the compute node and the timeline as parameters. # Accepts the tag for the compute node and the timeline as parameters.
function restart_compute() { function restart_compute() {
docker compose down compute compute_is_ready docker compose down compute1 compute_is_ready
COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute1 compute_is_ready
wait_for_ready wait_for_ready
check_timeline ${2} check_timeline ${2}
} }


@@ -129,9 +129,10 @@ segment to bootstrap the WAL writing, but it doesn't contain the checkpoint reco
changes in xlog.c, to allow starting the compute node without reading the last checkpoint record changes in xlog.c, to allow starting the compute node without reading the last checkpoint record
from WAL. from WAL.
This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start This includes code to read the `neon.signal` (also `zenith.signal`) file, which tells the startup
at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last code the LSN to start at. When the `neon.signal` file is present, the startup uses that LSN
checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo. instead of the last checkpoint's LSN. The system is known to be consistent at that LSN, without
any WAL redo.
### How to get rid of the patch ### How to get rid of the patch


@@ -75,7 +75,7 @@ CLI examples:
* AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"` * AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"`
For Amazon AWS S3, the key id and secret access key can be found in `~/.aws/credentials` (if awscli was ever configured to work with the desired bucket) or on the AWS Settings page for a certain user. Also note that bucket names do not contain any protocols when used on AWS.
For local S3 installations, refer to the their documentation for name format and credentials. For local S3 installations, refer to their documentation for name format and credentials.
Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets. Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets.
Required sections are: Required sections are:


@@ -20,7 +20,7 @@ In our case consensus leader is compute (walproposer), and we don't want to wake
up all computes for the change. Neither we want to fully reimplement the leader up all computes for the change. Neither we want to fully reimplement the leader
logic second time outside compute. Because of that the proposed algorithm relies logic second time outside compute. Because of that the proposed algorithm relies
for issuing configurations on the external fault tolerant (distributed) strongly for issuing configurations on the external fault tolerant (distributed) strongly
consisent storage with simple API: CAS (compare-and-swap) on the single key. consistent storage with simple API: CAS (compare-and-swap) on the single key.
Properly configured postgres suits this. Properly configured postgres suits this.
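The single-key CAS is the only primitive the algorithm needs from that storage. A minimal in-memory sketch of such a store (names and types here are illustrative, not the storage_controller implementation):
```rust
use std::collections::HashMap;

/// Toy, in-memory stand-in for the configuration storage: CAS on a single
/// per-timeline key is the only primitive the algorithm needs.
#[derive(Clone, Debug)]
struct Conf {
    generation: u32,
    sk_set: Vec<u64>,
    new_sk_set: Option<Vec<u64>>,
}

#[derive(Default)]
struct ConfStore {
    confs: HashMap<String, Conf>, // key: "tenant_id/timeline_id"
}

impl ConfStore {
    /// Store `new` only if the current generation still equals `expected`;
    /// on conflict, return the winning conf so the caller can re-plan.
    fn cas(&mut self, key: &str, expected: u32, new: Conf) -> Result<(), Conf> {
        let cur = self.confs.entry(key.to_owned()).or_insert(Conf {
            generation: 0,
            sk_set: vec![],
            new_sk_set: None,
        });
        if cur.generation == expected {
            *cur = new;
            Ok(())
        } else {
            Err(cur.clone())
        }
    }
}
```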
In the system consensus is implemented at the timeline level, so algorithm below In the system consensus is implemented at the timeline level, so algorithm below
@@ -34,7 +34,7 @@ A configuration is
``` ```
struct Configuration { struct Configuration {
generation: Generation, // a number uniquely identifying configuration generation: SafekeeperGeneration, // a number uniquely identifying configuration
sk_set: Vec<NodeId>, // current safekeeper set sk_set: Vec<NodeId>, // current safekeeper set
new_sk_set: Optional<Vec<NodeId>>, new_sk_set: Optional<Vec<NodeId>>,
} }
@@ -81,11 +81,11 @@ configuration generation in them is less than its current one. Namely, it
refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
response it sends its current configuration generation to let walproposer know. response it sends its current configuration generation to let walproposer know.
Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/membership`
accepting `Configuration`. Safekeeper switches to the given conf it is higher than its accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its
current one and ignores it otherwise. In any case it replies with current one and ignores it otherwise. In any case it replies with
``` ```
struct ConfigurationSwitchResponse { struct TimelineMembershipSwitchResponse {
conf: Configuration, conf: Configuration,
term: Term, term: Term,
last_log_term: Term, last_log_term: Term,
@@ -108,7 +108,7 @@ establishes this configuration as its own and moves to voting.
It should stop talking to safekeepers not listed in the configuration at this It should stop talking to safekeepers not listed in the configuration at this
point, though it is not unsafe to continue doing so. point, though it is not unsafe to continue doing so.
To be elected it must receive votes from both majorites if `new_sk_set` is present. To be elected it must receive votes from both majorities if `new_sk_set` is present.
Similarly, to commit WAL it must receive flush acknowledge from both majorities. Similarly, to commit WAL it must receive flush acknowledge from both majorities.
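A minimal sketch of the "both majorities" rule, with `NodeId` simplified to `u64` (illustrative only):
```rust
/// True if a majority of `set` appears in `acks`.
fn is_majority(set: &[u64], acks: &[u64]) -> bool {
    2 * set.iter().filter(|id| acks.contains(id)).count() > set.len()
}

/// A vote or a WAL flush counts only if acknowledged by a majority of `sk_set`
/// and, while a joint configuration is in effect, also by a majority of `new_sk_set`.
fn acknowledged_by_quorum(sk_set: &[u64], new_sk_set: Option<&[u64]>, acks: &[u64]) -> bool {
    is_majority(sk_set, acks) && new_sk_set.map_or(true, |new| is_majority(new, acks))
}
```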
If walproposer hears from safekeeper configuration higher than his own (i.e. If walproposer hears from safekeeper configuration higher than his own (i.e.
@@ -130,7 +130,7 @@ storage are reachable.
1) Fetch current timeline configuration from the configuration storage. 1) Fetch current timeline configuration from the configuration storage.
2) If it is already joint one and `new_set` is different from `desired_set` 2) If it is already joint one and `new_set` is different from `desired_set`
refuse to change. However, assign join conf to (in memory) var refuse to change. However, assign join conf to (in memory) var
`join_conf` and proceed to step 4 to finish the ongoing change. `joint_conf` and proceed to step 4 to finish the ongoing change.
3) Else, create joint `joint_conf: Configuration`: increment current conf number 3) Else, create joint `joint_conf: Configuration`: increment current conf number
`n` and put `desired_set` to `new_sk_set`. Persist it in the configuration `n` and put `desired_set` to `new_sk_set`. Persist it in the configuration
storage by doing CAS on the current generation: change happens only if storage by doing CAS on the current generation: change happens only if
@@ -161,11 +161,11 @@ storage are reachable.
because `pull_timeline` already includes it and plus additionally would be because `pull_timeline` already includes it and plus additionally would be
broadcast by compute. More importantly, we may proceed to the next step broadcast by compute. More importantly, we may proceed to the next step
only when `<last_log_term, flush_lsn>` on the majority of the new set reached only when `<last_log_term, flush_lsn>` on the majority of the new set reached
`sync_position`. Similarly, on the happy path no waiting is not needed because `sync_position`. Similarly, on the happy path no waiting is needed because
`pull_timeline` already includes it. However, we should double `pull_timeline` already includes it. However, we should double
check to be safe. For example, timeline could have been created earlier e.g. check to be safe. For example, timeline could have been created earlier e.g.
manually or after try-to-migrate, abort, try-to-migrate-again sequence. manually or after try-to-migrate, abort, try-to-migrate-again sequence.
7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new 7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
storage under one more CAS. storage under one more CAS.
8) Call `PUT` `configuration` on safekeepers from the new set, 8) Call `PUT` `configuration` on safekeepers from the new set,
@@ -178,12 +178,12 @@ spec of it.
The description above focuses on safety. To make the flow practical and live, here are a few more
considerations.
1) It makes sense to ping new set to ensure it we are migrating to live node(s) before 1) It makes sense to ping new set to ensure we are migrating to live node(s) before
step 3. step 3.
2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed 2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed
it is safe to rollback to the old conf with one more CAS. it is safe to rollback to the old conf with one more CAS.
3) On step 4 timeline might be already created on members of the new set for various reasons; 3) On step 4 timeline might be already created on members of the new set for various reasons;
the simplest is the procedure restart. There are more complicated scenarious like mentioned the simplest is the procedure restart. There are more complicated scenarios like mentioned
in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
generations, so seems simpler to treat existing timeline as success. However, this also generations, so seems simpler to treat existing timeline as success. However, this also
has a disadvantage: you might imagine a surpassingly unlikely schedule where the condition in
@@ -192,7 +192,7 @@ considerations.
4) In the end timeline should be locally deleted on the safekeeper(s) which are 4) In the end timeline should be locally deleted on the safekeeper(s) which are
in the old set but not in the new one, unless they are unreachable. To be in the old set but not in the new one, unless they are unreachable. To be
safe this also should be done under generation number (deletion proceeds only if safe this also should be done under generation number (deletion proceeds only if
current configuration is <= than one in request and safekeeper is not memeber of it). current configuration is <= than one in request and safekeeper is not member of it).
5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`, 5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
jump to step 7, using it as `new_conf`. jump to step 7, using it as `new_conf`.
@@ -261,14 +261,14 @@ Timeline (branch) creation in cplane should call storage_controller POST
Response should be augmented with `safekeepers_generation` and `safekeepers` Response should be augmented with `safekeepers_generation` and `safekeepers`
fields like described in `/notify-safekeepers` above. Initially (currently) fields like described in `/notify-safekeepers` above. Initially (currently)
these fields may be absent; in this case cplane chooses safekeepers on its own these fields may be absent; in this case cplane chooses safekeepers on its own
like it currently does. The call should be retried until succeeds. like it currently does. The call should be retried until it succeeds.
Timeline deletion and tenant deletion in cplane should call appropriate Timeline deletion and tenant deletion in cplane should call appropriate
storage_controller endpoints like it currently does for sharded tenants. The storage_controller endpoints like it currently does for sharded tenants. The
calls should be retried until they succeed. calls should be retried until they succeed.
When compute receives safekeepers list from control plane it needs to know the When compute receives safekeeper list from control plane it needs to know the
generation to checked whether it should be updated (note that compute may get generation to check whether it should be updated (note that compute may get
safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers` safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers`
GUC is just a comma-separated list of `host:port`. Let's prefix it with
`g#<generation>:` to this end, so it will look like
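The example value is truncated in the diff here; assuming the shape `g#<generation>:host1:port1,host2:port2`, a hedged parsing sketch on the compute side could be:
```rust
/// Parse `neon.safekeepers`, returning the generation (if prefixed) and the
/// `host:port` list. The `g#<generation>:` prefix shape is an assumption based
/// on the text above.
fn parse_safekeepers_guc(value: &str) -> (Option<u32>, Vec<String>) {
    if let Some(rest) = value.strip_prefix("g#") {
        if let Some((generation, list)) = rest.split_once(':') {
            if let Ok(generation) = generation.parse::<u32>() {
                return (Some(generation), list.split(',').map(str::to_owned).collect());
            }
        }
    }
    // Old format without a generation prefix.
    (None, value.split(',').map(str::to_owned).collect())
}
```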
@@ -305,8 +305,8 @@ enum MigrationRequest {
``` ```
`FinishPending` requests to run the procedure to ensure state is clean: current `FinishPending` requests to run the procedure to ensure state is clean: current
configuration is not joint and majority of safekeepers are aware of it, but do configuration is not joint and the majority of safekeepers are aware of it, but do
not attempt to migrate anywhere. If current configuration fetched on step 1 is not attempt to migrate anywhere. If the current configuration fetched on step 1 is
not joint it jumps to step 7. It should be run at startup for all timelines (but not joint it jumps to step 7. It should be run at startup for all timelines (but
similarly, in the first version it is ok to trigger it manually). similarly, in the first version it is ok to trigger it manually).
@@ -315,7 +315,7 @@ similarly, in the first version it is ok to trigger it manually).
`safekeepers` table mirroring current `nodes` should be added, except that for `safekeepers` table mirroring current `nodes` should be added, except that for
`scheduling_policy`: it is enough to have at least in the beginning only 3 `scheduling_policy`: it is enough to have at least in the beginning only 3
fields: 1) `active` 2) `paused` (initially means only not assign new tlis there fields: 1) `active` 2) `paused` (initially means only not assign new tlis there
3) `decomissioned` (node is removed). 3) `decommissioned` (node is removed).
`timelines` table: `timelines` table:
``` ```
@@ -326,9 +326,10 @@ table! {
tenant_id -> Varchar, tenant_id -> Varchar,
start_lsn -> pg_lsn, start_lsn -> pg_lsn,
generation -> Int4, generation -> Int4,
sk_set -> Array<Int4>, // list of safekeeper ids sk_set -> Array<Int8>, // list of safekeeper ids
new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf
cplane_notified_generation -> Int4, cplane_notified_generation -> Int4,
sk_set_notified_generation -> Int4, // the generation a quorum of sk_set knows about
deleted_at -> Nullable<Timestamptz>, deleted_at -> Nullable<Timestamptz>,
} }
} }
@@ -338,13 +339,23 @@ table! {
might also want to add ancestor_timeline_id to preserve the hierarchy, but for might also want to add ancestor_timeline_id to preserve the hierarchy, but for
this RFC it is not needed. this RFC it is not needed.
`cplane_notified_generation` and `sk_set_notified_generation` fields are used to
track the last stage of the algorithm, when we need to notify safekeeper set and cplane
with the final configuration after it's already committed to DB.
The timeline is up-to-date (no migration in progress) if `new_sk_set` is null and
`*_notified_generation` fields are up to date with `generation`.
It's possible to replace `*_notified_generation` with one boolean field `migration_completed`,
but for better observability it's nice to have them separately.
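A sketch of that check, assuming a row struct mirroring the `timelines` table above (names illustrative):
```rust
/// Illustrative row struct mirroring the relevant `timelines` columns.
struct TimelineRow {
    generation: i32,
    new_sk_set: Option<Vec<i64>>,
    cplane_notified_generation: i32,
    sk_set_notified_generation: i32,
}

/// No migration in progress and everyone has been notified.
fn migration_completed(t: &TimelineRow) -> bool {
    t.new_sk_set.is_none()
        && t.cplane_notified_generation == t.generation
        && t.sk_set_notified_generation == t.generation
}
```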
#### API #### API
Node management is similar to pageserver: Node management is similar to pageserver:
1) POST `/control/v1/safekeepers` inserts safekeeper. 1) POST `/control/v1/safekeeper` inserts safekeeper.
2) GET `/control/v1/safekeepers` lists safekeepers. 2) GET `/control/v1/safekeeper` lists safekeepers.
3) GET `/control/v1/safekeepers/:node_id` gets safekeeper. 3) GET `/control/v1/safekeeper/:node_id` gets safekeeper.
4) PUT `/control/v1/safekeepers/:node_id/status` changes status to e.g. 4) PUT `/control/v1/safekeeper/:node_id/scheduling_policy` changes status to e.g.
`offline` or `decommissioned`. Initially it is simpler not to schedule any
migrations here.
@@ -368,8 +379,8 @@ Migration API: the first version is the simplest and the most imperative:
all timelines from one safekeeper to another. It accepts json all timelines from one safekeeper to another. It accepts json
``` ```
{ {
"src_sk": u32, "src_sk": NodeId,
"dst_sk": u32, "dst_sk": NodeId,
"limit": Optional<u32>, "limit": Optional<u32>,
} }
``` ```
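As a hedged illustration, the request body above maps to a serde struct along these lines (the struct name is an assumption and `NodeId` is simplified to `u64`):
```rust
type NodeId = u64; // simplified for the sketch

#[derive(serde::Deserialize)]
struct SafekeeperMigrateAllRequest {
    src_sk: NodeId,
    dst_sk: NodeId,
    limit: Option<u32>,
}
```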
@@ -379,12 +390,15 @@ Returns list of scheduled requests.
2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest` 2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest`
to move single timeline to given set of safekeepers: to move single timeline to given set of safekeepers:
``` ```
{ struct TimelineSafekeeperMigrateRequest {
"desired_set": Vec<u32>, "new_sk_set": Vec<NodeId>,
} }
``` ```
Returns scheduled request. In the first version the handler migrates the timeline to `new_sk_set` synchronously.
Should be retried until success.
In the future we might change it to asynchronous API and return scheduled request.
Similar call should be added for the tenant. Similar call should be added for the tenant.
@@ -434,6 +448,9 @@ table! {
} }
``` ```
We load all pending ops from the table on startup into the memory.
The table is needed only to preserve the state between restarts.
`op_type` can be `include` (seed from peers and ensure generation is up to `op_type` can be `include` (seed from peers and ensure generation is up to
date), `exclude` (remove locally) and `delete`. Field is actually not strictly date), `exclude` (remove locally) and `delete`. Field is actually not strictly
needed as it can be computed from current configuration, but gives more explicit needed as it can be computed from current configuration, but gives more explicit
@@ -474,7 +491,7 @@ actions must be idempotent. Now, a tricky point here is timeline start LSN. For
the initial (tenant creation) call cplane doesn't know it. However, setting the initial (tenant creation) call cplane doesn't know it. However, setting
start_lsn on safekeepers during creation is a good thing -- it provides a start_lsn on safekeepers during creation is a good thing -- it provides a
guarantee that walproposer can always find a common point in WAL histories of guarantee that walproposer can always find a common point in WAL histories of
safekeeper and its own, and so absense of it would be a clear sign of safekeeper and its own, and so absence of it would be a clear sign of
corruption. The following sequence works: corruption. The following sequence works:
1) Create timeline (or observe that it exists) on pageserver, 1) Create timeline (or observe that it exists) on pageserver,
figuring out last_record_lsn in response. figuring out last_record_lsn in response.
@@ -497,11 +514,9 @@ corruption. The following sequence works:
retries the call until 200 response. retries the call until 200 response.
There is a small question how request handler (timeline creation in this There is a small question how request handler (timeline creation in this
case) would interact with per sk reconciler. As always I prefer to do the case) would interact with per sk reconciler. In the current implementation
simplest possible thing and here it seems to be just waking it up so it we first persist the request in the DB, and then send an in-memory request
re-reads the db for work to do. Passing work in memory is faster, but to each safekeeper reconciler to process it.
that shouldn't matter, and path to scan db for work will exist anyway,
simpler to reuse it.
For pg version / wal segment size: while we may persist them in `timelines` For pg version / wal segment size: while we may persist them in `timelines`
table, it is not necessary as initial creation at step 3 can take them from table, it is not necessary as initial creation at step 3 can take them from
@@ -509,30 +524,40 @@ pageserver or cplane creation call and later pull_timeline will carry them
around. around.
Timeline migration. Timeline migration.
1) CAS to the db to create joint conf, and in the same transaction create 1) CAS to the db to create joint conf. Since this moment the migration is considered to be
`safekeeper_timeline_pending_ops` `include` entries to initialize new members "in progress". We can detect all "in-progress" migrations looking into the database.
as well as deliver this conf to current ones; poke per sk reconcilers to work 2) Do steps 4-6 from the algorithm, including `pull_timeline` onto `new_sk_set`, update membership
on it. Also any conf change should also poke cplane notifier task(s). configuration on all safekeepers, notify cplane, etc. All operations are idempotent,
2) Once it becomes possible per alg description above, get out of joint conf so we don't need to persist anything in the database at this stage. If any errors occur,
with another CAS. Task should get wakeups from per sk reconcilers because it's safe to retry or abort the migration.
conf switch is required for advancement; however retries should be sleep 3) Once it becomes possible per alg description above, get out of joint conf
based as well as LSN advancement might be needed, though in happy path with another CAS. Also should insert `exclude` entries into `safekeeper_timeline_pending_ops`
it isn't. To see whether further transition is possible on wakeup migration in the same DB transaction. Adding `exclude` entries atomically is necessary because after
executor polls safekeepers per the algorithm. CAS creating new conf with only CAS we don't have the list of excluded safekeepers in the `timelines` table anymore, but we
new members should again insert entries to `safekeeper_timeline_pending_ops` need to have them persisted somewhere in case the migration is interrupted right after the CAS.
to switch them there, as well as `exclude` rows to remove timeline from 4) Finish the migration. The final membership configuration is committed to the DB at this stage.
old members. So, the migration can not be aborted anymore. But it can still be retried if the migration fails
past stage 3. To finish the migration we need to send the new membership configuration to
a new quorum of safekeepers, notify cplane with the new safekeeper list and schedule the `exclude`
requests to the in-memory queue for the safekeeper reconciler. If the algorithm is retried, it's
possible that we have already committed `exclude` requests to DB, but didn't send them to
the in-memory queue. In this case we need to read them from `safekeeper_timeline_pending_ops`
because it's the only place where they are persistent. The fields `sk_set_notified_generation`
and `cplane_notified_generation` are updated after each step. The migration is considered
fully completed when they match the `generation` field.
In practice, we can report "success" after stage 3 and do the "finish" step in per-timeline
reconciler (if we implement it). But it's wise to at least try to finish them synchronously,
so the timeline is always in a "good state" and doesn't require an old quorum to commit
WAL after the migration reported "success".
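To recap the stages above in one place, a hedged sketch (variant names are illustrative, not storage_controller types):
```rust
/// The migration driver's stages, as described above.
enum MigrationStage {
    /// First CAS wrote the joint conf; the migration is "in progress" and can
    /// be discovered by scanning the database.
    JointConfCommitted,
    /// pull_timeline onto the new set, membership switches, cplane notify:
    /// all idempotent, safe to retry or to abort with a rollback CAS.
    SyncingNewSet,
    /// Second CAS wrote the final conf plus the `exclude` rows in one
    /// transaction; the migration can only be retried now, not aborted.
    FinalConfCommitted,
    /// New quorum and cplane notified (the `*_notified_generation` fields have
    /// caught up with `generation`) and `exclude` requests are queued.
    Finished,
}
```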
Timeline deletion: just set `deleted_at` on the timeline row and insert Timeline deletion: just set `deleted_at` on the timeline row and insert
`safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by `safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by
per sk reconcilers. per sk reconcilers.
When node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops` When node is removed (set to `decommissioned`), `safekeeper_timeline_pending_ops`
for it must be cleared in the same transaction. for it must be cleared in the same transaction.
One more task pool should infinitely retry notifying control plane about changed
safekeeper sets (trying making `cplane_notified_generation` equal `generation`).
#### Dealing with multiple instances of storage_controller #### Dealing with multiple instances of storage_controller
Operations described above executed concurrently might create some errors but do Operations described above executed concurrently might create some errors but do
@@ -541,7 +566,7 @@ of storage_controller it is fine to have it temporarily, e.g. during redeploy.
To harden against some controller instance creating work in
`safekeeper_timeline_pending_ops` and then disappearing without anyone picking up
the job, per-sk reconcilers should scan for work periodically, in addition to
reacting to explicit wakeups. It is possible to remove that, though, if all db
updates are protected with a leadership token/term -- then such scans are needed
only after leadership is acquired.
@@ -563,7 +588,7 @@ There should be following layers of tests:
safekeeper communication and pull_timeline need to be mocked and main switch safekeeper communication and pull_timeline need to be mocked and main switch
procedure wrapped as a node (thread) in simulation tests, using these
mocks. Test would inject migrations like it currently injects mocks. Test would inject migrations like it currently injects
safekeeper/walproposer restars. Main assert is the same -- committed WAL must safekeeper/walproposer restarts. Main assert is the same -- committed WAL must
not be lost. not be lost.
3) Since simulation testing injects at relatively high level points (not 3) Since simulation testing injects at relatively high level points (not
@@ -613,7 +638,7 @@ Let's have the following implementation bits for gradual rollout:
`notify-safekeepers`. `notify-safekeepers`.
Then the rollout for a region would be: Then the rollout for a region would be:
- Current situation: safekeepers are choosen by control_plane. - Current situation: safekeepers are chosen by control_plane.
- We manually migrate some timelines, test moving them around. - We manually migrate some timelines, test moving them around.
- Then we enable `--set-safekeepers` so that all new timelines - Then we enable `--set-safekeepers` so that all new timelines
are on storage controller. are on storage controller.


@@ -13,6 +13,8 @@ use utils::backoff::retry;
pub fn app(state: Arc<Storage>) -> Router<()> { pub fn app(state: Arc<Storage>) -> Router<()> {
use axum::routing::{delete as _delete, get as _get}; use axum::routing::{delete as _delete, get as _get};
let delete_prefix = _delete(delete_prefix); let delete_prefix = _delete(delete_prefix);
// NB: On any changes do not forget to update the OpenAPI spec
// in /endpoint_storage/src/openapi_spec.yml.
Router::new() Router::new()
.route( .route(
"/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}", "/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}",


@@ -0,0 +1,146 @@
openapi: "3.0.2"
info:
title: Endpoint Storage API
description: Endpoint Storage API
version: "1.0"
license:
name: "Apache"
url: https://github.com/neondatabase/neon/blob/main/LICENSE
servers:
- url: ""
paths:
/status:
description: Healthcheck endpoint
get:
description: Healthcheck
security: []
responses:
"200":
description: OK
/{tenant_id}/{timeline_id}/{endpoint_id}/{key}:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
- name: endpoint_id
in: path
required: true
schema:
type: string
- name: key
in: path
required: true
schema:
type: string
get:
description: Get file from blob storage
responses:
"200":
description: "File stream from blob storage"
content:
application/octet-stream:
schema:
type: string
format: binary
"400":
description: File was not found
"403":
description: JWT does not authorize request to this route
put:
description: Insert file into blob storage. If file exists, override it
requestBody:
content:
application/octet-stream:
schema:
type: string
format: binary
responses:
"200":
description: File was inserted successfully
"403":
description: JWT does not authorize request to this route
delete:
description: Delete file from blob storage
responses:
"200":
description: File was successfully deleted or not found
"403":
description: JWT does not authorize request to this route
/{tenant_id}/{timeline_id}/{endpoint_id}:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
- name: endpoint_id
in: path
required: true
schema:
type: string
delete:
description: Delete endpoint data from blob storage
responses:
"200":
description: Endpoint data was deleted
"403":
description: JWT does not authorize request to this route
/{tenant_id}/{timeline_id}:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
delete:
description: Delete timeline data from blob storage
responses:
"200":
description: Timeline data was deleted
"403":
description: JWT does not authorize request to this route
/{tenant_id}:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
delete:
description: Delete tenant data from blob storage
responses:
"200":
description: Tenant data was deleted
"403":
description: JWT does not authorize request to this route
components:
securitySchemes:
JWT:
type: http
scheme: bearer
bearerFormat: JWT
security:
- JWT: []
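A hedged usage sketch of this API with `reqwest`; the base URL, ids, key name and token below are placeholders, not values from this repository:
```rust
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let base = "http://127.0.0.1:9993"; // placeholder endpoint_storage address
    let url = format!(
        "{base}/{tenant}/{timeline}/{endpoint}/some_key",
        tenant = "TENANT_ID",
        timeline = "TIMELINE_ID",
        endpoint = "ENDPOINT_ID",
    );
    let resp = reqwest::Client::new()
        .get(url)
        .bearer_auth("JWT_TOKEN")
        .send()
        .await?;
    // Per the spec: 200 streams the file, 400 means not found, 403 means the
    // JWT does not authorize this route.
    println!("GET status: {}", resp.status());
    Ok(())
}
```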


@@ -46,16 +46,45 @@ pub struct ExtensionInstallResponse {
pub version: ExtVersion, pub version: ExtVersion,
} }
#[derive(Serialize, Default, Debug, Clone)] /// Status of the LFC prewarm process. The same state machine is reused for
/// both autoprewarm (prewarm after compute/Postgres start using the previously
/// stored LFC state) and explicit prewarming via API.
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
#[serde(tag = "status", rename_all = "snake_case")] #[serde(tag = "status", rename_all = "snake_case")]
pub enum LfcPrewarmState { pub enum LfcPrewarmState {
/// Default value when compute boots up.
#[default] #[default]
NotPrewarmed, NotPrewarmed,
/// Prewarming thread is active and loading pages into LFC.
Prewarming, Prewarming,
/// We found requested LFC state in the endpoint storage and
/// completed prewarming successfully.
Completed, Completed,
Failed { /// Unexpected error happened during prewarming. Note, `Not Found 404`
error: String, /// response from the endpoint storage is explicitly excluded here
}, /// because it can normally happen on the first compute start,
/// since LFC state is not available yet.
Failed { error: String },
/// We tried to fetch the corresponding LFC state from the endpoint storage,
/// but received `Not Found 404`. This should normally happen only during the
/// first endpoint start after creation with `autoprewarm: true`.
///
/// During the orchestrated prewarm via API, when a caller explicitly
/// provides the LFC state key to prewarm from, it's the caller responsibility
/// to handle this status as an error state in this case.
Skipped,
}
impl Display for LfcPrewarmState {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
LfcPrewarmState::Completed => f.write_str("Completed"),
LfcPrewarmState::Skipped => f.write_str("Skipped"),
LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
}
}
} }
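// Hedged sketch (not part of compute_ctl): how an API caller might interpret
// the state above when it explicitly requested a prewarm from a specific LFC
// state key, treating `Skipped` as an error only in that case.
fn prewarm_outcome(state: &LfcPrewarmState, explicit_key: bool) -> Result<(), String> {
    match state {
        LfcPrewarmState::Completed => Ok(()),
        // Normal on the very first start with `autoprewarm: true`...
        LfcPrewarmState::Skipped if !explicit_key => Ok(()),
        // ...but an error when the caller named a state key that should exist.
        LfcPrewarmState::Skipped => Err("requested LFC state not found".to_string()),
        LfcPrewarmState::Failed { error } => Err(error.clone()),
        other => Err(format!("prewarm not finished: {other}")),
    }
}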
#[derive(Serialize, Default, Debug, Clone, PartialEq)] #[derive(Serialize, Default, Debug, Clone, PartialEq)]
@@ -70,6 +99,23 @@ pub enum LfcOffloadState {
}, },
} }
#[derive(Serialize, Debug, Clone, PartialEq)]
#[serde(tag = "status", rename_all = "snake_case")]
/// Response of /promote
pub enum PromoteState {
NotPromoted,
Completed,
Failed { error: String },
}
#[derive(Deserialize, Serialize, Default, Debug, Clone)]
#[serde(rename_all = "snake_case")]
/// Result of /safekeepers_lsn
pub struct SafekeepersLsn {
pub safekeepers: String,
pub wal_flush_lsn: utils::lsn::Lsn,
}
/// Response of the /status API /// Response of the /status API
#[derive(Serialize, Debug, Deserialize)] #[derive(Serialize, Debug, Deserialize)]
#[serde(rename_all = "snake_case")] #[serde(rename_all = "snake_case")]
@@ -93,6 +139,15 @@ pub enum TerminateMode {
Immediate, Immediate,
} }
impl From<TerminateMode> for ComputeStatus {
fn from(mode: TerminateMode) -> Self {
match mode {
TerminateMode::Fast => ComputeStatus::TerminationPendingFast,
TerminateMode::Immediate => ComputeStatus::TerminationPendingImmediate,
}
}
}
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)] #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")] #[serde(rename_all = "snake_case")]
pub enum ComputeStatus { pub enum ComputeStatus {
@@ -113,7 +168,9 @@ pub enum ComputeStatus {
// control-plane to terminate it. // control-plane to terminate it.
Failed, Failed,
// Termination requested // Termination requested
TerminationPending { mode: TerminateMode }, TerminationPendingFast,
// Termination requested, without waiting 30s before returning from /terminate
TerminationPendingImmediate,
// Terminated Postgres // Terminated Postgres
Terminated, Terminated,
} }
@@ -132,7 +189,10 @@ impl Display for ComputeStatus {
ComputeStatus::Running => f.write_str("running"), ComputeStatus::Running => f.write_str("running"),
ComputeStatus::Configuration => f.write_str("configuration"), ComputeStatus::Configuration => f.write_str("configuration"),
ComputeStatus::Failed => f.write_str("failed"), ComputeStatus::Failed => f.write_str("failed"),
ComputeStatus::TerminationPending { .. } => f.write_str("termination-pending"), ComputeStatus::TerminationPendingFast => f.write_str("termination-pending-fast"),
ComputeStatus::TerminationPendingImmediate => {
f.write_str("termination-pending-immediate")
}
ComputeStatus::Terminated => f.write_str("terminated"), ComputeStatus::Terminated => f.write_str("terminated"),
} }
} }


@@ -14,6 +14,7 @@ use serde::{Deserialize, Serialize};
use url::Url; use url::Url;
use utils::id::{TenantId, TimelineId}; use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn; use utils::lsn::Lsn;
use utils::shard::{ShardCount, ShardIndex};
use crate::responses::TlsConfig; use crate::responses::TlsConfig;
@@ -106,11 +107,18 @@ pub struct ComputeSpec {
pub tenant_id: Option<TenantId>, pub tenant_id: Option<TenantId>,
pub timeline_id: Option<TimelineId>, pub timeline_id: Option<TimelineId>,
// Pageserver information can be passed in two different ways: /// Pageserver information can be passed in three different ways:
// 1. Here /// 1. Here in `pageserver_connection_info`
// 2. in cluster.settings. This is legacy, we are switching to method 1. /// 2. In the `pageserver_connstring` field.
/// 3. in `cluster.settings`.
///
/// The goal is to use method 1. everywhere. But for backwards-compatibility with old
/// versions of the control plane, `compute_ctl` will check 2. and 3. if the
/// `pageserver_connection_info` field is missing.
pub pageserver_connection_info: Option<PageserverConnectionInfo>, pub pageserver_connection_info: Option<PageserverConnectionInfo>,
pub pageserver_connstring: Option<String>,
// More neon ids that we expose to the compute_ctl // More neon ids that we expose to the compute_ctl
// and to postgres as neon extension GUCs. // and to postgres as neon extension GUCs.
pub project_id: Option<String>, pub project_id: Option<String>,
@@ -145,7 +153,7 @@ pub struct ComputeSpec {
// Stripe size for pageserver sharding, in pages // Stripe size for pageserver sharding, in pages
#[serde(default)] #[serde(default)]
pub shard_stripe_size: Option<usize>, pub shard_stripe_size: Option<u32>,
/// Local Proxy configuration used for JWT authentication /// Local Proxy configuration used for JWT authentication
#[serde(default)] #[serde(default)]
@@ -218,16 +226,28 @@ pub enum ComputeFeature {
UnknownFeature, UnknownFeature,
} }
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality. #[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
pub struct PageserverConnectionInfo { pub struct PageserverConnectionInfo {
pub shards: HashMap<u32, PageserverShardConnectionInfo>, /// NB: 0 for unsharded tenants, 1 for sharded tenants with 1 shard, following storage
pub shard_count: ShardCount,
pub prefer_grpc: bool, /// INVARIANT: null if shard_count is 0, otherwise non-null and immutable
pub stripe_size: Option<u32>,
pub shards: HashMap<ShardIndex, PageserverShardInfo>,
#[serde(default)]
pub prefer_protocol: PageserverProtocol,
} }
#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)] #[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub struct PageserverShardInfo {
pub pageservers: Vec<PageserverShardConnectionInfo>,
}
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub struct PageserverShardConnectionInfo { pub struct PageserverShardConnectionInfo {
pub id: Option<String>,
pub libpq_url: Option<String>, pub libpq_url: Option<String>,
pub grpc_url: Option<String>, pub grpc_url: Option<String>,
} }
@@ -465,13 +485,15 @@ pub struct JwksSettings {
pub jwt_audience: Option<String>, pub jwt_audience: Option<String>,
} }
/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme. /// Protocol used to connect to a Pageserver.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] #[derive(Clone, Copy, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
pub enum PageserverProtocol { pub enum PageserverProtocol {
/// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme. /// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
#[default] #[default]
#[serde(rename = "libpq")]
Libpq, Libpq,
/// A newer, gRPC-based protocol. Uses grpc:// scheme. /// A newer, gRPC-based protocol. Uses grpc:// scheme.
#[serde(rename = "grpc")]
Grpc, Grpc,
} }


@@ -20,6 +20,7 @@ use tokio_stream::wrappers::ReceiverStream;
use tokio_util::io::ReaderStream; use tokio_util::io::ReaderStream;
use tracing::{Instrument, debug, info, info_span, warn}; use tracing::{Instrument, debug, info, info_span, warn};
use utils::auth::{AuthError, Claims, SwappableJwtAuth}; use utils::auth::{AuthError, Claims, SwappableJwtAuth};
use utils::metrics_collector::{METRICS_COLLECTOR, METRICS_STALE_MILLIS};
use crate::error::{ApiError, api_error_handler, route_error_handler}; use crate::error::{ApiError, api_error_handler, route_error_handler};
use crate::request::{get_query_param, parse_query_param}; use crate::request::{get_query_param, parse_query_param};
@@ -250,9 +251,28 @@ impl std::io::Write for ChannelWriter {
} }
} }
pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> { pub async fn prometheus_metrics_handler(
req: Request<Body>,
force_metric_collection_on_scrape: bool,
) -> Result<Response<Body>, ApiError> {
SERVE_METRICS_COUNT.inc(); SERVE_METRICS_COUNT.inc();
// HADRON
let requested_use_latest = parse_query_param(&req, "use_latest")?;
let use_latest = match requested_use_latest {
None => force_metric_collection_on_scrape,
Some(true) => true,
Some(false) => {
if force_metric_collection_on_scrape {
// We don't cache in this case
true
} else {
false
}
}
};
let started_at = std::time::Instant::now(); let started_at = std::time::Instant::now();
let (tx, rx) = mpsc::channel(1); let (tx, rx) = mpsc::channel(1);
@@ -277,12 +297,18 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
let _span = span.entered(); let _span = span.entered();
let metrics = metrics::gather(); // HADRON
let collected = if use_latest {
// Skip caching the results if we always force metric collection on scrape.
METRICS_COLLECTOR.run_once(!force_metric_collection_on_scrape)
} else {
METRICS_COLLECTOR.last_collected()
};
let gathered_at = std::time::Instant::now(); let gathered_at = std::time::Instant::now();
let res = encoder let res = encoder
.encode(&metrics, &mut writer) .encode(&collected.metrics, &mut writer)
.and_then(|_| writer.flush().map_err(|e| e.into())); .and_then(|_| writer.flush().map_err(|e| e.into()));
// this instant is not when we finally got the full response sent, sending is done by hyper // this instant is not when we finally got the full response sent, sending is done by hyper
@@ -295,6 +321,10 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
let encoded_in = encoded_at - gathered_at - writer.wait_time(); let encoded_in = encoded_at - gathered_at - writer.wait_time();
let total = encoded_at - started_at; let total = encoded_at - started_at;
// HADRON
let staleness_ms = (encoded_at - collected.collected_at).as_millis();
METRICS_STALE_MILLIS.set(staleness_ms as i64);
match res { match res {
Ok(()) => { Ok(()) => {
tracing::info!( tracing::info!(
@@ -303,6 +333,7 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
spawning_ms = spawned_in.as_millis(), spawning_ms = spawned_in.as_millis(),
collection_ms = collected_in.as_millis(), collection_ms = collected_in.as_millis(),
encoding_ms = encoded_in.as_millis(), encoding_ms = encoded_in.as_millis(),
staleness_ms = staleness_ms,
"responded /metrics" "responded /metrics"
); );
} }
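// Hedged note (not the committed code): the `use_latest` selection above reduces
// to "a scrape may opt in to a fresh collection, but cannot opt out when the
// server forces collection on every scrape":
fn use_latest(force_metric_collection_on_scrape: bool, requested: Option<bool>) -> bool {
    force_metric_collection_on_scrape || requested.unwrap_or(false)
}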


@@ -41,17 +41,35 @@ pub fn get_query_param<'a>(
Some(q) => q, Some(q) => q,
None => return Ok(None), None => return Ok(None),
}; };
let mut values = url::form_urlencoded::parse(query.as_bytes()) let values = url::form_urlencoded::parse(query.as_bytes())
.filter_map(|(k, v)| if k == param_name { Some(v) } else { None }) .filter_map(|(k, v)| if k == param_name { Some(v) } else { None })
// we call .next() twice below. If it's None the first time, .fuse() ensures it's None afterwards // we call .next() twice below. If it's None the first time, .fuse() ensures it's None afterwards
.fuse(); .fuse();
let value1 = values.next(); // Work around an issue with Alloy's pyroscope scrape where the "seconds"
if values.next().is_some() { // parameter is added several times. https://github.com/grafana/alloy/issues/3026
return Err(ApiError::BadRequest(anyhow!( // TODO: revert after Alloy is fixed.
"param {param_name} specified more than once" let value1 = values
))); .map(Ok)
} .reduce(|acc, i| {
match acc {
Err(_) => acc,
// It's okay to have duplicates as long as they have the same value.
Ok(ref a) if a == &i.unwrap() => acc,
_ => Err(ApiError::BadRequest(anyhow!(
"param {param_name} specified more than once"
))),
}
})
.transpose()?;
// if values.next().is_some() {
// return Err(ApiError::BadRequest(anyhow!(
// "param {param_name} specified more than once"
// )));
// }
Ok(value1) Ok(value1)
} }
@@ -92,3 +110,39 @@ pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError>
None => Ok(()), None => Ok(()),
} }
} }
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_get_query_param_duplicate() {
let req = Request::builder()
.uri("http://localhost:12345/testuri?testparam=1")
.body(hyper::Body::empty())
.unwrap();
let value = get_query_param(&req, "testparam").unwrap();
assert_eq!(value.unwrap(), "1");
let req = Request::builder()
.uri("http://localhost:12345/testuri?testparam=1&testparam=1")
.body(hyper::Body::empty())
.unwrap();
let value = get_query_param(&req, "testparam").unwrap();
assert_eq!(value.unwrap(), "1");
let req = Request::builder()
.uri("http://localhost:12345/testuri")
.body(hyper::Body::empty())
.unwrap();
let value = get_query_param(&req, "testparam").unwrap();
assert!(value.is_none());
let req = Request::builder()
.uri("http://localhost:12345/testuri?testparam=1&testparam=2&testparam=3")
.body(hyper::Body::empty())
.unwrap();
let value = get_query_param(&req, "testparam");
assert!(value.is_err());
}
}


@@ -4,12 +4,14 @@
//! a default registry. //! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)] #![deny(clippy::undocumented_unsafe_blocks)]
use std::sync::RwLock;
use measured::label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}; use measured::label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels};
use measured::metric::counter::CounterState; use measured::metric::counter::CounterState;
use measured::metric::gauge::GaugeState; use measured::metric::gauge::GaugeState;
use measured::metric::group::Encoding; use measured::metric::group::Encoding;
use measured::metric::name::{MetricName, MetricNameEncoder}; use measured::metric::name::{MetricName, MetricNameEncoder};
use measured::metric::{MetricEncoding, MetricFamilyEncoding}; use measured::metric::{MetricEncoding, MetricFamilyEncoding, MetricType};
use measured::{FixedCardinalityLabel, LabelGroup, MetricGroup}; use measured::{FixedCardinalityLabel, LabelGroup, MetricGroup};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use prometheus::Registry; use prometheus::Registry;
@@ -116,12 +118,52 @@ pub fn pow2_buckets(start: usize, end: usize) -> Vec<f64> {
.collect() .collect()
} }
pub struct InfoMetric<L: LabelGroup, M: MetricType = GaugeState> {
label: RwLock<L>,
metric: M,
}
impl<L: LabelGroup> InfoMetric<L> {
pub fn new(label: L) -> Self {
Self::with_metric(label, GaugeState::new(1))
}
}
impl<L: LabelGroup, M: MetricType<Metadata = ()>> InfoMetric<L, M> {
pub fn with_metric(label: L, metric: M) -> Self {
Self {
label: RwLock::new(label),
metric,
}
}
pub fn set_label(&self, label: L) {
*self.label.write().unwrap() = label;
}
}
impl<L, M, E> MetricFamilyEncoding<E> for InfoMetric<L, M>
where
L: LabelGroup,
M: MetricEncoding<E, Metadata = ()>,
E: Encoding,
{
fn collect_family_into(
&self,
name: impl measured::metric::name::MetricNameEncoder,
enc: &mut E,
) -> Result<(), E::Err> {
M::write_type(&name, enc)?;
self.metric
.collect_into(&(), &*self.label.read().unwrap(), name, enc)
}
}
pub struct BuildInfo { pub struct BuildInfo {
pub revision: &'static str, pub revision: &'static str,
pub build_tag: &'static str, pub build_tag: &'static str,
} }
// todo: allow label group without the set
impl LabelGroup for BuildInfo { impl LabelGroup for BuildInfo {
fn visit_values(&self, v: &mut impl LabelGroupVisitor) { fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
const REVISION: &LabelName = LabelName::from_str("revision"); const REVISION: &LabelName = LabelName::from_str("revision");
@@ -131,24 +173,6 @@ impl LabelGroup for BuildInfo {
} }
} }
impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
where
GaugeState: MetricEncoding<T>,
{
fn collect_family_into(
&self,
name: impl measured::metric::name::MetricNameEncoder,
enc: &mut T,
) -> Result<(), T::Err> {
enc.write_help(&name, "Build/version information")?;
GaugeState::write_type(&name, enc)?;
GaugeState {
count: std::sync::atomic::AtomicI64::new(1),
}
.collect_into(&(), self, name, enc)
}
}
#[derive(MetricGroup)] #[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))] #[metric(new(build_info: BuildInfo))]
pub struct NeonMetrics { pub struct NeonMetrics {
@@ -165,8 +189,8 @@ pub struct NeonMetrics {
#[derive(MetricGroup)] #[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))] #[metric(new(build_info: BuildInfo))]
pub struct LibMetrics { pub struct LibMetrics {
#[metric(init = build_info)] #[metric(init = InfoMetric::new(build_info))]
build_info: BuildInfo, build_info: InfoMetric<BuildInfo>,
#[metric(flatten)] #[metric(flatten)]
rusage: Rusage, rusage: Rusage,
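// Hedged usage sketch of the new wrapper: an info-style metric whose label
// values can be swapped at runtime while the underlying gauge stays at 1.
fn info_metric_example() {
    let build_info = InfoMetric::new(BuildInfo { revision: "abc123", build_tag: "2025-07-21" });
    // Report a different revision later, e.g. after a binary upgrade:
    build_info.set_label(BuildInfo { revision: "def456", build_tag: "2025-07-22" });
}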


@@ -16,6 +16,7 @@
//! //!
//! Concurrency is managed very simply: the entire map is guarded by one shared-memory RwLock. //! Concurrency is managed very simply: the entire map is guarded by one shared-memory RwLock.
use std::fmt::Debug;
use std::hash::{BuildHasher, Hash}; use std::hash::{BuildHasher, Hash};
use std::mem::MaybeUninit; use std::mem::MaybeUninit;
@@ -56,6 +57,22 @@ pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
num_buckets: u32, num_buckets: u32,
} }
impl<'a, K, V, S> Debug for HashMapInit<'a, K, V, S>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("HashMapInit")
.field("shmem_handle", &self.shmem_handle)
.field("shared_ptr", &self.shared_ptr)
.field("shared_size", &self.shared_size)
// .field("hasher", &self.hasher)
.field("num_buckets", &self.num_buckets)
.finish()
}
}
/// This is a per-process handle to a hash table that (possibly) lives in shared memory. /// This is a per-process handle to a hash table that (possibly) lives in shared memory.
/// If a child process is launched with fork(), the child process should /// If a child process is launched with fork(), the child process should
/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader(). /// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
@@ -71,6 +88,20 @@ pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {} unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {} unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
impl<'a, K, V, S> Debug for HashMapAccess<'a, K, V, S>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("HashMapAccess")
.field("shmem_handle", &self.shmem_handle)
.field("shared_ptr", &self.shared_ptr)
// .field("hasher", &self.hasher)
.finish()
}
}
impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> { impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
/// Change the 'hasher' used by the hash table. /// Change the 'hasher' used by the hash table.
/// ///
@@ -298,7 +329,7 @@ where
/// Get a reference to the entry containing a key. /// Get a reference to the entry containing a key.
/// ///
/// NB: THis takes a write lock as there's no way to distinguish whether the intention /// NB: This takes a write lock as there's no way to distinguish whether the intention
/// is to use the entry for reading or for writing in advance. /// is to use the entry for reading or for writing in advance.
pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> { pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> {
let hash = self.get_hash_value(&key); let hash = self.get_hash_value(&key);


@@ -1,5 +1,6 @@
//! Simple hash table with chaining. //! Simple hash table with chaining.
use std::fmt::Debug;
use std::hash::Hash; use std::hash::Hash;
use std::mem::MaybeUninit; use std::mem::MaybeUninit;
@@ -17,6 +18,19 @@ pub(crate) struct Bucket<K, V> {
pub(crate) inner: Option<(K, V)>, pub(crate) inner: Option<(K, V)>,
} }
impl<K, V> Debug for Bucket<K, V>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Bucket")
.field("next", &self.next)
.field("inner", &self.inner)
.finish()
}
}
/// Core hash table implementation. /// Core hash table implementation.
pub(crate) struct CoreHashMap<'a, K, V> { pub(crate) struct CoreHashMap<'a, K, V> {
/// Dictionary used to map hashes to bucket indices. /// Dictionary used to map hashes to bucket indices.
@@ -31,6 +45,22 @@ pub(crate) struct CoreHashMap<'a, K, V> {
pub(crate) buckets_in_use: u32, pub(crate) buckets_in_use: u32,
} }
impl<'a, K, V> Debug for CoreHashMap<'a, K, V>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("CoreHashMap")
.field("dictionary", &self.dictionary)
.field("buckets", &self.buckets)
.field("free_head", &self.free_head)
.field("alloc_limit", &self.alloc_limit)
.field("buckets_in_use", &self.buckets_in_use)
.finish()
}
}
/// Error for when there are no empty buckets left but one is needed. /// Error for when there are no empty buckets left but one is needed.
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub struct FullError; pub struct FullError;
