Compare commits

..

9 Commits

Author SHA1 Message Date
Devin AI
7cdb292b37 Add pg_tenant_only_auth_public_key_path check to safekeeper authentication
Co-Authored-By: John Spray <john@neon.tech>
2025-04-10 13:49:18 +00:00
Devin AI
a55e0192dc Add missing bail import to pageserver.rs
Co-Authored-By: John Spray <john@neon.tech>
2025-04-10 10:34:31 +00:00
Devin AI
fa3ceab30e Fix formatting issues
Co-Authored-By: John Spray <john@neon.tech>
2025-04-10 10:15:37 +00:00
Devin AI
7710c18761 Check both HTTP and PostgreSQL authentication in safekeeper
Co-Authored-By: John Spray <john@neon.tech>
2025-04-10 08:52:25 +00:00
Devin AI
1e14d784f4 Update allowed_errors.py to include new dev mode warning message
Co-Authored-By: John Spray <john@neon.tech>
2025-04-10 08:51:39 +00:00
Devin AI
2574bbe072 Update dev mode warning message to match storage controller
Co-Authored-By: John Spray <john@neon.tech>
2025-04-10 08:51:15 +00:00
Devin AI
d74c715602 Check pg_auth_type in pageserver authentication check
Co-Authored-By: John Spray <john@neon.tech>
2025-04-10 08:50:04 +00:00
Devin AI
69cfd1f7e0 Remove redundant --dev flag in neon_local.rs
Co-Authored-By: John Spray <john@neon.tech>
2025-04-10 08:47:21 +00:00
Devin AI
5dea3e2195 storage: require authentication by default, unless --dev is specified (#11237)
Co-Authored-By: John Spray <john@neon.tech>
2025-04-10 08:43:15 +00:00
263 changed files with 4091 additions and 9706 deletions

View File

@@ -19,7 +19,7 @@
!pageserver/
!pgxn/
!proxy/
!endpoint_storage/
!object_storage/
!storage_scrubber/
!safekeeper/
!storage_broker/

View File

@@ -6,7 +6,6 @@ self-hosted-runner:
- small
- small-metal
- small-arm64
- unit-perf
- us-east-2
config-variables:
- AWS_ECR_REGION

View File

@@ -70,7 +70,6 @@ runs:
- name: Install Allure
shell: bash -euxo pipefail {0}
working-directory: /tmp
run: |
if ! which allure; then
ALLURE_ZIP=allure-${ALLURE_VERSION}.zip

View File

@@ -113,6 +113,8 @@ runs:
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: ${{ inputs.build_type }}
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
RERUN_FAILED: ${{ inputs.rerun_failed }}
PG_VERSION: ${{ inputs.pg_version }}
SANITIZERS: ${{ inputs.sanitizers }}
@@ -133,7 +135,6 @@ runs:
fi
PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
echo "PERF_REPORT_DIR=${PERF_REPORT_DIR}" >> ${GITHUB_ENV}
rm -rf $PERF_REPORT_DIR
TEST_SELECTION="test_runner/${{ inputs.test_selection }}"
@@ -210,12 +211,11 @@ runs:
--verbose \
-rA $TEST_SELECTION $EXTRA_PARAMS
- name: Upload performance report
if: ${{ !cancelled() && inputs.save_perf_report == 'true' }}
shell: bash -euxo pipefail {0}
run: |
export REPORT_FROM="${PERF_REPORT_DIR}"
scripts/generate_and_push_perf_report.sh
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
export REPORT_FROM="$PERF_REPORT_DIR"
export REPORT_TO="$PLATFORM"
scripts/generate_and_push_perf_report.sh
fi
- name: Upload compatibility snapshot
# Note, that we use `github.base_ref` which is a target branch for a PR

View File

@@ -272,13 +272,10 @@ jobs:
# run pageserver tests with different settings
for get_vectored_concurrent_io in sequential sidecar-task; do
for io_engine in std-fs tokio-epoll-uring ; do
for io_mode in buffered direct direct-rw ; do
NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOMODE=$io_mode \
${cov_prefix} \
cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
done
NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
${cov_prefix} \
cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
done
done
@@ -349,7 +346,7 @@ jobs:
contents: read
statuses: write
needs: [ build-neon ]
runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large-metal')) }}
runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
container:
image: ${{ inputs.build-tools-image }}
credentials:
@@ -395,7 +392,6 @@ jobs:
BUILD_TAG: ${{ inputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
PAGESERVER_VIRTUAL_FILE_IO_MODE: direct
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
# Temporary disable this step until we figure out why it's so flaky

View File

@@ -53,13 +53,10 @@ jobs:
|| inputs.component-name == 'Compute' && 'release-compute'
}}
run: |
now_date=$(date -u +'%Y-%m-%d')
now_time=$(date -u +'%H-%M-%Z')
{
echo "title=${COMPONENT_NAME} release ${now_date}"
echo "rc-branch=rc/${RELEASE_BRANCH}/${now_date}_${now_time}"
echo "release-branch=${RELEASE_BRANCH}"
} | tee -a ${GITHUB_OUTPUT}
today=$(date +'%Y-%m-%d')
echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT}
echo "rc-branch=rc/${RELEASE_BRANCH}/${today}" | tee -a ${GITHUB_OUTPUT}
echo "release-branch=${RELEASE_BRANCH}" | tee -a ${GITHUB_OUTPUT}
- name: Configure git
run: |

View File

@@ -165,5 +165,5 @@ jobs:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
CURRENT_SHA: ${{ github.sha }}
run: |
RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release.*$"; "s"))] | first | .id // ("Failed to find Build and Test run from RC PR!" | halt_error(1))')
RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy|compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Failed to find Build and Test run from RC PR!" | halt_error(1))')
echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT

View File

@@ -284,7 +284,7 @@ jobs:
statuses: write
contents: write
pull-requests: write
runs-on: [ self-hosted, unit-perf ]
runs-on: [ self-hosted, small-metal ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
credentials:
@@ -323,8 +323,6 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
PAGESERVER_VIRTUAL_FILE_IO_MODE: direct
SYNC_BETWEEN_TESTS: true
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
@@ -1238,7 +1236,7 @@ jobs:
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
TIMEOUT=5400 # 90 minutes, usually it takes ~2-3 minutes, but if runners are busy, it might take longer
TIMEOUT=1800 # 30 minutes, usually it takes ~2-3 minutes, but if runners are busy, it might take longer
INTERVAL=15 # try each N seconds
last_status="" # a variable to carry the last status of the "build-and-upload-extensions" context
@@ -1273,7 +1271,7 @@ jobs:
exit 1
deploy:
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, trigger-custom-extensions-build-and-wait ]
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }}
permissions:

View File

@@ -27,17 +27,15 @@ jobs:
- name: Fast forwarding
uses: sequoia-pgp/fast-forward@ea7628bedcb0b0b96e94383ada458d812fca4979
# See https://docs.github.com/en/graphql/reference/enums#mergestatestatus
if: ${{ contains(fromJSON('["clean", "unstable"]'), github.event.pull_request.mergeable_state) }}
if: ${{ github.event.pull_request.mergeable_state == 'clean' }}
with:
merge: true
comment: on-error
github_token: ${{ secrets.CI_ACCESS_TOKEN }}
- name: Comment if mergeable_state is not clean
if: ${{ !contains(fromJSON('["clean", "unstable"]'), github.event.pull_request.mergeable_state) }}
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
if: ${{ github.event.pull_request.mergeable_state != 'clean' }}
run: |
gh pr comment ${{ github.event.pull_request.number }} \
--repo "${GITHUB_REPOSITORY}" \
--body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\` or \`unstable\`."
--body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\`."

View File

@@ -30,7 +30,7 @@ permissions:
statuses: write # require for posting a status update
env:
DEFAULT_PG_VERSION: 17
DEFAULT_PG_VERSION: 16
PLATFORM: neon-captest-new
AWS_DEFAULT_REGION: eu-central-1
@@ -42,8 +42,6 @@ jobs:
github-event-name: ${{ github.event_name }}
build-build-tools-image:
permissions:
packages: write
needs: [ check-permissions ]
uses: ./.github/workflows/build-build-tools-image.yml
secrets: inherit

View File

@@ -1,93 +0,0 @@
name: Random Operations Test
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '23 */2 * * *' # runs every 2 hours
workflow_dispatch:
inputs:
random_seed:
type: number
description: 'The random seed'
required: false
default: 0
num_operations:
type: number
description: "The number of operations to test"
default: 250
defaults:
run:
shell: bash -euxo pipefail {0}
permissions: {}
env:
DEFAULT_PG_VERSION: 16
PLATFORM: neon-captest-new
AWS_DEFAULT_REGION: eu-central-1
jobs:
run-random-rests:
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
runs-on: small
permissions:
id-token: write
statuses: write
strategy:
fail-fast: false
matrix:
pg-version: [16, 17]
container:
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
options: --init
steps:
- name: Harden the runner (Audit all outbound calls)
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
with:
egress-policy: audit
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Run tests
uses: ./.github/actions/run-python-test-set
with:
build_type: remote
test_selection: random_ops
run_in_parallel: false
extra_params: -m remote_cluster
pg_version: ${{ matrix.pg-version }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
RANDOM_SEED: ${{ inputs.random_seed }}
NUM_OPERATIONS: ${{ inputs.num_operations }}
- name: Create Allure report
if: ${{ !cancelled() }}
id: create-allure-report
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

149
Cargo.lock generated
View File

@@ -40,7 +40,7 @@ dependencies = [
"getrandom 0.2.11",
"once_cell",
"version_check",
"zerocopy 0.7.31",
"zerocopy",
]
[[package]]
@@ -1416,7 +1416,6 @@ name = "control_plane"
version = "0.1.0"
dependencies = [
"anyhow",
"base64 0.13.1",
"camino",
"clap",
"comfy-table",
@@ -1426,12 +1425,10 @@ dependencies = [
"humantime",
"humantime-serde",
"hyper 0.14.30",
"jsonwebtoken",
"nix 0.27.1",
"once_cell",
"pageserver_api",
"pageserver_client",
"pem",
"postgres_backend",
"postgres_connection",
"regex",
@@ -1440,8 +1437,6 @@ dependencies = [
"scopeguard",
"serde",
"serde_json",
"sha2",
"spki 0.7.3",
"storage_broker",
"thiserror 1.0.69",
"tokio",
@@ -2037,33 +2032,6 @@ dependencies = [
"zeroize",
]
[[package]]
name = "endpoint_storage"
version = "0.0.1"
dependencies = [
"anyhow",
"axum",
"axum-extra",
"camino",
"camino-tempfile",
"futures",
"http-body-util",
"itertools 0.10.5",
"jsonwebtoken",
"prometheus",
"rand 0.8.5",
"remote_storage",
"serde",
"serde_json",
"test-log",
"tokio",
"tokio-util",
"tower 0.5.2",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "enum-map"
version = "2.5.0"
@@ -2849,7 +2817,6 @@ dependencies = [
"hyper 0.14.30",
"itertools 0.10.5",
"jemalloc_pprof",
"jsonwebtoken",
"metrics",
"once_cell",
"pprof",
@@ -2870,7 +2837,6 @@ dependencies = [
"utils",
"uuid",
"workspace_hack",
"x509-cert",
]
[[package]]
@@ -4025,6 +3991,33 @@ dependencies = [
"memchr",
]
[[package]]
name = "object_storage"
version = "0.0.1"
dependencies = [
"anyhow",
"axum",
"axum-extra",
"camino",
"camino-tempfile",
"futures",
"http-body-util",
"itertools 0.10.5",
"jsonwebtoken",
"prometheus",
"rand 0.8.5",
"remote_storage",
"serde",
"serde_json",
"test-log",
"tokio",
"tokio-util",
"tower 0.5.2",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "once_cell"
version = "1.20.2"
@@ -4275,7 +4268,6 @@ dependencies = [
"hyper 0.14.30",
"indoc",
"itertools 0.10.5",
"jsonwebtoken",
"md5",
"metrics",
"nix 0.27.1",
@@ -4285,7 +4277,6 @@ dependencies = [
"pageserver_api",
"pageserver_client",
"pageserver_compaction",
"pem",
"pin-project-lite",
"postgres-protocol",
"postgres-types",
@@ -4353,7 +4344,6 @@ dependencies = [
"humantime-serde",
"itertools 0.10.5",
"nix 0.27.1",
"once_cell",
"postgres_backend",
"postgres_ffi",
"rand 0.8.5",
@@ -4415,9 +4405,9 @@ dependencies = [
[[package]]
name = "papaya"
version = "0.2.1"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6827e3fc394523c21d4464d02c0bb1c19966ea4a58a9844ad6d746214179d2bc"
checksum = "aab21828b6b5952fdadd6c377728ffae53ec3a21b2febc47319ab65741f7e2fd"
dependencies = [
"equivalent",
"seize",
@@ -5204,7 +5194,7 @@ dependencies = [
"walkdir",
"workspace_hack",
"x509-cert",
"zerocopy 0.8.24",
"zerocopy",
]
[[package]]
@@ -5594,7 +5584,7 @@ dependencies = [
"wasm-bindgen-futures",
"wasm-streams",
"web-sys",
"webpki-roots",
"webpki-roots 0.26.1",
"winreg",
]
@@ -5694,9 +5684,9 @@ dependencies = [
[[package]]
name = "ring"
version = "0.17.14"
version = "0.17.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee"
dependencies = [
"cc",
"cfg-if",
@@ -5997,12 +5987,10 @@ dependencies = [
"humantime",
"hyper 0.14.30",
"itertools 0.10.5",
"jsonwebtoken",
"metrics",
"once_cell",
"pageserver_api",
"parking_lot 0.12.1",
"pem",
"postgres-protocol",
"postgres_backend",
"postgres_ffi",
@@ -6195,13 +6183,13 @@ checksum = "224e328af6e080cddbab3c770b1cf50f0351ba0577091ef2410c3951d835ff87"
[[package]]
name = "sentry"
version = "0.37.0"
version = "0.32.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "255914a8e53822abd946e2ce8baa41d4cded6b8e938913b7f7b9da5b7ab44335"
checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02"
dependencies = [
"httpdate",
"reqwest",
"rustls 0.23.18",
"rustls 0.21.12",
"sentry-backtrace",
"sentry-contexts",
"sentry-core",
@@ -6209,14 +6197,14 @@ dependencies = [
"sentry-tracing",
"tokio",
"ureq",
"webpki-roots",
"webpki-roots 0.25.2",
]
[[package]]
name = "sentry-backtrace"
version = "0.37.0"
version = "0.32.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00293cd332a859961f24fd69258f7e92af736feaeb91020cff84dac4188a4302"
checksum = "a79194074f34b0cbe5dd33896e5928bbc6ab63a889bd9df2264af5acb186921e"
dependencies = [
"backtrace",
"once_cell",
@@ -6226,9 +6214,9 @@ dependencies = [
[[package]]
name = "sentry-contexts"
version = "0.37.0"
version = "0.32.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "961990f9caa76476c481de130ada05614cd7f5aa70fb57c2142f0e09ad3fb2aa"
checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a"
dependencies = [
"hostname",
"libc",
@@ -6240,9 +6228,9 @@ dependencies = [
[[package]]
name = "sentry-core"
version = "0.37.0"
version = "0.32.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a6409d845707d82415c800290a5d63be5e3df3c2e417b0997c60531dfbd35ef"
checksum = "46a75011ea1c0d5c46e9e57df03ce81f5c7f0a9e199086334a1f9c0a541e0826"
dependencies = [
"once_cell",
"rand 0.8.5",
@@ -6253,9 +6241,9 @@ dependencies = [
[[package]]
name = "sentry-panic"
version = "0.37.0"
version = "0.32.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "609b1a12340495ce17baeec9e08ff8ed423c337c1a84dffae36a178c783623f3"
checksum = "2eaa3ecfa3c8750c78dcfd4637cfa2598b95b52897ed184b4dc77fcf7d95060d"
dependencies = [
"sentry-backtrace",
"sentry-core",
@@ -6263,9 +6251,9 @@ dependencies = [
[[package]]
name = "sentry-tracing"
version = "0.37.0"
version = "0.32.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49f4e86402d5c50239dc7d8fd3f6d5e048221d5fcb4e026d8d50ab57fe4644cb"
checksum = "f715932bf369a61b7256687c6f0554141b7ce097287e30e3f7ed6e9de82498fe"
dependencies = [
"sentry-backtrace",
"sentry-core",
@@ -6275,9 +6263,9 @@ dependencies = [
[[package]]
name = "sentry-types"
version = "0.37.0"
version = "0.32.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d3f117b8755dbede8260952de2aeb029e20f432e72634e8969af34324591631"
checksum = "4519c900ce734f7a0eb7aba0869dfb225a7af8820634a7dd51449e3b093cfb7c"
dependencies = [
"debugid",
"hex",
@@ -6711,6 +6699,8 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-stream",
"aws-config",
"aws-sdk-s3",
"camino",
"chrono",
"clap",
@@ -7799,7 +7789,7 @@ dependencies = [
"rustls 0.23.18",
"rustls-pki-types",
"url",
"webpki-roots",
"webpki-roots 0.26.1",
]
[[package]]
@@ -7881,7 +7871,6 @@ dependencies = [
"metrics",
"nix 0.27.1",
"once_cell",
"pem",
"pin-project-lite",
"postgres_connection",
"pprof",
@@ -8167,6 +8156,12 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "webpki-roots"
version = "0.25.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc"
[[package]]
name = "webpki-roots"
version = "0.26.1"
@@ -8474,8 +8469,6 @@ dependencies = [
"regex-syntax 0.8.2",
"reqwest",
"rustls 0.23.18",
"rustls-pki-types",
"rustls-webpki 0.102.8",
"scopeguard",
"sec1 0.7.3",
"serde",
@@ -8504,6 +8497,7 @@ dependencies = [
"tracing-log",
"url",
"uuid",
"zerocopy",
"zeroize",
"zstd",
"zstd-safe",
@@ -8607,16 +8601,8 @@ version = "0.7.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d"
dependencies = [
"zerocopy-derive 0.7.31",
]
[[package]]
name = "zerocopy"
version = "0.8.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879"
dependencies = [
"zerocopy-derive 0.8.24",
"byteorder",
"zerocopy-derive",
]
[[package]]
@@ -8630,17 +8616,6 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "zerocopy-derive"
version = "0.8.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "zerofrom"
version = "0.1.5"

View File

@@ -40,7 +40,7 @@ members = [
"libs/proxy/postgres-protocol2",
"libs/proxy/postgres-types2",
"libs/proxy/tokio-postgres2",
"endpoint_storage",
"object_storage",
]
[workspace.package]
@@ -141,7 +141,6 @@ parking_lot = "0.12"
parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pem = "3.0.3"
pin-project-lite = "0.2"
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
procfs = "0.16"
@@ -164,7 +163,7 @@ scopeguard = "1.1"
sysinfo = "0.29.2"
sd-notify = "0.4.1"
send-future = "0.1.0"
sentry = { version = "0.37", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_path_to_error = "0.1"
@@ -175,7 +174,6 @@ signal-hook = "0.3"
smallvec = "1.11"
smol_str = { version = "0.2.0", features = ["serde"] }
socket2 = "0.5"
spki = "0.7.3"
strum = "0.26"
strum_macros = "0.26"
"subtle" = "2.5.0"
@@ -220,7 +218,7 @@ uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"
rustls-native-certs = "0.8"
whoami = "1.5.1"
zerocopy = { version = "0.8", features = ["derive", "simd"] }
zerocopy = { version = "0.7", features = ["derive"] }
json-structural-diff = { version = "0.2.0" }
x509-cert = { version = "0.2.5" }

View File

@@ -89,7 +89,7 @@ RUN set -e \
--bin storage_broker \
--bin storage_controller \
--bin proxy \
--bin endpoint_storage \
--bin object_storage \
--bin neon_local \
--bin storage_scrubber \
--locked --release
@@ -122,7 +122,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/endpoint_storage /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/object_storage /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin

View File

@@ -270,7 +270,7 @@ By default, this runs both debug and release modes, and all supported postgres v
testing locally, it is convenient to run just one set of permutations, like this:
```sh
DEFAULT_PG_VERSION=17 BUILD_TYPE=release ./scripts/pytest
DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
```
## Flamegraphs

View File

@@ -173,7 +173,7 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
&& rm -rf protoc.zip protoc
# s5cmd
ENV S5CMD_VERSION=2.3.0
ENV S5CMD_VERSION=2.2.2
RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
&& chmod +x s5cmd \
&& mv s5cmd /usr/local/bin/s5cmd
@@ -206,7 +206,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
&& rm awscliv2.zip
# Mold: A Modern Linker
ENV MOLD_VERSION=v2.37.1
ENV MOLD_VERSION=v2.34.1
RUN set -e \
&& git clone https://github.com/rui314/mold.git \
&& mkdir mold/build \
@@ -268,7 +268,7 @@ WORKDIR /home/nonroot
RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc
# Python
ENV PYTHON_VERSION=3.11.12 \
ENV PYTHON_VERSION=3.11.10 \
PYENV_ROOT=/home/nonroot/.pyenv \
PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
RUN set -e \
@@ -296,12 +296,12 @@ ENV RUSTC_VERSION=1.86.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
ARG RUSTFILT_VERSION=0.2.1
ARG CARGO_HAKARI_VERSION=0.9.36
ARG CARGO_DENY_VERSION=0.18.2
ARG CARGO_HACK_VERSION=0.6.36
ARG CARGO_NEXTEST_VERSION=0.9.94
ARG CARGO_HAKARI_VERSION=0.9.33
ARG CARGO_DENY_VERSION=0.16.2
ARG CARGO_HACK_VERSION=0.6.33
ARG CARGO_NEXTEST_VERSION=0.9.85
ARG CARGO_CHEF_VERSION=0.1.71
ARG CARGO_DIESEL_CLI_VERSION=2.2.9
ARG CARGO_DIESEL_CLI_VERSION=2.2.6
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
chmod +x rustup-init && \
./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \

View File

@@ -12,5 +12,3 @@ disallowed-macros = [
# cannot disallow this, because clippy finds used from tokio macros
#"tokio::pin",
]
allow-unwrap-in-tests = true

View File

@@ -1677,7 +1677,7 @@ RUN set -e \
&& apt clean && rm -rf /var/lib/apt/lists/*
# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
ENV PGBOUNCER_TAG=pgbouncer_1_24_1
ENV PGBOUNCER_TAG=pgbouncer_1_22_1
RUN set -e \
&& git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
&& cd pgbouncer \

View File

@@ -0,0 +1,265 @@
commit 00aa659afc9c7336ab81036edec3017168aabf40
Author: Heikki Linnakangas <heikki@neon.tech>
Date: Tue Nov 12 16:59:19 2024 +0200
Temporarily disable test that depends on timezone
diff --git a/tests/expected/generalization.out b/tests/expected/generalization.out
index 23ef5fa..9e60deb 100644
--- a/ext-src/pg_anon-src/tests/expected/generalization.out
+++ b/ext-src/pg_anon-src/tests/expected/generalization.out
@@ -284,12 +284,9 @@ SELECT anon.generalize_tstzrange('19041107','century');
["Tue Jan 01 00:00:00 1901 PST","Mon Jan 01 00:00:00 2001 PST")
(1 row)
-SELECT anon.generalize_tstzrange('19041107','millennium');
- generalize_tstzrange
------------------------------------------------------------------
- ["Thu Jan 01 00:00:00 1001 PST","Mon Jan 01 00:00:00 2001 PST")
-(1 row)
-
+-- temporarily disabled, see:
+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
+--SELECT anon.generalize_tstzrange('19041107','millennium');
-- generalize_daterange
SELECT anon.generalize_daterange('19041107');
generalize_daterange
diff --git a/tests/sql/generalization.sql b/tests/sql/generalization.sql
index b868344..b4fc977 100644
--- a/ext-src/pg_anon-src/tests/sql/generalization.sql
+++ b/ext-src/pg_anon-src/tests/sql/generalization.sql
@@ -61,7 +61,9 @@ SELECT anon.generalize_tstzrange('19041107','month');
SELECT anon.generalize_tstzrange('19041107','year');
SELECT anon.generalize_tstzrange('19041107','decade');
SELECT anon.generalize_tstzrange('19041107','century');
-SELECT anon.generalize_tstzrange('19041107','millennium');
+-- temporarily disabled, see:
+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
+--SELECT anon.generalize_tstzrange('19041107','millennium');
-- generalize_daterange
SELECT anon.generalize_daterange('19041107');
commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
Author: Alexey Masterov <alexeymasterov@neon.tech>
Date: Fri May 31 06:34:26 2024 +0000
These alternative expected files were added to consider the neon features
diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
new file mode 100644
index 0000000..2539cfd
--- /dev/null
+++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
@@ -0,0 +1,101 @@
+BEGIN;
+CREATE EXTENSION anon CASCADE;
+NOTICE: installing required extension "pgcrypto"
+SELECT anon.init();
+ init
+------
+ t
+(1 row)
+
+CREATE ROLE mallory_the_masked_user;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
+CREATE TABLE t1(i INT);
+ALTER TABLE t1 ADD COLUMN t TEXT;
+SECURITY LABEL FOR anon ON COLUMN t1.t
+IS 'MASKED WITH VALUE NULL';
+INSERT INTO t1 VALUES (1,'test');
+--
+-- We're checking the owner's permissions
+--
+-- see
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
+--
+SET ROLE mallory_the_masked_user;
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
+ ?column?
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+ PERFORM anon.init();
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+-- SHOULD FAIL
+DO $$
+BEGIN
+ PERFORM anon.anonymize_table('t1');
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+-- SHOULD FAIL
+SAVEPOINT fail_start_engine;
+SELECT anon.start_dynamic_masking();
+ERROR: Only supersusers can start the dynamic masking engine.
+CONTEXT: PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE
+ROLLBACK TO fail_start_engine;
+RESET ROLE;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking
+-----------------------
+ t
+(1 row)
+
+SET ROLE mallory_the_masked_user;
+SELECT * FROM mask.t1;
+ i | t
+---+---
+ 1 |
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+ SELECT * FROM public.t1;
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+-- SHOULD FAIL
+SAVEPOINT fail_stop_engine;
+SELECT anon.stop_dynamic_masking();
+ERROR: Only supersusers can stop the dynamic masking engine.
+CONTEXT: PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE
+ROLLBACK TO fail_stop_engine;
+RESET ROLE;
+SELECT anon.stop_dynamic_masking();
+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
+ stop_dynamic_masking
+----------------------
+ t
+(1 row)
+
+SET ROLE mallory_the_masked_user;
+SELECT COUNT(*)=1 FROM anon.pg_masking_rules;
+ ?column?
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+SAVEPOINT fail_seclabel_on_role;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
+ERROR: permission denied
+DETAIL: The current user must have the CREATEROLE attribute.
+ROLLBACK TO fail_seclabel_on_role;
+ROLLBACK;
diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
new file mode 100644
index 0000000..8b090fe
--- /dev/null
+++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
@@ -0,0 +1,104 @@
+BEGIN;
+CREATE EXTENSION anon CASCADE;
+NOTICE: installing required extension "pgcrypto"
+SELECT anon.init();
+ init
+------
+ t
+(1 row)
+
+CREATE ROLE oscar_the_owner;
+ALTER DATABASE :DBNAME OWNER TO oscar_the_owner;
+CREATE ROLE mallory_the_masked_user;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
+--
+-- We're checking the owner's permissions
+--
+-- see
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
+--
+SET ROLE oscar_the_owner;
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
+ ?column?
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+ PERFORM anon.init();
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+CREATE TABLE t1(i INT);
+ALTER TABLE t1 ADD COLUMN t TEXT;
+SECURITY LABEL FOR anon ON COLUMN t1.t
+IS 'MASKED WITH VALUE NULL';
+INSERT INTO t1 VALUES (1,'test');
+SELECT anon.anonymize_table('t1');
+ anonymize_table
+-----------------
+ t
+(1 row)
+
+SELECT * FROM t1;
+ i | t
+---+---
+ 1 |
+(1 row)
+
+UPDATE t1 SET t='test' WHERE i=1;
+-- SHOULD FAIL
+SAVEPOINT fail_start_engine;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking
+-----------------------
+ t
+(1 row)
+
+ROLLBACK TO fail_start_engine;
+RESET ROLE;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking
+-----------------------
+ t
+(1 row)
+
+SET ROLE oscar_the_owner;
+SELECT * FROM t1;
+ i | t
+---+------
+ 1 | test
+(1 row)
+
+--SELECT * FROM mask.t1;
+-- SHOULD FAIL
+SAVEPOINT fail_stop_engine;
+SELECT anon.stop_dynamic_masking();
+ERROR: permission denied for schema mask
+CONTEXT: SQL statement "DROP VIEW mask.t1;"
+PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE
+SQL statement "SELECT anon.mask_drop_view(oid)
+ FROM pg_catalog.pg_class
+ WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE
+ AND relkind IN ('r','p','f')"
+PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM
+ROLLBACK TO fail_stop_engine;
+RESET ROLE;
+SELECT anon.stop_dynamic_masking();
+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
+ stop_dynamic_masking
+----------------------
+ t
+(1 row)
+
+SET ROLE oscar_the_owner;
+-- SHOULD FAIL
+SAVEPOINT fail_seclabel_on_role;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
+ERROR: permission denied
+DETAIL: The current user must have the CREATEROLE attribute.
+ROLLBACK TO fail_seclabel_on_role;
+ROLLBACK;

View File

@@ -11,14 +11,6 @@ index bf6edcb..89b4c7f 100644
USE_PGXS = 1 # use pgxs if not in contrib directory
PGXS := $(shell $(PG_CONFIG) --pgxs)
diff --git a/regress/expected/init-extension.out b/regress/expected/init-extension.out
index 9f2e171..f6e4f8d 100644
--- a/regress/expected/init-extension.out
+++ b/regress/expected/init-extension.out
@@ -1,3 +1,2 @@
SET client_min_messages = warning;
CREATE EXTENSION pg_repack;
-RESET client_min_messages;
diff --git a/regress/expected/nosuper.out b/regress/expected/nosuper.out
index 8d0a94e..63b68bf 100644
--- a/regress/expected/nosuper.out
@@ -50,14 +42,6 @@ index 8d0a94e..63b68bf 100644
INFO: repacking table "public.tbl_cluster"
ERROR: query failed: ERROR: current transaction is aborted, commands ignored until end of transaction block
DETAIL: query was: RESET lock_timeout
diff --git a/regress/sql/init-extension.sql b/regress/sql/init-extension.sql
index 9f2e171..f6e4f8d 100644
--- a/regress/sql/init-extension.sql
+++ b/regress/sql/init-extension.sql
@@ -1,3 +1,2 @@
SET client_min_messages = warning;
CREATE EXTENSION pg_repack;
-RESET client_min_messages;
diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql
index 072f0fa..dbe60f8 100644
--- a/regress/sql/nosuper.sql

View File

@@ -15,7 +15,7 @@ index 7a4b88c..56678af 100644
HEADERS = src/halfvec.h src/sparsevec.h src/vector.h
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
index b667478..1298aa1 100644
index b667478..dc95d89 100644
--- a/src/hnswbuild.c
+++ b/src/hnswbuild.c
@@ -843,9 +843,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
@@ -36,7 +36,7 @@ index b667478..1298aa1 100644
/* Close relations within worker */
index_close(indexRel, indexLockmode);
table_close(heapRel, heapLockmode);
@@ -1100,13 +1108,25 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
@@ -1100,12 +1108,39 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
SeedRandom(42);
#endif
@@ -48,17 +48,32 @@ index b667478..1298aa1 100644
BuildGraph(buildstate, forkNum);
- if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM)
+#ifdef NEON_SMGR
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
+#endif
+
if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM)
+ if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) {
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocksInFork(index, forkNum), true);
+#ifdef NEON_SMGR
+ {
+#if PG_VERSION_NUM >= 160000
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+ if (set_lwlsn_block_range_hook)
+ set_lwlsn_block_range_hook(XactLastRecEnd, rlocator,
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+ if (set_lwlsn_relation_hook)
+ set_lwlsn_relation_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+ }
+#endif
+ }
+
+#ifdef NEON_SMGR
+ smgr_end_unlogged_build(RelationGetSmgr(index));
+#endif
+
FreeBuildState(buildstate);
}

View File

@@ -1,5 +1,5 @@
diff --git a/src/ruminsert.c b/src/ruminsert.c
index 255e616..1c6edb7 100644
index 255e616..7a2240f 100644
--- a/src/ruminsert.c
+++ b/src/ruminsert.c
@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
@@ -24,12 +24,24 @@ index 255e616..1c6edb7 100644
/*
* Write index to xlog
*/
@@ -713,6 +721,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
@@ -713,6 +721,22 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
UnlockReleaseBuffer(buffer);
}
+#ifdef NEON_SMGR
+ smgr_end_unlogged_build(index->rd_smgr);
+ {
+#if PG_VERSION_NUM >= 160000
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+ if (set_lwlsn_block_range_hook)
+ set_lwlsn_block_range_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+ if (set_lwlsn_relation_hook)
+ set_lwlsn_relation_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+
+ smgr_end_unlogged_build(index->rd_smgr);
+ }
+#endif
+
/*

View File

@@ -22,7 +22,7 @@ commands:
- name: local_proxy
user: postgres
sysvInitAction: respawn
shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
- name: postgres-exporter
user: nobody
sysvInitAction: respawn

View File

@@ -22,7 +22,7 @@ commands:
- name: local_proxy
user: postgres
sysvInitAction: respawn
shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
- name: postgres-exporter
user: nobody
sysvInitAction: respawn

View File

@@ -29,12 +29,13 @@
//! ```sh
//! compute_ctl -D /var/db/postgres/compute \
//! -C 'postgresql://cloud_admin@localhost/postgres' \
//! -c /var/db/postgres/configs/config.json \
//! -S /var/db/postgres/specs/current.json \
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway \
//! ```
use std::ffi::OsString;
use std::fs::File;
use std::path::Path;
use std::process::exit;
use std::sync::mpsc;
use std::thread;
@@ -42,7 +43,8 @@ use std::time::Duration;
use anyhow::{Context, Result};
use clap::Parser;
use compute_api::responses::ComputeConfig;
use compute_api::responses::ComputeCtlConfig;
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{
BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal,
};
@@ -57,13 +59,24 @@ use tracing::{error, info};
use url::Url;
use utils::failpoint_support;
// Compatibility hack: if the control plane specified any remote-ext-config
// use the default value for extension storage proxy gateway.
// Remove this once the control plane is updated to pass the gateway URL
fn parse_remote_ext_config(arg: &str) -> Result<String> {
if arg.starts_with("http") {
Ok(arg.trim_end_matches('/').to_string())
} else {
Ok("http://pg-ext-s3-gateway".to_string())
}
}
#[derive(Parser)]
#[command(rename_all = "kebab-case")]
struct Cli {
#[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
pub pgbin: String,
#[arg(short = 'r', long)]
#[arg(short = 'r', long, value_parser = parse_remote_ext_config)]
pub remote_ext_config: Option<String>,
/// The port to bind the external listening HTTP server to. Clients running
@@ -105,8 +118,8 @@ struct Cli {
#[arg(long)]
pub set_disk_quota_for_fs: Option<String>,
#[arg(short = 'c', long)]
pub config: Option<OsString>,
#[arg(short = 'S', long, group = "spec-path")]
pub spec_path: Option<OsString>,
#[arg(short = 'i', long, group = "compute-id")]
pub compute_id: String,
@@ -114,9 +127,8 @@ struct Cli {
#[arg(
short = 'p',
long,
conflicts_with = "config",
value_name = "CONTROL_PLANE_API_BASE_URL",
requires = "compute-id"
conflicts_with = "spec-path",
value_name = "CONTROL_PLANE_API_BASE_URL"
)]
pub control_plane_uri: Option<String>,
}
@@ -126,7 +138,7 @@ fn main() -> Result<()> {
let scenario = failpoint_support::init();
// For historical reasons, the main thread that processes the config and launches postgres
// For historical reasons, the main thread that processes the spec and launches postgres
// is synchronous, but we always have this tokio runtime available and we "enter" it so
// that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...)
// from all parts of compute_ctl.
@@ -142,7 +154,7 @@ fn main() -> Result<()> {
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
let config = get_config(&cli)?;
let cli_spec = try_spec_from_cli(&cli)?;
let compute_node = ComputeNode::new(
ComputeNodeParams {
@@ -163,7 +175,8 @@ fn main() -> Result<()> {
#[cfg(target_os = "linux")]
vm_monitor_addr: cli.vm_monitor_addr,
},
config,
cli_spec.spec,
cli_spec.compute_ctl_config,
)?;
let exit_code = compute_node.run()?;
@@ -188,17 +201,27 @@ async fn init() -> Result<()> {
Ok(())
}
fn get_config(cli: &Cli) -> Result<ComputeConfig> {
// First, read the config from the path if provided
if let Some(ref config) = cli.config {
let file = File::open(config)?;
return Ok(serde_json::from_reader(&file)?);
fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
// First, read spec from the path if provided
if let Some(ref spec_path) = cli.spec_path {
let file = File::open(Path::new(spec_path))?;
return Ok(CliSpecParams {
spec: Some(serde_json::from_reader(file)?),
compute_ctl_config: ComputeCtlConfig::default(),
});
}
// If the config wasn't provided in the CLI arguments, then retrieve it from
if cli.control_plane_uri.is_none() {
panic!("must specify --control-plane-uri");
};
// If the spec wasn't provided in the CLI arguments, then retrieve it from
// the control plane
match get_config_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
Ok(config) => Ok(config),
match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
Ok(resp) => Ok(CliSpecParams {
spec: resp.0,
compute_ctl_config: resp.1,
}),
Err(e) => {
error!(
"cannot get response from control plane: {}\n\
@@ -210,6 +233,13 @@ fn get_config(cli: &Cli) -> Result<ComputeConfig> {
}
}
struct CliSpecParams {
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
spec: Option<ComputeSpec>,
#[allow(dead_code)]
compute_ctl_config: ComputeCtlConfig,
}
fn deinit_and_exit(exit_code: Option<i32>) -> ! {
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit. Shutting down OTEL tracing provider may

View File

@@ -11,7 +11,7 @@ use std::{env, fs};
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use compute_api::privilege::Privilege;
use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus};
use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus};
use compute_api::spec::{
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
};
@@ -303,7 +303,11 @@ struct StartVmMonitorResult {
}
impl ComputeNode {
pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result<Self> {
pub fn new(
params: ComputeNodeParams,
cli_spec: Option<ComputeSpec>,
compute_ctl_config: ComputeCtlConfig,
) -> Result<Self> {
let connstr = params.connstr.as_str();
let conn_conf = postgres::config::Config::from_str(connstr)
.context("cannot build postgres config from connstr")?;
@@ -311,8 +315,8 @@ impl ComputeNode {
.context("cannot build tokio postgres config from connstr")?;
let mut new_state = ComputeState::new();
if let Some(spec) = config.spec {
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
if let Some(cli_spec) = cli_spec {
let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?;
new_state.pspec = Some(pspec);
}
@@ -323,7 +327,7 @@ impl ComputeNode {
state: Mutex::new(new_state),
state_changed: Condvar::new(),
ext_download_progress: RwLock::new(HashMap::new()),
compute_ctl_config: config.compute_ctl_config,
compute_ctl_config,
})
}
@@ -519,14 +523,11 @@ impl ComputeNode {
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
info!(
"starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}",
"starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}",
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
pspec.tenant_id,
pspec.timeline_id,
pspec.spec.project_id.as_deref().unwrap_or("None"),
pspec.spec.branch_id.as_deref().unwrap_or("None"),
pspec.spec.endpoint_id.as_deref().unwrap_or("None"),
pspec.spec.features,
pspec.spec.remote_extensions,
);
@@ -630,42 +631,19 @@ impl ComputeNode {
});
}
// Configure and start rsyslog for compliance audit logging
match pspec.spec.audit_log_level {
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
let remote_endpoint =
std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
if remote_endpoint.is_empty() {
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
}
let log_directory_path = Path::new(&self.params.pgdata).join("log");
let log_directory_path = log_directory_path.to_string_lossy().to_string();
// Add project_id,endpoint_id tag to identify the logs.
//
// These ids are passed from cplane,
// for backwards compatibility (old computes that don't have them),
// we set them to None.
// TODO: Clean up this code when all computes have them.
let tag: Option<String> = match (
pspec.spec.project_id.as_deref(),
pspec.spec.endpoint_id.as_deref(),
) {
(Some(project_id), Some(endpoint_id)) => {
Some(format!("{project_id}/{endpoint_id}"))
}
(Some(project_id), None) => Some(format!("{project_id}/None")),
(None, Some(endpoint_id)) => Some(format!("None,{endpoint_id}")),
(None, None) => None,
};
configure_audit_rsyslog(log_directory_path.clone(), tag, &remote_endpoint)?;
// Launch a background task to clean up the audit logs
launch_pgaudit_gc(log_directory_path);
// Configure and start rsyslog for HIPAA if necessary
if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
if remote_endpoint.is_empty() {
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
}
_ => {}
let log_directory_path = Path::new(&self.params.pgdata).join("log");
let log_directory_path = log_directory_path.to_string_lossy().to_string();
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
// Launch a background task to clean up the audit logs
launch_pgaudit_gc(log_directory_path);
}
// Configure and start rsyslog for Postgres logs export

View File

@@ -89,15 +89,6 @@ pub fn write_postgres_conf(
escape_conf_value(&s.to_string())
)?;
}
if let Some(s) = &spec.project_id {
writeln!(file, "neon.project_id={}", escape_conf_value(s))?;
}
if let Some(s) = &spec.branch_id {
writeln!(file, "neon.branch_id={}", escape_conf_value(s))?;
}
if let Some(s) = &spec.endpoint_id {
writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?;
}
// tls
if let Some(tls_config) = tls_config {
@@ -178,7 +169,7 @@ pub fn write_postgres_conf(
// and don't allow the user or the control plane admin to change them.
match spec.audit_log_level {
ComputeAudit::Disabled => {}
ComputeAudit::Log | ComputeAudit::Base => {
ComputeAudit::Log => {
writeln!(file, "# Managed by compute_ctl base audit settings: start")?;
writeln!(file, "pgaudit.log='ddl,role'")?;
// Disable logging of catalog queries to reduce the noise
@@ -202,20 +193,16 @@ pub fn write_postgres_conf(
}
writeln!(file, "# Managed by compute_ctl base audit settings: end")?;
}
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
ComputeAudit::Hipaa => {
writeln!(
file,
"# Managed by compute_ctl compliance audit settings: begin"
)?;
// Enable logging of parameters.
// This is very verbose and may contain sensitive data.
if spec.audit_log_level == ComputeAudit::Full {
writeln!(file, "pgaudit.log_parameter=on")?;
writeln!(file, "pgaudit.log='all'")?;
} else {
writeln!(file, "pgaudit.log_parameter=off")?;
writeln!(file, "pgaudit.log='all, -misc'")?;
}
// This log level is very verbose
// but this is necessary for HIPAA compliance.
// Exclude 'misc' category, because it doesn't contain anythig relevant.
writeln!(file, "pgaudit.log='all, -misc'")?;
writeln!(file, "pgaudit.log_parameter=on")?;
// Disable logging of catalog queries
// The catalog doesn't contain sensitive data, so we don't need to audit it.
writeln!(file, "pgaudit.log_catalog=off")?;

View File

@@ -6,5 +6,4 @@ pub(crate) mod request_id;
pub(crate) use json::Json;
pub(crate) use path::Path;
pub(crate) use query::Query;
#[allow(unused)]
pub(crate) use request_id::RequestId;

View File

@@ -1,7 +1,7 @@
use std::collections::HashSet;
use std::{collections::HashSet, net::SocketAddr};
use anyhow::{Result, anyhow};
use axum::{RequestExt, body::Body};
use axum::{RequestExt, body::Body, extract::ConnectInfo};
use axum_extra::{
TypedHeader,
headers::{Authorization, authorization::Bearer},
@@ -11,9 +11,9 @@ use futures::future::BoxFuture;
use http::{Request, Response, StatusCode};
use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
use tower_http::auth::AsyncAuthorizeRequest;
use tracing::{debug, warn};
use tracing::warn;
use crate::http::JsonResponse;
use crate::http::{JsonResponse, extract::RequestId};
#[derive(Clone, Debug)]
pub(in crate::http) struct Authorize {
@@ -52,6 +52,31 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
let validation = self.validation.clone();
Box::pin(async move {
let request_id = request.extract_parts::<RequestId>().await.unwrap();
// TODO: Remove this stanza after teaching neon_local and the
// regression tests to use a JWT + JWKS.
//
// https://github.com/neondatabase/neon/issues/11316
if cfg!(feature = "testing") {
warn!(%request_id, "Skipping compute_ctl authorization check");
return Ok(request);
}
let connect_info = request
.extract_parts::<ConnectInfo<SocketAddr>>()
.await
.unwrap();
// In the event the request is coming from the loopback interface,
// allow all requests
if connect_info.ip().is_loopback() {
warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface");
return Ok(request);
}
let TypedHeader(Authorization(bearer)) = request
.extract_parts::<TypedHeader<Authorization<Bearer>>>()
.await
@@ -67,7 +92,7 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
if data.claims.compute_id != compute_id {
return Err(JsonResponse::error(
StatusCode::UNAUTHORIZED,
"invalid compute ID in authorization token claims",
"invalid claims in authorization token",
));
}
@@ -87,16 +112,12 @@ impl Authorize {
token: &str,
validation: &Validation,
) -> Result<TokenData<ComputeClaims>> {
debug_assert!(!jwks.keys.is_empty());
debug!("verifying token {}", token);
for jwk in jwks.keys.iter() {
let decoding_key = match DecodingKey::from_jwk(jwk) {
Ok(key) => key,
Err(e) => {
warn!(
"failed to construct decoding key from {}: {}",
"Failed to construct decoding key from {}: {}",
jwk.common.key_id.as_ref().unwrap(),
e
);
@@ -109,7 +130,7 @@ impl Authorize {
Ok(data) => return Ok(data),
Err(e) => {
warn!(
"failed to decode authorization token using {}: {}",
"Failed to decode authorization token using {}: {}",
jwk.common.key_id.as_ref().unwrap(),
e
);
@@ -119,6 +140,6 @@ impl Authorize {
}
}
Err(anyhow!("failed to verify authorization token"))
Err(anyhow!("Failed to verify authorization token"))
}
}

View File

@@ -1,8 +1,8 @@
use metrics::core::{AtomicF64, AtomicU64, Collector, GenericCounter, GenericGauge};
use metrics::core::{AtomicF64, Collector, GenericGauge};
use metrics::proto::MetricFamily;
use metrics::{
IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter,
register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec,
IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter_vec,
register_int_gauge_vec, register_uint_gauge_vec,
};
use once_cell::sync::Lazy;
@@ -19,13 +19,13 @@ pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
// but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec.
// And it's fair to call it a 'RPC' (Remote Procedure Call).
pub enum CPlaneRequestRPC {
GetConfig,
GetSpec,
}
impl CPlaneRequestRPC {
pub fn as_str(&self) -> &str {
match self {
CPlaneRequestRPC::GetConfig => "GetConfig",
CPlaneRequestRPC::GetSpec => "GetSpec",
}
}
}
@@ -81,22 +81,6 @@ pub(crate) static COMPUTE_CTL_UP: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static PG_CURR_DOWNTIME_MS: Lazy<GenericGauge<AtomicF64>> = Lazy::new(|| {
register_gauge!(
"compute_pg_current_downtime_ms",
"Non-cumulative duration of Postgres downtime in ms; resets after successful check",
)
.expect("failed to define a metric")
});
pub(crate) static PG_TOTAL_DOWNTIME_MS: Lazy<GenericCounter<AtomicU64>> = Lazy::new(|| {
register_int_counter!(
"compute_pg_downtime_ms_total",
"Cumulative duration of Postgres downtime in ms",
)
.expect("failed to define a metric")
});
pub fn collect() -> Vec<MetricFamily> {
let mut metrics = COMPUTE_CTL_UP.collect();
metrics.extend(INSTALLED_EXTENSIONS.collect());
@@ -104,7 +88,5 @@ pub fn collect() -> Vec<MetricFamily> {
metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect());
metrics.extend(DB_MIGRATION_FAILED.collect());
metrics.extend(AUDIT_LOG_DIR_SIZE.collect());
metrics.extend(PG_CURR_DOWNTIME_MS.collect());
metrics.extend(PG_TOTAL_DOWNTIME_MS.collect());
metrics
}

View File

@@ -6,294 +6,197 @@ use chrono::{DateTime, Utc};
use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeFeature;
use postgres::{Client, NoTls};
use tracing::{Level, error, info, instrument, span};
use tracing::{debug, error, info, warn};
use crate::compute::ComputeNode;
use crate::metrics::{PG_CURR_DOWNTIME_MS, PG_TOTAL_DOWNTIME_MS};
const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
struct ComputeMonitor {
compute: Arc<ComputeNode>,
// Spin in a loop and figure out the last activity time in the Postgres.
// Then update it in the shared state. This function never errors out.
// NB: the only expected panic is at `Mutex` unwrap(), all other errors
// should be handled gracefully.
fn watch_compute_activity(compute: &ComputeNode) {
// Suppose that `connstr` doesn't change
let connstr = compute.params.connstr.clone();
let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor"));
/// The moment when Postgres had some activity,
/// that should prevent compute from being suspended.
last_active: Option<DateTime<Utc>>,
// During startup and configuration we connect to every Postgres database,
// but we don't want to count this as some user activity. So wait until
// the compute fully started before monitoring activity.
wait_for_postgres_start(compute);
/// The moment when we last tried to check Postgres.
last_checked: DateTime<Utc>,
/// The last moment we did a successful Postgres check.
last_up: DateTime<Utc>,
// Define `client` outside of the loop to reuse existing connection if it's active.
let mut client = conf.connect(NoTls);
/// Only used for internal statistics change tracking
/// between monitor runs and can be outdated.
active_time: Option<f64>,
/// Only used for internal statistics change tracking
/// between monitor runs and can be outdated.
sessions: Option<i64>,
let mut sleep = false;
let mut prev_active_time: Option<f64> = None;
let mut prev_sessions: Option<i64> = None;
/// Use experimental statistics-based activity monitor. It's no longer
/// 'experimental' per se, as it's enabled for everyone, but we still
/// keep the flag as an option to turn it off in some cases if it will
/// misbehave.
experimental: bool,
}
impl ComputeMonitor {
fn report_down(&self) {
let now = Utc::now();
// Calculate and report current downtime
// (since the last time Postgres was up)
let downtime = now.signed_duration_since(self.last_up);
PG_CURR_DOWNTIME_MS.set(downtime.num_milliseconds() as f64);
// Calculate and update total downtime
// (cumulative duration of Postgres downtime in ms)
let inc = now
.signed_duration_since(self.last_checked)
.num_milliseconds();
PG_TOTAL_DOWNTIME_MS.inc_by(inc as u64);
if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
info!("starting experimental activity monitor for {}", connstr);
} else {
info!("starting activity monitor for {}", connstr);
}
fn report_up(&mut self) {
self.last_up = Utc::now();
PG_CURR_DOWNTIME_MS.set(0.0);
}
fn downtime_info(&self) -> String {
format!(
"total_ms: {}, current_ms: {}, last_up: {}",
PG_TOTAL_DOWNTIME_MS.get(),
PG_CURR_DOWNTIME_MS.get(),
self.last_up
)
}
/// Spin in a loop and figure out the last activity time in the Postgres.
/// Then update it in the shared state. This function never errors out.
/// NB: the only expected panic is at `Mutex` unwrap(), all other errors
/// should be handled gracefully.
#[instrument(skip_all)]
pub fn run(&mut self) {
// Suppose that `connstr` doesn't change
let connstr = self.compute.params.connstr.clone();
let conf = self
.compute
.get_conn_conf(Some("compute_ctl:compute_monitor"));
// During startup and configuration we connect to every Postgres database,
// but we don't want to count this as some user activity. So wait until
// the compute fully started before monitoring activity.
wait_for_postgres_start(&self.compute);
// Define `client` outside of the loop to reuse existing connection if it's active.
let mut client = conf.connect(NoTls);
info!("starting compute monitor for {}", connstr);
loop {
match &mut client {
Ok(cli) => {
if cli.is_closed() {
info!(
downtime_info = self.downtime_info(),
"connection to Postgres is closed, trying to reconnect"
);
self.report_down();
// Connection is closed, reconnect and try again.
client = conf.connect(NoTls);
} else {
match self.check(cli) {
Ok(_) => {
self.report_up();
self.compute.update_last_active(self.last_active);
}
Err(e) => {
// Although we have many places where we can return errors in `check()`,
// normally it shouldn't happen. I.e., we will likely return error if
// connection got broken, query timed out, Postgres returned invalid data, etc.
// In all such cases it's suspicious, so let's report this as downtime.
self.report_down();
error!(
downtime_info = self.downtime_info(),
"could not check Postgres: {}", e
);
// Reconnect to Postgres just in case. During tests, I noticed
// that queries in `check()` can fail with `connection closed`,
// but `cli.is_closed()` above doesn't detect it. Even if old
// connection is still alive, it will be dropped when we reassign
// `client` to a new connection.
client = conf.connect(NoTls);
}
}
}
}
Err(e) => {
info!(
downtime_info = self.downtime_info(),
"could not connect to Postgres: {}, retrying", e
);
self.report_down();
// Establish a new connection and try again.
client = conf.connect(NoTls);
}
}
// Reset the `last_checked` timestamp and sleep before the next iteration.
self.last_checked = Utc::now();
loop {
// We use `continue` a lot, so it's more convenient to sleep at the top of the loop.
// But skip the first sleep, so we can connect to Postgres immediately.
if sleep {
// Should be outside of the mutex lock to allow others to read while we sleep.
thread::sleep(MONITOR_CHECK_INTERVAL);
} else {
sleep = true;
}
}
#[instrument(skip_all)]
fn check(&mut self, cli: &mut Client) -> anyhow::Result<()> {
// This is new logic, only enable if the feature flag is set.
// TODO: remove this once we are sure that it works OR drop it altogether.
if self.experimental {
// Check if the total active time or sessions across all databases has changed.
// If it did, it means that user executed some queries. In theory, it can even go down if
// some databases were dropped, but it's still user activity.
match get_database_stats(cli) {
Ok((active_time, sessions)) => {
let mut detected_activity = false;
match &mut client {
Ok(cli) => {
if cli.is_closed() {
info!("connection to Postgres is closed, trying to reconnect");
if let Some(prev_active_time) = self.active_time {
if active_time != prev_active_time {
detected_activity = true;
// Connection is closed, reconnect and try again.
client = conf.connect(NoTls);
continue;
}
// This is a new logic, only enable if the feature flag is set.
// TODO: remove this once we are sure that it works OR drop it altogether.
if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
// First, check if the total active time or sessions across all databases has changed.
// If it did, it means that user executed some queries. In theory, it can even go down if
// some databases were dropped, but it's still a user activity.
match get_database_stats(cli) {
Ok((active_time, sessions)) => {
let mut detected_activity = false;
prev_active_time = match prev_active_time {
Some(prev_active_time) => {
if active_time != prev_active_time {
detected_activity = true;
}
Some(active_time)
}
None => Some(active_time),
};
prev_sessions = match prev_sessions {
Some(prev_sessions) => {
if sessions != prev_sessions {
detected_activity = true;
}
Some(sessions)
}
None => Some(sessions),
};
if detected_activity {
// Update the last active time and continue, we don't need to
// check backends state change.
compute.update_last_active(Some(Utc::now()));
continue;
}
}
Err(e) => {
error!("could not get database statistics: {}", e);
continue;
}
}
self.active_time = Some(active_time);
}
if let Some(prev_sessions) = self.sessions {
if sessions != prev_sessions {
detected_activity = true;
// Second, if database statistics is the same, check all backends state change,
// maybe there is some with more recent activity. `get_backends_state_change()`
// can return None or stale timestamp, so it's `compute.update_last_active()`
// responsibility to check if the new timestamp is more recent than the current one.
// This helps us to discover new sessions, that did nothing yet.
match get_backends_state_change(cli) {
Ok(last_active) => {
compute.update_last_active(last_active);
}
Err(e) => {
error!("could not get backends state change: {}", e);
}
}
// Finally, if there are existing (logical) walsenders, do not suspend.
//
// walproposer doesn't currently show up in pg_stat_replication,
// but protect if it will be
let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
match cli.query_one(ws_count_query, &[]) {
Ok(r) => match r.try_get::<&str, i64>("count") {
Ok(num_ws) => {
if num_ws > 0 {
compute.update_last_active(Some(Utc::now()));
continue;
}
}
}
self.sessions = Some(sessions);
if detected_activity {
// Update the last active time and continue, we don't need to
// check backends state change.
self.last_active = Some(Utc::now());
return Ok(());
Err(e) => {
warn!("failed to parse walsenders count: {:?}", e);
continue;
}
},
Err(e) => {
warn!("failed to get list of walsenders: {:?}", e);
continue;
}
}
Err(e) => {
return Err(anyhow::anyhow!("could not get database statistics: {}", e));
//
// Don't suspend compute if there is an active logical replication subscription
//
// `where pid is not null` to filter out read only computes and subscription on branches
//
let logical_subscriptions_query =
"select count(*) from pg_stat_subscription where pid is not null;";
match cli.query_one(logical_subscriptions_query, &[]) {
Ok(row) => match row.try_get::<&str, i64>("count") {
Ok(num_subscribers) => {
if num_subscribers > 0 {
compute.update_last_active(Some(Utc::now()));
continue;
}
}
Err(e) => {
warn!("failed to parse `pg_stat_subscription` count: {:?}", e);
continue;
}
},
Err(e) => {
warn!(
"failed to get list of active logical replication subscriptions: {:?}",
e
);
continue;
}
}
//
// Do not suspend compute if autovacuum is running
//
let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
match cli.query_one(autovacuum_count_query, &[]) {
Ok(r) => match r.try_get::<&str, i64>("count") {
Ok(num_workers) => {
if num_workers > 0 {
compute.update_last_active(Some(Utc::now()));
continue;
}
}
Err(e) => {
warn!("failed to parse autovacuum workers count: {:?}", e);
continue;
}
},
Err(e) => {
warn!("failed to get list of autovacuum workers: {:?}", e);
continue;
}
}
}
}
// If database statistics are the same, check all backends for state changes.
// Maybe there are some with more recent activity. `get_backends_state_change()`
// can return None or stale timestamp, so it's `compute.update_last_active()`
// responsibility to check if the new timestamp is more recent than the current one.
// This helps us to discover new sessions that have not done anything yet.
match get_backends_state_change(cli) {
Ok(last_active) => match (last_active, self.last_active) {
(Some(last_active), Some(prev_last_active)) => {
if last_active > prev_last_active {
self.last_active = Some(last_active);
return Ok(());
}
}
(Some(last_active), None) => {
self.last_active = Some(last_active);
return Ok(());
}
_ => {}
},
Err(e) => {
return Err(anyhow::anyhow!(
"could not get backends state change: {}",
e
));
debug!("could not connect to Postgres: {}, retrying", e);
// Establish a new connection and try again.
client = conf.connect(NoTls);
}
}
// If there are existing (logical) walsenders, do not suspend.
//
// N.B. walproposer doesn't currently show up in pg_stat_replication,
// but protect if it will.
const WS_COUNT_QUERY: &str =
"select count(*) from pg_stat_replication where application_name != 'walproposer';";
match cli.query_one(WS_COUNT_QUERY, &[]) {
Ok(r) => match r.try_get::<&str, i64>("count") {
Ok(num_ws) => {
if num_ws > 0 {
self.last_active = Some(Utc::now());
return Ok(());
}
}
Err(e) => {
let err: anyhow::Error = e.into();
return Err(err.context("failed to parse walsenders count"));
}
},
Err(e) => {
return Err(anyhow::anyhow!("failed to get list of walsenders: {}", e));
}
}
// Don't suspend compute if there is an active logical replication subscription
//
// `where pid is not null` to filter out read only computes and subscription on branches
const LOGICAL_SUBSCRIPTIONS_QUERY: &str =
"select count(*) from pg_stat_subscription where pid is not null;";
match cli.query_one(LOGICAL_SUBSCRIPTIONS_QUERY, &[]) {
Ok(row) => match row.try_get::<&str, i64>("count") {
Ok(num_subscribers) => {
if num_subscribers > 0 {
self.last_active = Some(Utc::now());
return Ok(());
}
}
Err(e) => {
return Err(anyhow::anyhow!(
"failed to parse 'pg_stat_subscription' count: {}",
e
));
}
},
Err(e) => {
return Err(anyhow::anyhow!(
"failed to get list of active logical replication subscriptions: {}",
e
));
}
}
// Do not suspend compute if autovacuum is running
const AUTOVACUUM_COUNT_QUERY: &str =
"select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
match cli.query_one(AUTOVACUUM_COUNT_QUERY, &[]) {
Ok(r) => match r.try_get::<&str, i64>("count") {
Ok(num_workers) => {
if num_workers > 0 {
self.last_active = Some(Utc::now());
return Ok(());
};
}
Err(e) => {
return Err(anyhow::anyhow!(
"failed to parse autovacuum workers count: {}",
e
));
}
},
Err(e) => {
return Err(anyhow::anyhow!(
"failed to get list of autovacuum workers: {}",
e
));
}
}
Ok(())
}
}
@@ -412,24 +315,9 @@ fn get_backends_state_change(cli: &mut Client) -> anyhow::Result<Option<DateTime
/// Launch a separate compute monitor thread and return its `JoinHandle`.
pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
let compute = Arc::clone(compute);
let experimental = compute.has_feature(ComputeFeature::ActivityMonitorExperimental);
let now = Utc::now();
let mut monitor = ComputeMonitor {
compute,
last_active: None,
last_checked: now,
last_up: now,
active_time: None,
sessions: None,
experimental,
};
let span = span!(Level::INFO, "compute_monitor");
thread::Builder::new()
.name("compute-monitor".into())
.spawn(move || {
let _enter = span.enter();
monitor.run();
})
.spawn(move || watch_compute_activity(&compute))
.expect("cannot launch compute monitor thread")
}

View File

@@ -50,13 +50,13 @@ fn restart_rsyslog() -> Result<()> {
pub fn configure_audit_rsyslog(
log_directory: String,
tag: Option<String>,
tag: &str,
remote_endpoint: &str,
) -> Result<()> {
let config_content: String = format!(
include_str!("config_template/compute_audit_rsyslog_template.conf"),
log_directory = log_directory,
tag = tag.unwrap_or("".to_string()),
tag = tag,
remote_endpoint = remote_endpoint
);

View File

@@ -3,8 +3,9 @@ use std::path::Path;
use anyhow::{Result, anyhow, bail};
use compute_api::responses::{
ComputeConfig, ControlPlaneComputeStatus, ControlPlaneConfigResponse,
ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
};
use compute_api::spec::ComputeSpec;
use reqwest::StatusCode;
use tokio_postgres::Client;
use tracing::{error, info, instrument};
@@ -20,7 +21,7 @@ use crate::params::PG_HBA_ALL_MD5;
fn do_control_plane_request(
uri: &str,
jwt: &str,
) -> Result<ControlPlaneConfigResponse, (bool, String, String)> {
) -> Result<ControlPlaneSpecResponse, (bool, String, String)> {
let resp = reqwest::blocking::Client::new()
.get(uri)
.header("Authorization", format!("Bearer {}", jwt))
@@ -28,14 +29,14 @@ fn do_control_plane_request(
.map_err(|e| {
(
true,
format!("could not perform request to control plane: {:?}", e),
format!("could not perform spec request to control plane: {:?}", e),
UNKNOWN_HTTP_STATUS.to_string(),
)
})?;
let status = resp.status();
match status {
StatusCode::OK => match resp.json::<ControlPlaneConfigResponse>() {
StatusCode::OK => match resp.json::<ControlPlaneSpecResponse>() {
Ok(spec_resp) => Ok(spec_resp),
Err(e) => Err((
true,
@@ -68,35 +69,40 @@ fn do_control_plane_request(
}
}
/// Request config from the control-plane by compute_id. If
/// `NEON_CONTROL_PLANE_TOKEN` env variable is set, it will be used for
/// authorization.
pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeConfig> {
/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
/// env variable is set, it will be used for authorization.
pub fn get_spec_from_control_plane(
base_uri: &str,
compute_id: &str,
) -> Result<(Option<ComputeSpec>, ComputeCtlConfig)> {
let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
let jwt: String = std::env::var("NEON_CONTROL_PLANE_TOKEN").unwrap_or_default();
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
Ok(v) => v,
Err(_) => "".to_string(),
};
let mut attempt = 1;
info!("getting config from control plane: {}", cp_uri);
info!("getting spec from control plane: {}", cp_uri);
// Do 3 attempts to get spec from the control plane using the following logic:
// - network error -> then retry
// - compute id is unknown or any other error -> bail out
// - no spec for compute yet (Empty state) -> return Ok(None)
// - got config -> return Ok(Some(config))
// - got spec -> return Ok(Some(spec))
while attempt < 4 {
let result = match do_control_plane_request(&cp_uri, &jwt) {
Ok(config_resp) => {
Ok(spec_resp) => {
CPLANE_REQUESTS_TOTAL
.with_label_values(&[
CPlaneRequestRPC::GetConfig.as_str(),
CPlaneRequestRPC::GetSpec.as_str(),
&StatusCode::OK.to_string(),
])
.inc();
match config_resp.status {
ControlPlaneComputeStatus::Empty => Ok(config_resp.into()),
match spec_resp.status {
ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)),
ControlPlaneComputeStatus::Attached => {
if config_resp.spec.is_some() {
Ok(config_resp.into())
if let Some(spec) = spec_resp.spec {
Ok((Some(spec), spec_resp.compute_ctl_config))
} else {
bail!("compute is attached, but spec is empty")
}
@@ -105,7 +111,7 @@ pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result
}
Err((retry, msg, status)) => {
CPLANE_REQUESTS_TOTAL
.with_label_values(&[CPlaneRequestRPC::GetConfig.as_str(), &status])
.with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status])
.inc();
if retry {
Err(anyhow!(msg))
@@ -116,7 +122,7 @@ pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result
};
if let Err(e) = &result {
error!("attempt {} to get config failed with: {}", attempt, e);
error!("attempt {} to get spec failed with: {}", attempt, e);
} else {
return result;
}
@@ -127,13 +133,13 @@ pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result
// All attempts failed, return error.
Err(anyhow::anyhow!(
"Exhausted all attempts to retrieve the config from the control plane"
"Exhausted all attempts to retrieve the spec from the control plane"
))
}
/// Check `pg_hba.conf` and update if needed to allow external connections.
pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
// XXX: consider making it a part of config.json
// XXX: consider making it a part of spec.json
let pghba_path = pgdata_path.join("pg_hba.conf");
if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? {
@@ -147,7 +153,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
/// Create a standby.signal file
pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
// XXX: consider making it a part of config.json
// XXX: consider making it a part of spec.json
let signalfile = pgdata_path.join("standby.signal");
if !signalfile.exists() {

View File

@@ -278,12 +278,12 @@ impl ComputeNode {
// so that all config operations are audit logged.
match spec.audit_log_level
{
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
ComputeAudit::Hipaa => {
phases.push(CreatePgauditExtension);
phases.push(CreatePgauditlogtofileExtension);
phases.push(DisablePostgresDBPgAudit);
}
ComputeAudit::Log | ComputeAudit::Base => {
ComputeAudit::Log => {
phases.push(CreatePgauditExtension);
phases.push(DisablePostgresDBPgAudit);
}

View File

@@ -6,16 +6,13 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
base64.workspace = true
camino.workspace = true
clap.workspace = true
comfy-table.workspace = true
futures.workspace = true
humantime.workspace = true
jsonwebtoken.workspace = true
nix.workspace = true
once_cell.workspace = true
pem.workspace = true
humantime-serde.workspace = true
hyper0.workspace = true
regex.workspace = true
@@ -23,8 +20,6 @@ reqwest = { workspace = true, features = ["blocking", "json"] }
scopeguard.workspace = true
serde.workspace = true
serde_json.workspace = true
sha2.workspace = true
spki.workspace = true
thiserror.workspace = true
toml.workspace = true
toml_edit.workspace = true

View File

@@ -18,11 +18,12 @@ use anyhow::{Context, Result, anyhow, bail};
use clap::Parser;
use compute_api::spec::ComputeMode;
use control_plane::endpoint::ComputeControlPlane;
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_PORT, EndpointStorage};
use control_plane::local_env::{
EndpointStorageConf, InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf,
NeonLocalInitPageserverConf, SafekeeperConf,
InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
ObjectStorageConf, SafekeeperConf,
};
use control_plane::object_storage::OBJECT_STORAGE_DEFAULT_PORT;
use control_plane::object_storage::ObjectStorage;
use control_plane::pageserver::PageServerNode;
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage_controller::{
@@ -62,7 +63,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
const DEFAULT_BRANCH_NAME: &str = "main";
project_git_version!(GIT_VERSION);
const DEFAULT_PG_VERSION: u32 = 17;
const DEFAULT_PG_VERSION: u32 = 16;
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
@@ -92,7 +93,7 @@ enum NeonLocalCmd {
#[command(subcommand)]
Safekeeper(SafekeeperCmd),
#[command(subcommand)]
EndpointStorage(EndpointStorageCmd),
ObjectStorage(ObjectStorageCmd),
#[command(subcommand)]
Endpoint(EndpointCmd),
#[command(subcommand)]
@@ -459,14 +460,14 @@ enum SafekeeperCmd {
#[derive(clap::Subcommand)]
#[clap(about = "Manage object storage")]
enum EndpointStorageCmd {
Start(EndpointStorageStartCmd),
Stop(EndpointStorageStopCmd),
enum ObjectStorageCmd {
Start(ObjectStorageStartCmd),
Stop(ObjectStorageStopCmd),
}
#[derive(clap::Args)]
#[clap(about = "Start object storage")]
struct EndpointStorageStartCmd {
struct ObjectStorageStartCmd {
#[clap(short = 't', long, help = "timeout until we fail the command")]
#[arg(default_value = "10s")]
start_timeout: humantime::Duration,
@@ -474,7 +475,7 @@ struct EndpointStorageStartCmd {
#[derive(clap::Args)]
#[clap(about = "Stop object storage")]
struct EndpointStorageStopCmd {
struct ObjectStorageStopCmd {
#[arg(value_enum, default_value = "fast")]
#[clap(
short = 'm',
@@ -551,7 +552,6 @@ enum EndpointCmd {
Start(EndpointStartCmdArgs),
Reconfigure(EndpointReconfigureCmdArgs),
Stop(EndpointStopCmdArgs),
GenerateJwt(EndpointGenerateJwtCmdArgs),
}
#[derive(clap::Args)]
@@ -699,13 +699,6 @@ struct EndpointStopCmdArgs {
mode: String,
}
#[derive(clap::Args)]
#[clap(about = "Generate a JWT for an endpoint")]
struct EndpointGenerateJwtCmdArgs {
#[clap(help = "Postgres endpoint id")]
endpoint_id: String,
}
#[derive(clap::Subcommand)]
#[clap(about = "Manage neon_local branch name mappings")]
enum MappingsCmd {
@@ -796,9 +789,7 @@ fn main() -> Result<()> {
}
NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)),
NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)),
NeonLocalCmd::EndpointStorage(subcmd) => {
rt.block_on(handle_endpoint_storage(&subcmd, env))
}
NeonLocalCmd::ObjectStorage(subcmd) => rt.block_on(handle_object_storage(&subcmd, env)),
NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)),
NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env),
};
@@ -1015,8 +1006,8 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
}
})
.collect(),
endpoint_storage: EndpointStorageConf {
port: ENDPOINT_STORAGE_DEFAULT_PORT,
object_storage: ObjectStorageConf {
port: OBJECT_STORAGE_DEFAULT_PORT,
},
pg_distrib_dir: None,
neon_distrib_dir: None,
@@ -1537,16 +1528,6 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
endpoint.stop(&args.mode, args.destroy)?;
}
EndpointCmd::GenerateJwt(args) => {
let endpoint_id = &args.endpoint_id;
let endpoint = cplane
.endpoints
.get(endpoint_id)
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
let jwt = endpoint.generate_jwt()?;
print!("{jwt}");
}
}
Ok(())
@@ -1736,15 +1717,12 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) ->
Ok(())
}
async fn handle_endpoint_storage(
subcmd: &EndpointStorageCmd,
env: &local_env::LocalEnv,
) -> Result<()> {
use EndpointStorageCmd::*;
let storage = EndpointStorage::from_env(env);
async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::LocalEnv) -> Result<()> {
use ObjectStorageCmd::*;
let storage = ObjectStorage::from_env(env);
// In tests like test_forward_compatibility or test_graceful_cluster_restart
// old neon binaries (without endpoint_storage) are present
// old neon binaries (without object_storage) are present
if !storage.bin.exists() {
eprintln!(
"{} binary not found. Ignore if this is a compatibility test",
@@ -1754,13 +1732,13 @@ async fn handle_endpoint_storage(
}
match subcmd {
Start(EndpointStorageStartCmd { start_timeout }) => {
Start(ObjectStorageStartCmd { start_timeout }) => {
if let Err(e) = storage.start(start_timeout).await {
eprintln!("endpoint_storage start failed: {e}");
eprintln!("object_storage start failed: {e}");
exit(1);
}
}
Stop(EndpointStorageStopCmd { stop_mode }) => {
Stop(ObjectStorageStopCmd { stop_mode }) => {
let immediate = match stop_mode {
StopMode::Fast => false,
StopMode::Immediate => true,
@@ -1870,10 +1848,10 @@ async fn handle_start_all_impl(
}
js.spawn(async move {
EndpointStorage::from_env(env)
ObjectStorage::from_env(env)
.start(&retry_timeout)
.await
.map_err(|e| e.context("start endpoint_storage"))
.map_err(|e| e.context("start object_storage"))
});
})();
@@ -1972,9 +1950,9 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
}
}
let storage = EndpointStorage::from_env(env);
let storage = ObjectStorage::from_env(env);
if let Err(e) = storage.stop(immediate) {
eprintln!("endpoint_storage stop failed: {:#}", e);
eprintln!("object_storage stop failed: {:#}", e);
}
for ps_conf in &env.pageservers {

View File

@@ -29,7 +29,7 @@
//! compute.log - log output of `compute_ctl` and `postgres`
//! endpoint.json - serialized `EndpointConf` struct
//! postgresql.conf - postgresql settings
//! config.json - passed to `compute_ctl`
//! spec.json - passed to `compute_ctl`
//! pgdata/
//! postgresql.conf - copy of postgresql.conf created by `compute_ctl`
//! zenith.signal
@@ -42,30 +42,20 @@ use std::path::PathBuf;
use std::process::Command;
use std::str::FromStr;
use std::sync::Arc;
use std::time::{Duration, Instant};
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use anyhow::{Context, Result, anyhow, bail};
use compute_api::requests::{ComputeClaims, ConfigurationRequest};
use compute_api::responses::{
ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TlsConfig,
};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
use compute_api::spec::{
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
RemoteExtSpec, Role,
};
use jsonwebtoken::jwk::{
AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
};
use nix::sys::signal::{Signal, kill};
use pageserver_api::shard::ShardStripeSize;
use pem::Pem;
use reqwest::header::CONTENT_TYPE;
use safekeeper_api::membership::SafekeeperGeneration;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use spki::der::Decode;
use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
use tracing::debug;
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};
@@ -90,7 +80,6 @@ pub struct EndpointConf {
drop_subscriptions_before_start: bool,
features: Vec<ComputeFeature>,
cluster: Option<Cluster>,
compute_ctl_config: ComputeCtlConfig,
}
//
@@ -146,37 +135,6 @@ impl ComputeControlPlane {
.unwrap_or(self.base_port)
}
/// Create a JSON Web Key Set. This ideally matches the way we create a JWKS
/// from the production control plane.
fn create_jwks_from_pem(pem: &Pem) -> Result<JwkSet> {
let spki: SubjectPublicKeyInfoRef = SubjectPublicKeyInfo::from_der(pem.contents())?;
let public_key = spki.subject_public_key.raw_bytes();
let mut hasher = Sha256::new();
hasher.update(public_key);
let key_hash = hasher.finalize();
Ok(JwkSet {
keys: vec![Jwk {
common: CommonParameters {
public_key_use: Some(PublicKeyUse::Signature),
key_operations: Some(vec![KeyOperations::Verify]),
key_algorithm: Some(KeyAlgorithm::EdDSA),
key_id: Some(base64::encode_config(key_hash, base64::URL_SAFE_NO_PAD)),
x509_url: None::<String>,
x509_chain: None::<Vec<String>>,
x509_sha1_fingerprint: None::<String>,
x509_sha256_fingerprint: None::<String>,
},
algorithm: AlgorithmParameters::OctetKeyPair(OctetKeyPairParameters {
key_type: OctetKeyPairType::OctetKeyPair,
curve: EllipticCurve::Ed25519,
x: base64::encode_config(public_key, base64::URL_SAFE_NO_PAD),
}),
}],
})
}
#[allow(clippy::too_many_arguments)]
pub fn new_endpoint(
&mut self,
@@ -194,10 +152,6 @@ impl ComputeControlPlane {
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1);
let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1);
let compute_ctl_config = ComputeCtlConfig {
jwks: Self::create_jwks_from_pem(&self.env.read_public_key()?)?,
tls: None::<TlsConfig>,
};
let ep = Arc::new(Endpoint {
endpoint_id: endpoint_id.to_owned(),
pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port),
@@ -225,7 +179,6 @@ impl ComputeControlPlane {
reconfigure_concurrency: 1,
features: vec![],
cluster: None,
compute_ctl_config: compute_ctl_config.clone(),
});
ep.create_endpoint_dir()?;
@@ -245,7 +198,6 @@ impl ComputeControlPlane {
reconfigure_concurrency: 1,
features: vec![],
cluster: None,
compute_ctl_config,
})?,
)?;
std::fs::write(
@@ -288,6 +240,7 @@ impl ComputeControlPlane {
///////////////////////////////////////////////////////////////////////////////
#[derive(Debug)]
pub struct Endpoint {
/// used as the directory name
endpoint_id: String,
@@ -316,9 +269,6 @@ pub struct Endpoint {
features: Vec<ComputeFeature>,
// Cluster settings
cluster: Option<Cluster>,
/// The compute_ctl config for the endpoint's compute.
compute_ctl_config: ComputeCtlConfig,
}
#[derive(PartialEq, Eq)]
@@ -381,7 +331,6 @@ impl Endpoint {
drop_subscriptions_before_start: conf.drop_subscriptions_before_start,
features: conf.features,
cluster: conf.cluster,
compute_ctl_config: conf.compute_ctl_config,
})
}
@@ -629,13 +578,6 @@ impl Endpoint {
Ok(safekeeper_connstrings)
}
/// Generate a JWT with the correct claims.
pub fn generate_jwt(&self) -> Result<String> {
self.env.generate_auth_token(&ComputeClaims {
compute_id: self.endpoint_id.clone(),
})
}
#[allow(clippy::too_many_arguments)]
pub async fn start(
&self,
@@ -677,97 +619,87 @@ impl Endpoint {
remote_extensions = None;
};
// Create config file
let config = {
let mut spec = ComputeSpec {
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
format_version: 1.0,
operation_uuid: None,
features: self.features.clone(),
swap_size_bytes: None,
disk_quota_bytes: None,
disable_lfc_resizing: None,
cluster: Cluster {
cluster_id: None, // project ID: not used
name: None, // project name: not used
state: None,
roles: if create_test_user {
vec![Role {
name: PgIdent::from_str("test").unwrap(),
encrypted_password: None,
options: None,
}]
} else {
Vec::new()
},
databases: if create_test_user {
vec![Database {
name: PgIdent::from_str("neondb").unwrap(),
owner: PgIdent::from_str("test").unwrap(),
options: None,
restrict_conn: false,
invalid: false,
}]
} else {
Vec::new()
},
settings: None,
postgresql_conf: Some(postgresql_conf.clone()),
},
delta_operations: None,
tenant_id: Some(self.tenant_id),
timeline_id: Some(self.timeline_id),
project_id: None,
branch_id: None,
endpoint_id: Some(self.endpoint_id.clone()),
mode: self.mode,
pageserver_connstring: Some(pageserver_connstring),
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
remote_extensions,
pgbouncer_settings: None,
shard_stripe_size: Some(shard_stripe_size),
local_proxy_config: None,
reconfigure_concurrency: self.reconfigure_concurrency,
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
audit_log_level: ComputeAudit::Disabled,
logs_export_host: None::<String>,
};
// this strange code is needed to support respec() in tests
if self.cluster.is_some() {
debug!("Cluster is already set in the endpoint spec, using it");
spec.cluster = self.cluster.clone().unwrap();
debug!("spec.cluster {:?}", spec.cluster);
// fill missing fields again
if create_test_user {
spec.cluster.roles.push(Role {
// Create spec file
let mut spec = ComputeSpec {
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
format_version: 1.0,
operation_uuid: None,
features: self.features.clone(),
swap_size_bytes: None,
disk_quota_bytes: None,
disable_lfc_resizing: None,
cluster: Cluster {
cluster_id: None, // project ID: not used
name: None, // project name: not used
state: None,
roles: if create_test_user {
vec![Role {
name: PgIdent::from_str("test").unwrap(),
encrypted_password: None,
options: None,
});
spec.cluster.databases.push(Database {
}]
} else {
Vec::new()
},
databases: if create_test_user {
vec![Database {
name: PgIdent::from_str("neondb").unwrap(),
owner: PgIdent::from_str("test").unwrap(),
options: None,
restrict_conn: false,
invalid: false,
});
}
spec.cluster.postgresql_conf = Some(postgresql_conf);
}
ComputeConfig {
spec: Some(spec),
compute_ctl_config: self.compute_ctl_config.clone(),
}
}]
} else {
Vec::new()
},
settings: None,
postgresql_conf: Some(postgresql_conf.clone()),
},
delta_operations: None,
tenant_id: Some(self.tenant_id),
timeline_id: Some(self.timeline_id),
mode: self.mode,
pageserver_connstring: Some(pageserver_connstring),
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
remote_extensions,
pgbouncer_settings: None,
shard_stripe_size: Some(shard_stripe_size),
local_proxy_config: None,
reconfigure_concurrency: self.reconfigure_concurrency,
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
audit_log_level: ComputeAudit::Disabled,
logs_export_host: None::<String>,
};
let config_path = self.endpoint_path().join("config.json");
std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?;
// this strange code is needed to support respec() in tests
if self.cluster.is_some() {
debug!("Cluster is already set in the endpoint spec, using it");
spec.cluster = self.cluster.clone().unwrap();
debug!("spec.cluster {:?}", spec.cluster);
// fill missing fields again
if create_test_user {
spec.cluster.roles.push(Role {
name: PgIdent::from_str("test").unwrap(),
encrypted_password: None,
options: None,
});
spec.cluster.databases.push(Database {
name: PgIdent::from_str("neondb").unwrap(),
owner: PgIdent::from_str("test").unwrap(),
options: None,
restrict_conn: false,
invalid: false,
});
}
spec.cluster.postgresql_conf = Some(postgresql_conf);
}
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
// Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
let logfile = std::fs::OpenOptions::new()
@@ -793,8 +725,10 @@ impl Endpoint {
])
.args(["--pgdata", self.pgdata().to_str().unwrap()])
.args(["--connstr", &conn_str])
.arg("--config")
.arg(self.endpoint_path().join("config.json").as_os_str())
.args([
"--spec-path",
self.endpoint_path().join("spec.json").to_str().unwrap(),
])
.args([
"--pgbin",
self.env
@@ -805,7 +739,16 @@ impl Endpoint {
])
// TODO: It would be nice if we generated compute IDs with the same
// algorithm as the real control plane.
.args(["--compute-id", &self.endpoint_id])
.args([
"--compute-id",
&format!(
"compute-{}",
SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_secs()
),
])
.stdin(std::process::Stdio::null())
.stderr(logfile.try_clone()?)
.stdout(logfile);
@@ -903,7 +846,6 @@ impl Endpoint {
self.external_http_address.port()
),
)
.bearer_auth(self.generate_jwt()?)
.send()
.await?;
@@ -928,12 +870,10 @@ impl Endpoint {
stripe_size: Option<ShardStripeSize>,
safekeepers: Option<Vec<NodeId>>,
) -> Result<()> {
let (mut spec, compute_ctl_config) = {
let config_path = self.endpoint_path().join("config.json");
let file = std::fs::File::open(config_path)?;
let config: ComputeConfig = serde_json::from_reader(file)?;
(config.spec.unwrap(), config.compute_ctl_config)
let mut spec: ComputeSpec = {
let spec_path = self.endpoint_path().join("spec.json");
let file = std::fs::File::open(spec_path)?;
serde_json::from_reader(file)?
};
let postgresql_conf = self.read_postgresql_conf()?;
@@ -980,11 +920,10 @@ impl Endpoint {
self.external_http_address.port()
))
.header(CONTENT_TYPE.as_str(), "application/json")
.bearer_auth(self.generate_jwt()?)
.body(
serde_json::to_string(&ConfigurationRequest {
spec,
compute_ctl_config,
compute_ctl_config: ComputeCtlConfig::default(),
})
.unwrap(),
)

View File

@@ -9,8 +9,8 @@
mod background_process;
pub mod broker;
pub mod endpoint;
pub mod endpoint_storage;
pub mod local_env;
pub mod object_storage;
pub mod pageserver;
pub mod postgresql_conf;
pub mod safekeeper;

View File

@@ -12,18 +12,17 @@ use std::{env, fs};
use anyhow::{Context, bail};
use clap::ValueEnum;
use pem::Pem;
use postgres_backend::AuthType;
use reqwest::Url;
use serde::{Deserialize, Serialize};
use utils::auth::encode_from_key_file;
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
use crate::endpoint_storage::{ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage};
use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage};
use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
use crate::safekeeper::SafekeeperNode;
pub const DEFAULT_PG_VERSION: u32 = 17;
pub const DEFAULT_PG_VERSION: u32 = 16;
//
// This data structures represents neon_local CLI config
@@ -57,7 +56,6 @@ pub struct LocalEnv {
// used to issue tokens during e.g pg start
pub private_key_path: PathBuf,
/// Path to environment's public key
pub public_key_path: PathBuf,
pub broker: NeonBroker,
@@ -72,7 +70,7 @@ pub struct LocalEnv {
pub safekeepers: Vec<SafekeeperConf>,
pub endpoint_storage: EndpointStorageConf,
pub object_storage: ObjectStorageConf,
// Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will
// be propagated into each pageserver's configuration.
@@ -110,7 +108,7 @@ pub struct OnDiskConfig {
)]
pub pageservers: Vec<PageServerConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub endpoint_storage: EndpointStorageConf,
pub object_storage: ObjectStorageConf,
pub control_plane_api: Option<Url>,
pub control_plane_hooks_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Url>,
@@ -144,7 +142,7 @@ pub struct NeonLocalInitConf {
pub storage_controller: Option<NeonStorageControllerConf>,
pub pageservers: Vec<NeonLocalInitPageserverConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub endpoint_storage: EndpointStorageConf,
pub object_storage: ObjectStorageConf,
pub control_plane_api: Option<Url>,
pub control_plane_hooks_api: Option<Url>,
pub generate_local_ssl_certs: bool,
@@ -152,7 +150,7 @@ pub struct NeonLocalInitConf {
#[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct EndpointStorageConf {
pub struct ObjectStorageConf {
pub port: u16,
}
@@ -413,8 +411,8 @@ impl LocalEnv {
self.pg_dir(pg_version, "lib")
}
pub fn endpoint_storage_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("endpoint_storage")
pub fn object_storage_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("object_storage")
}
pub fn pageserver_bin(&self) -> PathBuf {
@@ -450,8 +448,8 @@ impl LocalEnv {
self.base_data_dir.join("safekeepers").join(data_dir_name)
}
pub fn endpoint_storage_data_dir(&self) -> PathBuf {
self.base_data_dir.join("endpoint_storage")
pub fn object_storage_data_dir(&self) -> PathBuf {
self.base_data_dir.join("object_storage")
}
pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
@@ -615,7 +613,7 @@ impl LocalEnv {
control_plane_compute_hook_api: _,
branch_name_mappings,
generate_local_ssl_certs,
endpoint_storage,
object_storage,
} = on_disk_config;
LocalEnv {
base_data_dir: repopath.to_owned(),
@@ -632,7 +630,7 @@ impl LocalEnv {
control_plane_hooks_api,
branch_name_mappings,
generate_local_ssl_certs,
endpoint_storage,
object_storage,
}
};
@@ -742,7 +740,7 @@ impl LocalEnv {
control_plane_compute_hook_api: None,
branch_name_mappings: self.branch_name_mappings.clone(),
generate_local_ssl_certs: self.generate_local_ssl_certs,
endpoint_storage: self.endpoint_storage.clone(),
object_storage: self.object_storage.clone(),
},
)
}
@@ -760,11 +758,11 @@ impl LocalEnv {
// this function is used only for testing purposes in CLI e g generate tokens during init
pub fn generate_auth_token<S: Serialize>(&self, claims: &S) -> anyhow::Result<String> {
let key = self.read_private_key()?;
encode_from_key_file(claims, &key)
let private_key_path = self.get_private_key_path();
let key_data = fs::read(private_key_path)?;
encode_from_key_file(claims, &key_data)
}
/// Get the path to the private key.
pub fn get_private_key_path(&self) -> PathBuf {
if self.private_key_path.is_absolute() {
self.private_key_path.to_path_buf()
@@ -773,29 +771,6 @@ impl LocalEnv {
}
}
/// Get the path to the public key.
pub fn get_public_key_path(&self) -> PathBuf {
if self.public_key_path.is_absolute() {
self.public_key_path.to_path_buf()
} else {
self.base_data_dir.join(&self.public_key_path)
}
}
/// Read the contents of the private key file.
pub fn read_private_key(&self) -> anyhow::Result<Pem> {
let private_key_path = self.get_private_key_path();
let pem = pem::parse(fs::read(private_key_path)?)?;
Ok(pem)
}
/// Read the contents of the public key file.
pub fn read_public_key(&self) -> anyhow::Result<Pem> {
let public_key_path = self.get_public_key_path();
let pem = pem::parse(fs::read(public_key_path)?)?;
Ok(pem)
}
/// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`].
pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> {
let base_path = base_path();
@@ -849,7 +824,7 @@ impl LocalEnv {
control_plane_api,
generate_local_ssl_certs,
control_plane_hooks_api,
endpoint_storage,
object_storage,
} = conf;
// Find postgres binaries.
@@ -901,7 +876,7 @@ impl LocalEnv {
control_plane_hooks_api,
branch_name_mappings: Default::default(),
generate_local_ssl_certs,
endpoint_storage,
object_storage,
};
if generate_local_ssl_certs {
@@ -929,13 +904,13 @@ impl LocalEnv {
.context("pageserver init failed")?;
}
EndpointStorage::from_env(&env)
ObjectStorage::from_env(&env)
.init()
.context("object storage init failed")?;
// setup remote remote location for default LocalFs remote storage
std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
std::fs::create_dir_all(env.base_data_dir.join(ENDPOINT_STORAGE_REMOTE_STORAGE_DIR))?;
std::fs::create_dir_all(env.base_data_dir.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR))?;
env.persist_config()
}
@@ -981,7 +956,6 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
String::from_utf8_lossy(&keygen_output.stderr)
);
}
// Extract the public key from the private key file
//
// openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem
@@ -998,7 +972,6 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
String::from_utf8_lossy(&keygen_output.stderr)
);
}
Ok(())
}
@@ -1007,7 +980,7 @@ fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()>
// -out rootCA.crt -keyout rootCA.key
let keygen_output = Command::new("openssl")
.args([
"req", "-x509", "-newkey", "ed25519", "-nodes", "-days", "36500",
"req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500",
])
.args(["-subj", "/CN=Neon Local CA"])
.args(["-out", cert_path.to_str().unwrap()])
@@ -1037,7 +1010,7 @@ fn generate_ssl_cert(
// -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
let keygen_output = Command::new("openssl")
.args(["req", "-new", "-nodes"])
.args(["-newkey", "ed25519"])
.args(["-newkey", "rsa:2048"])
.args(["-subj", "/CN=localhost"])
.args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
.args(["-keyout", key_path.to_str().unwrap()])

View File

@@ -1,33 +1,34 @@
use crate::background_process::{self, start_process, stop_process};
use crate::local_env::LocalEnv;
use anyhow::anyhow;
use anyhow::{Context, Result};
use camino::Utf8PathBuf;
use std::io::Write;
use std::time::Duration;
/// Directory within .neon which will be used by default for LocalFs remote storage.
pub const ENDPOINT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/endpoint_storage";
pub const ENDPOINT_STORAGE_DEFAULT_PORT: u16 = 9993;
pub const OBJECT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/object_storage";
pub const OBJECT_STORAGE_DEFAULT_PORT: u16 = 9993;
pub struct EndpointStorage {
pub struct ObjectStorage {
pub bin: Utf8PathBuf,
pub data_dir: Utf8PathBuf,
pub pemfile: Utf8PathBuf,
pub port: u16,
}
impl EndpointStorage {
pub fn from_env(env: &LocalEnv) -> EndpointStorage {
EndpointStorage {
bin: Utf8PathBuf::from_path_buf(env.endpoint_storage_bin()).unwrap(),
data_dir: Utf8PathBuf::from_path_buf(env.endpoint_storage_data_dir()).unwrap(),
impl ObjectStorage {
pub fn from_env(env: &LocalEnv) -> ObjectStorage {
ObjectStorage {
bin: Utf8PathBuf::from_path_buf(env.object_storage_bin()).unwrap(),
data_dir: Utf8PathBuf::from_path_buf(env.object_storage_data_dir()).unwrap(),
pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(),
port: env.endpoint_storage.port,
port: env.object_storage.port,
}
}
fn config_path(&self) -> Utf8PathBuf {
self.data_dir.join("endpoint_storage.json")
self.data_dir.join("object_storage.json")
}
fn listen_addr(&self) -> Utf8PathBuf {
@@ -48,7 +49,7 @@ impl EndpointStorage {
let cfg = Cfg {
listen: self.listen_addr(),
pemfile: parent.join(self.pemfile.clone()),
local_path: parent.join(ENDPOINT_STORAGE_REMOTE_STORAGE_DIR),
local_path: parent.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR),
r#type: "LocalFs".to_string(),
};
std::fs::create_dir_all(self.config_path().parent().unwrap())?;
@@ -58,19 +59,24 @@ impl EndpointStorage {
}
pub async fn start(&self, retry_timeout: &Duration) -> Result<()> {
println!("Starting endpoint_storage at {}", self.listen_addr());
println!("Starting s3 proxy at {}", self.listen_addr());
std::io::stdout().flush().context("flush stdout")?;
let process_status_check = || async {
let res = reqwest::Client::new().get(format!("http://{}/metrics", self.listen_addr()));
match res.send().await {
Ok(res) => Ok(res.status().is_success()),
Err(_) => Ok(false),
tokio::time::sleep(Duration::from_millis(500)).await;
let res = reqwest::Client::new()
.get(format!("http://{}/metrics", self.listen_addr()))
.send()
.await;
match res {
Ok(response) if response.status().is_success() => Ok(true),
Ok(_) => Err(anyhow!("Failed to query /metrics")),
Err(e) => Err(anyhow!("Failed to check node status: {e}")),
}
};
let res = start_process(
"endpoint_storage",
"object_storage",
&self.data_dir.clone().into_std_path_buf(),
&self.bin.clone().into_std_path_buf(),
vec![self.config_path().to_string()],
@@ -88,14 +94,14 @@ impl EndpointStorage {
}
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
stop_process(immediate, "endpoint_storage", &self.pid_file())
stop_process(immediate, "object_storage", &self.pid_file())
}
fn log_file(&self) -> Utf8PathBuf {
self.data_dir.join("endpoint_storage.log")
self.data_dir.join("object_storage.log")
}
fn pid_file(&self) -> Utf8PathBuf {
self.data_dir.join("endpoint_storage.pid")
self.data_dir.join("object_storage.pid")
}
}

View File

@@ -318,7 +318,7 @@ impl PageServerNode {
self.conf.id, datadir,
)
})?;
let args = vec!["-D", datadir_path_str];
let args = vec!["-D", datadir_path_str, "--dev"];
background_process::start_process(
"pageserver",
@@ -413,11 +413,6 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
compaction_shard_ancestor: settings
.remove("compaction_shard_ancestor")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'compaction_shard_ancestor' as a bool")?,
compaction_l0_first: settings
.remove("compaction_l0_first")
.map(|x| x.parse::<bool>())
@@ -540,11 +535,6 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_compaction_enabled' as bool")?,
gc_compaction_verification: settings
.remove("gc_compaction_verification")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_compaction_verification' as bool")?,
gc_compaction_initial_threshold_kb: settings
.remove("gc_compaction_initial_threshold_kb")
.map(|x| x.parse::<u64>())

View File

@@ -162,6 +162,7 @@ impl SafekeeperNode {
listen_http,
"--availability-zone".to_owned(),
availability_zone,
"--dev".to_owned(),
];
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);

View File

@@ -13,12 +13,9 @@ use pageserver_api::controller_api::{
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
TenantCreateResponse, TenantLocateResponse,
};
use pageserver_api::models::{
TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use pem::Pem;
use postgres_backend::AuthType;
use reqwest::{Certificate, Method};
use serde::de::DeserializeOwned;
@@ -35,8 +32,8 @@ use crate::local_env::{LocalEnv, NeonStorageControllerConf};
pub struct StorageController {
env: LocalEnv,
private_key: Option<Pem>,
public_key: Option<Pem>,
private_key: Option<Vec<u8>>,
public_key: Option<String>,
client: reqwest::Client,
config: NeonStorageControllerConf,
@@ -85,8 +82,7 @@ impl NeonStorageControllerStopArgs {
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: Option<NodeId>,
pub generation_override: Option<i32>, // only new tenants
pub config: Option<TenantConfig>, // only new tenants
pub generation_override: Option<i32>,
}
#[derive(Serialize, Deserialize)]
@@ -117,9 +113,7 @@ impl StorageController {
AuthType::Trust => (None, None),
AuthType::NeonJWT => {
let private_key_path = env.get_private_key_path();
let private_key =
pem::parse(fs::read(private_key_path).expect("failed to read private key"))
.expect("failed to parse PEM file");
let private_key = fs::read(private_key_path).expect("failed to read private key");
// If pageserver auth is enabled, this implicitly enables auth for this service,
// using the same credentials.
@@ -141,13 +135,9 @@ impl StorageController {
.expect("Empty key dir")
.expect("Error reading key dir");
pem::parse(std::fs::read_to_string(dent.path()).expect("Can't read public key"))
.expect("Failed to parse PEM file")
std::fs::read_to_string(dent.path()).expect("Can't read public key")
} else {
pem::parse(
std::fs::read_to_string(&public_key_path).expect("Can't read public key"),
)
.expect("Failed to parse PEM file")
std::fs::read_to_string(&public_key_path).expect("Can't read public key")
};
(Some(private_key), Some(public_key))
}
@@ -815,7 +805,6 @@ impl StorageController {
tenant_shard_id,
node_id: Some(pageserver_id),
generation_override: None,
config: None,
};
let response = self

View File

@@ -45,7 +45,9 @@ allow = [
"ISC",
"MIT",
"MPL-2.0",
"OpenSSL",
"Unicode-3.0",
"Zlib",
]
confidence-threshold = 0.8
exceptions = [
@@ -54,6 +56,14 @@ exceptions = [
{ allow = ["Zlib"], name = "const_format", version = "*" },
]
[[licenses.clarify]]
name = "ring"
version = "*"
expression = "MIT AND ISC AND OpenSSL"
license-files = [
{ path = "LICENSE", hash = 0xbd0eed23 }
]
[licenses.private]
ignore = true
registries = []
@@ -106,11 +116,7 @@ name = "openssl"
unknown-registry = "warn"
unknown-git = "warn"
allow-registry = ["https://github.com/rust-lang/crates.io-index"]
allow-git = [
# Crate pinned to commit in origin repo due to opentelemetry version.
# TODO: Remove this once crate is fetched from crates.io again.
"https://github.com/mattiapenati/tower-otel",
]
allow-git = []
[sources.allow-org]
github = [

View File

@@ -1,3 +1,4 @@
# Example docker compose configuration
The configuration in this directory is used for testing Neon docker images: it is
@@ -7,13 +8,3 @@ you can experiment with a miniature Neon system, use `cargo neon` rather than co
This configuration does not start the storage controller, because the controller
needs a way to reconfigure running computes, and no such thing exists in this setup.
## Generating the JWKS for a compute
```shell
openssl genpkey -algorithm Ed25519 -out private-key.pem
openssl pkey -in private-key.pem -pubout -out public-key.pem
openssl pkey -pubin -inform pem -in public-key.pem -pubout -outform der -out public-key.der
key="$(xxd -plain -cols 32 -s -32 public-key.der)"
key_id="$(printf '%s' "$key" | sha256sum | awk '{ print $1 }' | basenc --base64url --wrap=0)"
x="$(printf '%s' "$key" | basenc --base64url --wrap=0)"
```

View File

@@ -1,3 +0,0 @@
-----BEGIN PRIVATE KEY-----
MC4CAQAwBQYDK2VwBCIEIOmnRbzt2AJ0d+S3aU1hiYOl/tXpvz1FmWBfwHYBgOma
-----END PRIVATE KEY-----

View File

@@ -1,3 +0,0 @@
-----BEGIN PUBLIC KEY-----
MCowBQYDK2VwAyEADY0al/U0bgB3+9fUGk+3PKWnsck9OyxN5DjHIN6Xep0=
-----END PUBLIC KEY-----

View File

@@ -11,8 +11,8 @@ generate_id() {
PG_VERSION=${PG_VERSION:-14}
CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
CONFIG_FILE=/tmp/config.json
SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
SPEC_FILE=/tmp/spec.json
echo "Waiting pageserver become ready."
while ! nc -z pageserver 6400; do
@@ -20,7 +20,7 @@ while ! nc -z pageserver 6400; do
done
echo "Page server is ready."
cp ${CONFIG_FILE_ORG} ${CONFIG_FILE}
cp ${SPEC_FILE_ORG} ${SPEC_FILE}
if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
tenant_id=${TENANT_ID}
@@ -73,17 +73,17 @@ else
ulid_extension=ulid
fi
echo "Adding pgx_ulid"
shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE})
sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE}
shared_libraries=$(jq -r '.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${SPEC_FILE})
sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${SPEC_FILE}
echo "Overwrite tenant id and timeline id in spec file"
sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE}
sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}
sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE}
sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
cat ${CONFIG_FILE}
cat ${SPEC_FILE}
echo "Start compute node"
/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
-C "postgresql://cloud_admin@localhost:55433/postgres" \
-b /usr/local/bin/postgres \
--compute-id "compute-$RANDOM" \
--config "$CONFIG_FILE"
-S ${SPEC_FILE}

View File

@@ -1,160 +0,0 @@
{
"spec": {
"format_version": 1.0,
"timestamp": "2022-10-12T18:00:00.000Z",
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
"cluster": {
"cluster_id": "docker_compose",
"name": "docker_compose_test",
"state": "restarted",
"roles": [
{
"name": "cloud_admin",
"encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
"options": null
}
],
"databases": [
],
"settings": [
{
"name": "fsync",
"value": "off",
"vartype": "bool"
},
{
"name": "wal_level",
"value": "logical",
"vartype": "enum"
},
{
"name": "wal_log_hints",
"value": "on",
"vartype": "bool"
},
{
"name": "log_connections",
"value": "on",
"vartype": "bool"
},
{
"name": "port",
"value": "55433",
"vartype": "integer"
},
{
"name": "shared_buffers",
"value": "1MB",
"vartype": "string"
},
{
"name": "max_connections",
"value": "100",
"vartype": "integer"
},
{
"name": "listen_addresses",
"value": "0.0.0.0",
"vartype": "string"
},
{
"name": "max_wal_senders",
"value": "10",
"vartype": "integer"
},
{
"name": "max_replication_slots",
"value": "10",
"vartype": "integer"
},
{
"name": "wal_sender_timeout",
"value": "5s",
"vartype": "string"
},
{
"name": "wal_keep_size",
"value": "0",
"vartype": "integer"
},
{
"name": "password_encryption",
"value": "md5",
"vartype": "enum"
},
{
"name": "restart_after_crash",
"value": "off",
"vartype": "bool"
},
{
"name": "synchronous_standby_names",
"value": "walproposer",
"vartype": "string"
},
{
"name": "shared_preload_libraries",
"value": "neon,pg_cron,timescaledb,pg_stat_statements",
"vartype": "string"
},
{
"name": "neon.safekeepers",
"value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
"vartype": "string"
},
{
"name": "neon.timeline_id",
"value": "TIMELINE_ID",
"vartype": "string"
},
{
"name": "neon.tenant_id",
"value": "TENANT_ID",
"vartype": "string"
},
{
"name": "neon.pageserver_connstring",
"value": "host=pageserver port=6400",
"vartype": "string"
},
{
"name": "max_replication_write_lag",
"value": "500MB",
"vartype": "string"
},
{
"name": "max_replication_flush_lag",
"value": "10GB",
"vartype": "string"
},
{
"name": "cron.database",
"value": "postgres",
"vartype": "string"
}
]
},
"delta_operations": [
]
},
"compute_ctl_config": {
"jwks": {
"keys": [
{
"use": "sig",
"key_ops": [
"verify"
],
"alg": "EdDSA",
"kid": "ZGIxMzAzOGY0YWQwODk2ODU1MTk1NzMxMDFkYmUyOWU2NzZkOWNjNjMyMGRkZGJjOWY0MjdjYWVmNzE1MjUyOAo=",
"kty": "OKP",
"crv": "Ed25519",
"x": "MGQ4ZDFhOTdmNTM0NmUwMDc3ZmJkN2Q0MWE0ZmI3M2NhNWE3YjFjOTNkM2IyYzRkZTQzOGM3MjBkZTk3N2E5ZAo="
}
]
}
}
}

View File

@@ -0,0 +1,141 @@
{
"format_version": 1.0,
"timestamp": "2022-10-12T18:00:00.000Z",
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
"cluster": {
"cluster_id": "docker_compose",
"name": "docker_compose_test",
"state": "restarted",
"roles": [
{
"name": "cloud_admin",
"encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
"options": null
}
],
"databases": [
],
"settings": [
{
"name": "fsync",
"value": "off",
"vartype": "bool"
},
{
"name": "wal_level",
"value": "logical",
"vartype": "enum"
},
{
"name": "wal_log_hints",
"value": "on",
"vartype": "bool"
},
{
"name": "log_connections",
"value": "on",
"vartype": "bool"
},
{
"name": "port",
"value": "55433",
"vartype": "integer"
},
{
"name": "shared_buffers",
"value": "1MB",
"vartype": "string"
},
{
"name": "max_connections",
"value": "100",
"vartype": "integer"
},
{
"name": "listen_addresses",
"value": "0.0.0.0",
"vartype": "string"
},
{
"name": "max_wal_senders",
"value": "10",
"vartype": "integer"
},
{
"name": "max_replication_slots",
"value": "10",
"vartype": "integer"
},
{
"name": "wal_sender_timeout",
"value": "5s",
"vartype": "string"
},
{
"name": "wal_keep_size",
"value": "0",
"vartype": "integer"
},
{
"name": "password_encryption",
"value": "md5",
"vartype": "enum"
},
{
"name": "restart_after_crash",
"value": "off",
"vartype": "bool"
},
{
"name": "synchronous_standby_names",
"value": "walproposer",
"vartype": "string"
},
{
"name": "shared_preload_libraries",
"value": "neon,pg_cron,timescaledb,pg_stat_statements",
"vartype": "string"
},
{
"name": "neon.safekeepers",
"value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
"vartype": "string"
},
{
"name": "neon.timeline_id",
"value": "TIMELINE_ID",
"vartype": "string"
},
{
"name": "neon.tenant_id",
"value": "TENANT_ID",
"vartype": "string"
},
{
"name": "neon.pageserver_connstring",
"value": "host=pageserver port=6400",
"vartype": "string"
},
{
"name": "max_replication_write_lag",
"value": "500MB",
"vartype": "string"
},
{
"name": "max_replication_flush_lag",
"value": "10GB",
"vartype": "string"
},
{
"name": "cron.database",
"value": "postgres",
"vartype": "string"
}
]
},
"delta_operations": [
]
}

View File

@@ -159,7 +159,7 @@ services:
#- RUST_BACKTRACE=1
# Mount the test files directly, for faster editing cycle.
volumes:
- ./compute_wrapper/var/db/postgres/configs/:/var/db/postgres/configs/
- ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
- ./compute_wrapper/shell/:/shell/
ports:
- 55433:55433 # pg protocol handler

View File

@@ -1,8 +0,0 @@
EXTENSION = pg_jsonschema
DATA = pg_jsonschema--1.0.sql
REGRESS = jsonschema_valid_api jsonschema_edge_cases
REGRESS_OPTS = --load-extension=pg_jsonschema
PG_CONFIG ?= pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)

View File

@@ -1,87 +0,0 @@
-- Schema with enums, nulls, extra properties disallowed
SELECT jsonschema_is_valid('{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json);
jsonschema_is_valid
---------------------
t
(1 row)
-- Valid enum and null email
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "active", "email": null}'::json
);
jsonschema_validation_errors
------------------------------
{}
(1 row)
-- Invalid enum value
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "disabled", "email": null}'::json
);
jsonschema_validation_errors
----------------------------------------------------------------------
{"\"disabled\" is not one of [\"active\",\"inactive\",\"pending\"]"}
(1 row)
-- Invalid email format (assuming format is validated)
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "active", "email": "not-an-email"}'::json
);
jsonschema_validation_errors
-----------------------------------------
{"\"not-an-email\" is not a \"email\""}
(1 row)
-- Extra property not allowed
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "active", "extra": "should not be here"}'::json
);
jsonschema_validation_errors
--------------------------------------------------------------------
{"Additional properties are not allowed ('extra' was unexpected)"}
(1 row)

View File

@@ -1,65 +0,0 @@
-- Define schema
SELECT jsonschema_is_valid('{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json);
jsonschema_is_valid
---------------------
t
(1 row)
-- Valid instance
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json,
'{"username": "alice", "age": 25}'::json
);
jsonschema_validation_errors
------------------------------
{}
(1 row)
-- Invalid instance: missing required "username"
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json,
'{"age": 25}'::json
);
jsonschema_validation_errors
-----------------------------------------
{"\"username\" is a required property"}
(1 row)
-- Invalid instance: wrong type for "age"
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json,
'{"username": "bob", "age": "twenty"}'::json
);
jsonschema_validation_errors
-------------------------------------------
{"\"twenty\" is not of type \"integer\""}
(1 row)

View File

@@ -1,66 +0,0 @@
-- Schema with enums, nulls, extra properties disallowed
SELECT jsonschema_is_valid('{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json);
-- Valid enum and null email
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "active", "email": null}'::json
);
-- Invalid enum value
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "disabled", "email": null}'::json
);
-- Invalid email format (assuming format is validated)
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "active", "email": "not-an-email"}'::json
);
-- Extra property not allowed
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "active", "extra": "should not be here"}'::json
);

View File

@@ -1,48 +0,0 @@
-- Define schema
SELECT jsonschema_is_valid('{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json);
-- Valid instance
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json,
'{"username": "alice", "age": 25}'::json
);
-- Invalid instance: missing required "username"
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json,
'{"age": 25}'::json
);
-- Invalid instance: wrong type for "age"
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json,
'{"username": "bob", "age": "twenty"}'::json
);

View File

@@ -1,9 +0,0 @@
EXTENSION = pg_session_jwt
REGRESS = basic_functions
REGRESS_OPTS = --load-extension=$(EXTENSION)
export PGOPTIONS = -c pg_session_jwt.jwk={"crv":"Ed25519","kty":"OKP","x":"R_Abz-63zJ00l-IraL5fQhwkhGVZCSooQFV5ntC3C7M"}
PG_CONFIG ?= pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)

View File

@@ -1,35 +0,0 @@
-- Basic functionality tests for pg_session_jwt
-- Test auth.init() function
SELECT auth.init();
init
------
(1 row)
-- Test an invalid JWT
SELECT auth.jwt_session_init('INVALID-JWT');
ERROR: invalid JWT encoding
-- Test creating a session with an expired JWT
SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw');
ERROR: Token used after it has expired
-- Test creating a session with a valid JWT
SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg');
jwt_session_init
------------------
(1 row)
-- Test auth.session() function
SELECT auth.session();
session
-------------------------------------------------------------------------
{"exp": 4896164252, "iat": 1742564252, "jti": 434343, "sub": "user123"}
(1 row)
-- Test auth.user_id() function
SELECT auth.user_id() AS user_id;
user_id
---------
user123
(1 row)

View File

@@ -1,19 +0,0 @@
-- Basic functionality tests for pg_session_jwt
-- Test auth.init() function
SELECT auth.init();
-- Test an invalid JWT
SELECT auth.jwt_session_init('INVALID-JWT');
-- Test creating a session with an expired JWT
SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw');
-- Test creating a session with a valid JWT
SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg');
-- Test auth.session() function
SELECT auth.session();
-- Test auth.user_id() function
SELECT auth.user_id() AS user_id;

View File

@@ -14,32 +14,6 @@ pub struct GenericAPIError {
pub error: String,
}
/// All configuration parameters necessary for a compute. When
/// [`ComputeConfig::spec`] is provided, it means that the compute is attached
/// to a tenant. [`ComputeConfig::compute_ctl_config`] will always be provided
/// and contains parameters necessary for operating `compute_ctl` independently
/// of whether a tenant is attached to the compute or not.
///
/// This also happens to be the body of `compute_ctl`'s /configure request.
#[derive(Debug, Deserialize, Serialize)]
pub struct ComputeConfig {
/// The compute spec
pub spec: Option<ComputeSpec>,
/// The compute_ctl configuration
#[allow(dead_code)]
pub compute_ctl_config: ComputeCtlConfig,
}
impl From<ControlPlaneConfigResponse> for ComputeConfig {
fn from(value: ControlPlaneConfigResponse) -> Self {
Self {
spec: value.spec,
compute_ctl_config: value.compute_ctl_config,
}
}
}
#[derive(Debug, Clone, Serialize)]
pub struct ExtensionInstallResponse {
pub extension: PgIdent,
@@ -160,7 +134,7 @@ pub struct CatalogObjects {
pub databases: Vec<Database>,
}
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct ComputeCtlConfig {
/// Set of JSON web keys that the compute can use to authenticate
/// communication from the control plane.
@@ -179,7 +153,7 @@ impl Default for ComputeCtlConfig {
}
}
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct TlsConfig {
pub key_path: String,
pub cert_path: String,
@@ -187,7 +161,7 @@ pub struct TlsConfig {
/// Response of the `/computes/{compute_id}/spec` control-plane API.
#[derive(Deserialize, Debug)]
pub struct ControlPlaneConfigResponse {
pub struct ControlPlaneSpecResponse {
pub spec: Option<ComputeSpec>,
pub status: ControlPlaneComputeStatus,
pub compute_ctl_config: ComputeCtlConfig,

View File

@@ -1,8 +1,8 @@
//! The ComputeSpec contains all the information needed to start up
//! the right version of PostgreSQL, and connect it to the storage nodes.
//! It can be passed as part of the `config.json`, or the control plane can
//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
//! compute_ctl can fetch it by calling the control plane's API.
//! `ComputeSpec` represents the contents of the spec.json file.
//!
//! The spec.json file is used to pass information to 'compute_ctl'. It contains
//! all the information needed to start up the right version of PostgreSQL,
//! and connect it to the storage nodes.
use std::collections::HashMap;
use indexmap::IndexMap;
@@ -104,12 +104,6 @@ pub struct ComputeSpec {
pub timeline_id: Option<TimelineId>,
pub pageserver_connstring: Option<String>,
// More neon ids that we expose to the compute_ctl
// and to postgres as neon extension GUCs.
pub project_id: Option<String>,
pub branch_id: Option<String>,
pub endpoint_id: Option<String>,
/// Safekeeper membership config generation. It is put in
/// neon.safekeepers GUC and serves two purposes:
/// 1) Non zero value forces walproposer to use membership configurations.
@@ -165,7 +159,13 @@ pub struct ComputeSpec {
#[serde(default)] // Default false
pub drop_subscriptions_before_start: bool,
/// Log level for compute audit logging
/// Log level for audit logging:
///
/// Disabled - no audit logging. This is the default.
/// log - log masked statements to the postgres log using pgaudit extension
/// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
///
/// Extensions should be present in shared_preload_libraries
#[serde(default)]
pub audit_log_level: ComputeAudit,
@@ -242,22 +242,13 @@ impl RemoteExtSpec {
match self.extension_data.get(real_ext_name) {
Some(_ext_data) => {
// We have decided to use the Go naming convention due to Kubernetes.
let arch = match std::env::consts::ARCH {
"x86_64" => "amd64",
"aarch64" => "arm64",
arch => arch,
};
// Construct the path to the extension archive
// BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
//
// Keep it in sync with path generation in
// https://github.com/neondatabase/build-custom-extensions/tree/main
let archive_path_str = format!(
"{build_tag}/{arch}/{pg_major_version}/extensions/{real_ext_name}.tar.zst"
);
let archive_path_str =
format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst");
Ok((
real_ext_name.to_string(),
RemotePath::from_string(&archive_path_str)?,
@@ -298,25 +289,14 @@ impl ComputeMode {
}
/// Log level for audit logging
/// Disabled, log, hipaa
/// Default is Disabled
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
pub enum ComputeAudit {
#[default]
Disabled,
// Deprecated, use Base instead
Log,
// (pgaudit.log = 'ddl', pgaudit.log_parameter='off')
// logged to the standard postgresql log stream
Base,
// Deprecated, use Full or Extended instead
Hipaa,
// (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off')
// logged to separate files collected by rsyslog
// into dedicated log storage with strict access
Extended,
// (pgaudit.log='all', pgaudit.log_parameter='on'),
// logged to separate files collected by rsyslog
// into dedicated log storage with strict access.
Full,
}
#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]

View File

@@ -14,7 +14,6 @@ futures.workspace = true
hyper0.workspace = true
itertools.workspace = true
jemalloc_pprof.workspace = true
jsonwebtoken.workspace = true
once_cell.workspace = true
pprof.workspace = true
regex.workspace = true
@@ -31,7 +30,6 @@ tokio.workspace = true
tracing.workspace = true
url.workspace = true
uuid.workspace = true
x509-cert.workspace = true
# to use tokio channels as streams, this is faster to compile than async_stream
# why is it only here? no other crate should use it, streams are rarely needed.

View File

@@ -8,7 +8,6 @@ use bytes::{Bytes, BytesMut};
use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName};
use hyper::http::HeaderValue;
use hyper::{Body, Method, Request, Response};
use jsonwebtoken::TokenData;
use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter};
use once_cell::sync::Lazy;
use pprof::ProfilerGuardBuilder;
@@ -619,7 +618,7 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
})?;
let token = parse_token(header_value)?;
let data: TokenData<Claims> = auth.decode(token).map_err(|err| {
let data = auth.decode(token).map_err(|err| {
warn!("Authentication error: {err}");
// Rely on From<AuthError> for ApiError impl
err

View File

@@ -4,8 +4,6 @@ use futures::StreamExt;
use futures::stream::FuturesUnordered;
use hyper0::Body;
use hyper0::server::conn::Http;
use metrics::{IntCounterVec, register_int_counter_vec};
use once_cell::sync::Lazy;
use routerify::{RequestService, RequestServiceBuilder};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_rustls::TlsAcceptor;
@@ -28,24 +26,6 @@ pub struct Server {
tls_acceptor: Option<TlsAcceptor>,
}
static CONNECTION_STARTED_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"http_server_connection_started_total",
"Number of established http/https connections",
&["scheme"]
)
.expect("failed to define a metric")
});
static CONNECTION_ERROR_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"http_server_connection_errors_total",
"Number of occured connection errors by type",
&["type"]
)
.expect("failed to define a metric")
});
impl Server {
pub fn new(
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
@@ -80,15 +60,6 @@ impl Server {
false
}
let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]);
let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]);
let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]);
let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]);
let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]);
let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]);
let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]);
let mut connections = FuturesUnordered::new();
loop {
tokio::select! {
@@ -96,7 +67,6 @@ impl Server {
let (tcp_stream, remote_addr) = match stream {
Ok(stream) => stream,
Err(err) => {
tcp_error_cnt.inc();
if !suppress_io_error(&err) {
info!("Failed to accept TCP connection: {err:#}");
}
@@ -108,18 +78,11 @@ impl Server {
let tls_acceptor = self.tls_acceptor.clone();
let cancel = cancel.clone();
let tls_error_cnt = tls_error_cnt.clone();
let http_error_cnt = http_error_cnt.clone();
let https_error_cnt = https_error_cnt.clone();
let http_connection_cnt = http_connection_cnt.clone();
let https_connection_cnt = https_connection_cnt.clone();
connections.push(tokio::spawn(
async move {
match tls_acceptor {
Some(tls_acceptor) => {
// Handle HTTPS connection.
https_connection_cnt.inc();
let tls_stream = tokio::select! {
tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
_ = cancel.cancelled() => return,
@@ -127,7 +90,6 @@ impl Server {
let tls_stream = match tls_stream {
Ok(tls_stream) => tls_stream,
Err(err) => {
tls_error_cnt.inc();
if !suppress_io_error(&err) {
info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
}
@@ -135,7 +97,6 @@ impl Server {
}
};
if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
https_error_cnt.inc();
if !suppress_hyper_error(&err) {
info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
}
@@ -143,9 +104,7 @@ impl Server {
}
None => {
// Handle HTTP connection.
http_connection_cnt.inc();
if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
http_error_cnt.inc();
if !suppress_hyper_error(&err) {
info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
}
@@ -156,7 +115,6 @@ impl Server {
}
Some(conn) = connections.next() => {
if let Err(err) = conn {
panic_error_cnt.inc();
error!("Connection panicked: {err:#}");
}
}
@@ -164,7 +122,6 @@ impl Server {
// Wait for graceful shutdown of all connections.
while let Some(conn) = connections.next().await {
if let Err(err) = conn {
panic_error_cnt.inc();
error!("Connection panicked: {err:#}");
}
}

View File

@@ -3,14 +3,11 @@ use std::{sync::Arc, time::Duration};
use anyhow::Context;
use arc_swap::ArcSwap;
use camino::Utf8Path;
use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
use once_cell::sync::Lazy;
use rustls::{
pki_types::{CertificateDer, PrivateKeyDer, UnixTime},
pki_types::{CertificateDer, PrivateKeyDer},
server::{ClientHello, ResolvesServerCert},
sign::CertifiedKey,
};
use x509_cert::der::Reader;
pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
let cert_data = tokio::fs::read(filename)
@@ -56,76 +53,6 @@ pub async fn load_certified_key(
Ok(certified_key)
}
/// rustls's CertifiedKey with extra parsed fields used for metrics.
struct ParsedCertifiedKey {
certified_key: CertifiedKey,
expiration_time: UnixTime,
}
/// Parse expiration time from an X509 certificate.
fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result<UnixTime> {
let parsed_cert = x509_cert::der::SliceReader::new(cert)
.context("Failed to parse cerficiate")?
.decode::<x509_cert::Certificate>()
.context("Failed to parse cerficiate")?;
Ok(UnixTime::since_unix_epoch(
parsed_cert
.tbs_certificate
.validity
.not_after
.to_unix_duration(),
))
}
async fn load_and_parse_certified_key(
key_filename: &Utf8Path,
cert_filename: &Utf8Path,
) -> anyhow::Result<ParsedCertifiedKey> {
let certified_key = load_certified_key(key_filename, cert_filename).await?;
let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?;
Ok(ParsedCertifiedKey {
certified_key,
expiration_time,
})
}
static CERT_EXPIRATION_TIME: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"tls_certs_expiration_time_seconds",
"Expiration time of the loaded certificate since unix epoch in seconds",
&["resolver_name"]
)
.expect("failed to define a metric")
});
static CERT_RELOAD_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"tls_certs_reload_started_total",
"Number of certificate reload loop iterations started",
&["resolver_name"]
)
.expect("failed to define a metric")
});
static CERT_RELOAD_UPDATED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"tls_certs_reload_updated_total",
"Number of times the certificate was updated to the new one",
&["resolver_name"]
)
.expect("failed to define a metric")
});
static CERT_RELOAD_FAILED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"tls_certs_reload_failed_total",
"Number of times the certificate reload failed",
&["resolver_name"]
)
.expect("failed to define a metric")
});
/// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from
/// the disk periodically.
#[derive(Debug)]
@@ -136,28 +63,16 @@ pub struct ReloadingCertificateResolver {
impl ReloadingCertificateResolver {
/// Creates a new Resolver by loading certificate and private key from FS and
/// creating tokio::task to reload them with provided reload_period.
/// resolver_name is used as metric's label.
pub async fn new(
resolver_name: &str,
key_filename: &Utf8Path,
cert_filename: &Utf8Path,
reload_period: Duration,
) -> anyhow::Result<Arc<Self>> {
// Create metrics for current resolver.
let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]);
let cert_reload_started_counter =
CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]);
let cert_reload_updated_counter =
CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]);
let cert_reload_failed_counter =
CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]);
let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?;
let this = Arc::new(Self {
certified_key: ArcSwap::from_pointee(parsed_key.certified_key),
certified_key: ArcSwap::from_pointee(
load_certified_key(key_filename, cert_filename).await?,
),
});
cert_expiration_time.set(parsed_key.expiration_time.as_secs());
tokio::spawn({
let weak_this = Arc::downgrade(&this);
@@ -173,22 +88,17 @@ impl ReloadingCertificateResolver {
Some(this) => this,
None => break, // Resolver has been destroyed, exit.
};
cert_reload_started_counter.inc();
match load_and_parse_certified_key(&key_filename, &cert_filename).await {
Ok(parsed_key) => {
if parsed_key.certified_key.cert == this.certified_key.load().cert {
match load_certified_key(&key_filename, &cert_filename).await {
Ok(new_certified_key) => {
if new_certified_key.cert == this.certified_key.load().cert {
tracing::debug!("Certificate has not changed since last reloading");
} else {
tracing::info!("Certificate has been reloaded");
this.certified_key.store(Arc::new(parsed_key.certified_key));
cert_expiration_time.set(parsed_key.expiration_time.as_secs());
cert_reload_updated_counter.inc();
this.certified_key.store(Arc::new(new_certified_key));
}
last_reload_failed = false;
}
Err(err) => {
cert_reload_failed_counter.inc();
// Note: Reloading certs may fail if it conflicts with the script updating
// the files at the same time. Warn only if the error is persistent.
if last_reload_failed {

View File

@@ -35,7 +35,6 @@ nix = {workspace = true, optional = true}
reqwest.workspace = true
rand.workspace = true
tracing-utils.workspace = true
once_cell.workspace = true
[dev-dependencies]
bincode.workspace = true

View File

@@ -180,8 +180,6 @@ pub struct ConfigToml {
#[serde(skip_serializing_if = "Option::is_none")]
pub generate_unarchival_heatmap: Option<bool>,
pub tracing: Option<Tracing>,
pub enable_tls_page_service_api: bool,
pub dev_mode: bool,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -208,10 +206,6 @@ pub struct PageServicePipeliningConfigPipelined {
/// Causes runtime errors if larger than max get_vectored batch size.
pub max_batch_size: NonZeroUsize,
pub execution: PageServiceProtocolPipelinedExecutionStrategy,
// The default below is such that new versions of the software can start
// with the old configuration.
#[serde(default)]
pub batching: PageServiceProtocolPipelinedBatchingStrategy,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -221,19 +215,6 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy {
Tasks,
}
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum PageServiceProtocolPipelinedBatchingStrategy {
/// All get page requests in a batch will be at the same LSN
#[default]
UniformLsn,
/// Get page requests in a batch may be at different LSN
///
/// One key cannot be present more than once at different LSNs in
/// the same batch.
ScatteredLsn,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
pub enum GetVectoredConcurrentIo {
@@ -380,8 +361,6 @@ pub struct TenantConfigToml {
/// size exceeds `compaction_upper_limit * checkpoint_distance`.
pub compaction_upper_limit: usize,
pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
/// If true, enable shard ancestor compaction (enabled by default).
pub compaction_shard_ancestor: bool,
/// If true, compact down L0 across all tenant timelines before doing regular compaction. L0
/// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true.
pub compaction_l0_first: bool,
@@ -472,8 +451,6 @@ pub struct TenantConfigToml {
// gc-compaction related configs
/// Enable automatic gc-compaction trigger on this tenant.
pub gc_compaction_enabled: bool,
/// Enable verification of gc-compaction results.
pub gc_compaction_verification: bool,
/// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
/// gc-compaction will be triggered.
pub gc_compaction_initial_threshold_kb: u64,
@@ -635,12 +612,9 @@ impl Default for ConfigToml {
page_service_pipelining: if !cfg!(test) {
PageServicePipeliningConfig::Serial
} else {
// Do not turn this into the default until scattered reads have been
// validated and rolled-out fully.
PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
max_batch_size: NonZeroUsize::new(32).unwrap(),
execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn,
})
},
get_vectored_concurrent_io: if !cfg!(test) {
@@ -657,8 +631,6 @@ impl Default for ConfigToml {
load_previous_heatmap: None,
generate_unarchival_heatmap: None,
tracing: None,
enable_tls_page_service_api: false,
dev_mode: false,
}
}
}
@@ -681,13 +653,12 @@ pub mod tenant_conf_defaults {
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
pub const DEFAULT_COMPACTION_SHARD_ANCESTOR: bool = true;
// This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's
// 3/4*8=6 on most of our pageservers. Compacting 10 layers requires a maximum of
// DEFAULT_CHECKPOINT_DISTANCE*10 memory, that's 2560MB. So with this config, we can get a maximum peak
// compaction usage of 15360MB.
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 10;
// 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could
// be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So
// with this config, we can get a maximum peak compaction usage of 9 GB.
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20;
// Enable L0 compaction pass and semaphore by default. L0 compaction must be responsive to avoid
// read amp.
pub const DEFAULT_COMPACTION_L0_FIRST: bool = true;
@@ -704,11 +675,8 @@ pub mod tenant_conf_defaults {
// Relevant: https://github.com/neondatabase/neon/issues/3394
pub const DEFAULT_GC_PERIOD: &str = "1 hr";
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
// Currently, any value other than 0 will trigger image layer creation preemption immediately with L0 backpressure
// without looking at the exact number of L0 layers.
// It was expected to have the following behavior:
// > If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
// > layer creation will end immediately. Set to 0 to disable.
// If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
// layer creation will end immediately. Set to 0 to disable.
pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3;
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
@@ -722,7 +690,6 @@ pub mod tenant_conf_defaults {
// image layers should be created.
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
}
@@ -742,7 +709,6 @@ impl Default for TenantConfigToml {
compaction_algorithm: crate::models::CompactionAlgorithmSettings {
kind: DEFAULT_COMPACTION_ALGORITHM,
},
compaction_shard_ancestor: DEFAULT_COMPACTION_SHARD_ANCESTOR,
compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST,
compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE,
l0_flush_delay_threshold: None,
@@ -778,7 +744,6 @@ impl Default for TenantConfigToml {
wal_receiver_protocol_override: None,
rel_size_v2_enabled: false,
gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
sampling_ratio: None,

View File

@@ -7,8 +7,7 @@ use std::time::{Duration, Instant};
/// API (`/control/v1` prefix). Implemented by the server
/// in [`storage_controller::http`]
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::id::{NodeId, TenantId};
use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
use crate::shard::{ShardStripeSize, TenantShardId};
@@ -500,15 +499,6 @@ pub struct SafekeeperSchedulingPolicyRequest {
pub scheduling_policy: SkSchedulingPolicy,
}
/// Import request for safekeeper timelines.
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineImportRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub start_lsn: Lsn,
pub sk_set: Vec<NodeId>,
}
#[cfg(test)]
mod test {
use serde_json;

View File

@@ -927,7 +927,7 @@ impl Key {
/// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
#[inline(always)]
pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> {
pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match self.field1 {
0x00 => (
RelTag {
@@ -938,7 +938,7 @@ impl Key {
},
self.field6,
),
_ => return Err(ToRelBlockError(self.field1)),
_ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
})
}
}
@@ -951,17 +951,6 @@ impl std::str::FromStr for Key {
}
}
#[derive(Debug)]
pub struct ToRelBlockError(u8);
impl fmt::Display for ToRelBlockError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "unexpected value kind 0x{:02x}", self.0)
}
}
impl std::error::Error for ToRelBlockError {}
#[cfg(test)]
mod tests {
use std::str::FromStr;

View File

@@ -320,35 +320,6 @@ pub struct TimelineCreateRequest {
pub mode: TimelineCreateRequestMode,
}
impl TimelineCreateRequest {
pub fn mode_tag(&self) -> &'static str {
match &self.mode {
TimelineCreateRequestMode::Branch { .. } => "branch",
TimelineCreateRequestMode::ImportPgdata { .. } => "import",
TimelineCreateRequestMode::Bootstrap { .. } => "bootstrap",
}
}
pub fn is_import(&self) -> bool {
matches!(self.mode, TimelineCreateRequestMode::ImportPgdata { .. })
}
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub enum ShardImportStatus {
InProgress,
Done,
Error(String),
}
impl ShardImportStatus {
pub fn is_terminal(&self) -> bool {
match self {
ShardImportStatus::InProgress => false,
ShardImportStatus::Done | ShardImportStatus::Error(_) => true,
}
}
}
/// Storage controller specific extensions to [`TimelineInfo`].
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineCreateResponseStorcon {
@@ -555,8 +526,6 @@ pub struct TenantConfigPatch {
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_shard_ancestor: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_l0_first: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_l0_semaphore: FieldPatch<bool>,
@@ -607,8 +576,6 @@ pub struct TenantConfigPatch {
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_enabled: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_verification: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_ratio_percent: FieldPatch<u64>,
@@ -646,9 +613,6 @@ pub struct TenantConfig {
#[serde(skip_serializing_if = "Option::is_none")]
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
#[serde(skip_serializing_if = "Option::is_none")]
pub compaction_shard_ancestor: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub compaction_l0_first: Option<bool>,
@@ -732,9 +696,6 @@ pub struct TenantConfig {
#[serde(skip_serializing_if = "Option::is_none")]
pub gc_compaction_enabled: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub gc_compaction_verification: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub gc_compaction_initial_threshold_kb: Option<u64>,
@@ -758,7 +719,6 @@ impl TenantConfig {
mut compaction_threshold,
mut compaction_upper_limit,
mut compaction_algorithm,
mut compaction_shard_ancestor,
mut compaction_l0_first,
mut compaction_l0_semaphore,
mut l0_flush_delay_threshold,
@@ -784,7 +744,6 @@ impl TenantConfig {
mut wal_receiver_protocol_override,
mut rel_size_v2_enabled,
mut gc_compaction_enabled,
mut gc_compaction_verification,
mut gc_compaction_initial_threshold_kb,
mut gc_compaction_ratio_percent,
mut sampling_ratio,
@@ -807,9 +766,6 @@ impl TenantConfig {
.compaction_upper_limit
.apply(&mut compaction_upper_limit);
patch.compaction_algorithm.apply(&mut compaction_algorithm);
patch
.compaction_shard_ancestor
.apply(&mut compaction_shard_ancestor);
patch.compaction_l0_first.apply(&mut compaction_l0_first);
patch
.compaction_l0_semaphore
@@ -879,9 +835,6 @@ impl TenantConfig {
patch
.gc_compaction_enabled
.apply(&mut gc_compaction_enabled);
patch
.gc_compaction_verification
.apply(&mut gc_compaction_verification);
patch
.gc_compaction_initial_threshold_kb
.apply(&mut gc_compaction_initial_threshold_kb);
@@ -898,7 +851,6 @@ impl TenantConfig {
compaction_threshold,
compaction_upper_limit,
compaction_algorithm,
compaction_shard_ancestor,
compaction_l0_first,
compaction_l0_semaphore,
l0_flush_delay_threshold,
@@ -924,7 +876,6 @@ impl TenantConfig {
wal_receiver_protocol_override,
rel_size_v2_enabled,
gc_compaction_enabled,
gc_compaction_verification,
gc_compaction_initial_threshold_kb,
gc_compaction_ratio_percent,
sampling_ratio,
@@ -959,9 +910,6 @@ impl TenantConfig {
.as_ref()
.unwrap_or(&global_conf.compaction_algorithm)
.clone(),
compaction_shard_ancestor: self
.compaction_shard_ancestor
.unwrap_or(global_conf.compaction_shard_ancestor),
compaction_l0_first: self
.compaction_l0_first
.unwrap_or(global_conf.compaction_l0_first),
@@ -1026,9 +974,6 @@ impl TenantConfig {
gc_compaction_enabled: self
.gc_compaction_enabled
.unwrap_or(global_conf.gc_compaction_enabled),
gc_compaction_verification: self
.gc_compaction_verification
.unwrap_or(global_conf.gc_compaction_verification),
gc_compaction_initial_threshold_kb: self
.gc_compaction_initial_threshold_kb
.unwrap_or(global_conf.gc_compaction_initial_threshold_kb),
@@ -1846,34 +1791,8 @@ pub mod virtual_file {
}
impl IoMode {
pub fn preferred() -> Self {
// The default behavior when running Rust unit tests without any further
// flags is to use the newest behavior if available on the platform (Direct).
// The CI uses the following environment variable to unit tests for all
// different modes.
// NB: the Python regression & perf tests have their own defaults management
// that writes pageserver.toml; they do not use this variable.
if cfg!(test) {
use once_cell::sync::Lazy;
static CACHED: Lazy<IoMode> = Lazy::new(|| {
utils::env::var_serde_json_string(
"NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE",
)
.unwrap_or({
#[cfg(target_os = "linux")]
{
IoMode::Direct
}
#[cfg(not(target_os = "linux"))]
{
IoMode::Buffered
}
})
});
*CACHED
} else {
IoMode::Buffered
}
pub const fn preferred() -> Self {
Self::Buffered
}
}

View File

@@ -4,10 +4,10 @@
//! See docs/rfcs/025-generation-numbers.md
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TimelineId};
use utils::id::NodeId;
use crate::controller_api::NodeRegisterRequest;
use crate::models::{LocationConfigMode, ShardImportStatus};
use crate::models::LocationConfigMode;
use crate::shard::TenantShardId;
/// Upcall message sent by the pageserver to the configured `control_plane_api` on
@@ -62,10 +62,3 @@ pub struct ValidateResponseTenant {
pub id: TenantShardId,
pub valid: bool,
}
#[derive(Serialize, Deserialize)]
pub struct PutTimelineImportStatusRequest {
pub tenant_shard_id: TenantShardId,
pub timeline_id: TimelineId,
pub status: ShardImportStatus,
}

View File

@@ -14,9 +14,8 @@ use anyhow::{Context, Result};
use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions};
use azure_storage::StorageCredentials;
use azure_storage_blobs::blob::CopyStatus;
use azure_storage_blobs::blob::operations::GetBlobBuilder;
use azure_storage_blobs::blob::{Blob, CopyStatus};
use azure_storage_blobs::container::operations::ListBlobsBuilder;
use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient};
use bytes::Bytes;
use futures::FutureExt;
@@ -254,15 +253,53 @@ impl AzureBlobStorage {
download
}
fn list_streaming_for_fn<T: Default + ListingCollector>(
async fn permit(
&self,
kind: RequestKind,
cancel: &CancellationToken,
) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
let acquire = self.concurrency_limiter.acquire(kind);
tokio::select! {
permit = acquire => Ok(permit.expect("never closed")),
_ = cancel.cancelled() => Err(Cancelled),
}
}
pub fn container_name(&self) -> &str {
&self.container_name
}
}
fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
let mut res = Metadata::new();
for (k, v) in metadata.0.into_iter() {
res.insert(k, v);
}
res
}
fn to_download_error(error: azure_core::Error) -> DownloadError {
if let Some(http_err) = error.as_http_error() {
match http_err.status() {
StatusCode::NotFound => DownloadError::NotFound,
StatusCode::NotModified => DownloadError::Unmodified,
StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
_ => DownloadError::Other(anyhow::Error::new(error)),
}
} else {
DownloadError::Other(error.into())
}
}
impl RemoteStorage for AzureBlobStorage {
fn list_streaming(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
request_kind: RequestKind,
customize_builder: impl Fn(ListBlobsBuilder) -> ListBlobsBuilder,
) -> impl Stream<Item = Result<T, DownloadError>> {
) -> impl Stream<Item = Result<Listing, DownloadError>> {
// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix.map(|p| self.relative_path_to_name(p)).or_else(|| {
self.prefix_in_container.clone().map(|mut s| {
@@ -274,7 +311,7 @@ impl AzureBlobStorage {
});
async_stream::stream! {
let _permit = self.permit(request_kind, cancel).await?;
let _permit = self.permit(RequestKind::List, cancel).await?;
let mut builder = self.client.list_blobs();
@@ -290,8 +327,6 @@ impl AzureBlobStorage {
builder = builder.max_results(MaxResults::new(limit));
}
builder = customize_builder(builder);
let mut next_marker = None;
let mut timeout_try_cnt = 1;
@@ -347,20 +382,26 @@ impl AzureBlobStorage {
break;
};
let mut res = T::default();
let mut res = Listing::default();
next_marker = entry.continuation();
let prefix_iter = entry
.blobs
.prefixes()
.map(|prefix| self.name_to_relative_path(&prefix.name));
res.add_prefixes(self, prefix_iter);
res.prefixes.extend(prefix_iter);
let blob_iter = entry
.blobs
.blobs();
.blobs()
.map(|k| ListingObject{
key: self.name_to_relative_path(&k.name),
last_modified: k.properties.last_modified.into(),
size: k.properties.content_length,
}
);
for key in blob_iter {
res.add_blob(self, key);
res.keys.push(key);
if let Some(mut mk) = max_keys {
assert!(mk > 0);
@@ -382,128 +423,6 @@ impl AzureBlobStorage {
}
}
async fn permit(
&self,
kind: RequestKind,
cancel: &CancellationToken,
) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
let acquire = self.concurrency_limiter.acquire(kind);
tokio::select! {
permit = acquire => Ok(permit.expect("never closed")),
_ = cancel.cancelled() => Err(Cancelled),
}
}
pub fn container_name(&self) -> &str {
&self.container_name
}
}
trait ListingCollector {
fn add_prefixes(&mut self, abs: &AzureBlobStorage, prefix_it: impl Iterator<Item = RemotePath>);
fn add_blob(&mut self, abs: &AzureBlobStorage, blob: &Blob);
}
impl ListingCollector for Listing {
fn add_prefixes(
&mut self,
_abs: &AzureBlobStorage,
prefix_it: impl Iterator<Item = RemotePath>,
) {
self.prefixes.extend(prefix_it);
}
fn add_blob(&mut self, abs: &AzureBlobStorage, blob: &Blob) {
self.keys.push(ListingObject {
key: abs.name_to_relative_path(&blob.name),
last_modified: blob.properties.last_modified.into(),
size: blob.properties.content_length,
});
}
}
impl ListingCollector for crate::VersionListing {
fn add_prefixes(
&mut self,
_abs: &AzureBlobStorage,
_prefix_it: impl Iterator<Item = RemotePath>,
) {
// nothing
}
fn add_blob(&mut self, abs: &AzureBlobStorage, blob: &Blob) {
let id = crate::VersionId(blob.version_id.clone().expect("didn't find version ID"));
self.versions.push(crate::Version {
key: abs.name_to_relative_path(&blob.name),
last_modified: blob.properties.last_modified.into(),
kind: crate::VersionKind::Version(id),
});
}
}
fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
let mut res = Metadata::new();
for (k, v) in metadata.0.into_iter() {
res.insert(k, v);
}
res
}
fn to_download_error(error: azure_core::Error) -> DownloadError {
if let Some(http_err) = error.as_http_error() {
match http_err.status() {
StatusCode::NotFound => DownloadError::NotFound,
StatusCode::NotModified => DownloadError::Unmodified,
StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
_ => DownloadError::Other(anyhow::Error::new(error)),
}
} else {
DownloadError::Other(error.into())
}
}
impl RemoteStorage for AzureBlobStorage {
fn list_streaming(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> impl Stream<Item = Result<Listing, DownloadError>> {
let customize_builder = |builder| builder;
let kind = RequestKind::ListVersions;
self.list_streaming_for_fn(prefix, mode, max_keys, cancel, kind, customize_builder)
}
async fn list_versions(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> std::result::Result<crate::VersionListing, DownloadError> {
let customize_builder = |mut builder: ListBlobsBuilder| {
builder = builder.include_versions(true);
builder
};
let kind = RequestKind::ListVersions;
let mut stream = std::pin::pin!(self.list_streaming_for_fn(
prefix,
mode,
max_keys,
cancel,
kind,
customize_builder
));
let mut combined: crate::VersionListing =
stream.next().await.expect("At least one item required")?;
while let Some(list) = stream.next().await {
let list = list?;
combined.versions.extend(list.versions.into_iter());
}
Ok(combined)
}
async fn head_object(
&self,
key: &RemotePath,
@@ -613,12 +532,7 @@ impl RemoteStorage for AzureBlobStorage {
let mut builder = blob_client.get();
if let Some(ref etag) = opts.etag {
builder = builder.if_match(IfMatchCondition::NotMatch(etag.to_string()));
}
if let Some(ref version_id) = opts.version_id {
let version_id = azure_storage_blobs::prelude::VersionId::new(version_id.0.clone());
builder = builder.blob_versioning(version_id);
builder = builder.if_match(IfMatchCondition::NotMatch(etag.to_string()))
}
if let Some((start, end)) = opts.byte_range() {

View File

@@ -176,32 +176,6 @@ pub struct Listing {
pub keys: Vec<ListingObject>,
}
#[derive(Default)]
pub struct VersionListing {
pub versions: Vec<Version>,
}
pub struct Version {
pub key: RemotePath,
pub last_modified: SystemTime,
pub kind: VersionKind,
}
impl Version {
pub fn version_id(&self) -> Option<&VersionId> {
match &self.kind {
VersionKind::Version(id) => Some(id),
VersionKind::DeletionMarker => None,
}
}
}
#[derive(Debug)]
pub enum VersionKind {
DeletionMarker,
Version(VersionId),
}
/// Options for downloads. The default value is a plain GET.
pub struct DownloadOpts {
/// If given, returns [`DownloadError::Unmodified`] if the object still has
@@ -212,8 +186,6 @@ pub struct DownloadOpts {
/// The end of the byte range to download, or unbounded. Must be after the
/// start bound.
pub byte_end: Bound<u64>,
/// Optionally request a specific version of a key
pub version_id: Option<VersionId>,
/// Indicate whether we're downloading something small or large: this indirectly controls
/// timeouts: for something like an index/manifest/heatmap, we should time out faster than
/// for layer files
@@ -225,16 +197,12 @@ pub enum DownloadKind {
Small,
}
#[derive(Debug, Clone)]
pub struct VersionId(pub String);
impl Default for DownloadOpts {
fn default() -> Self {
Self {
etag: Default::default(),
byte_start: Bound::Unbounded,
byte_end: Bound::Unbounded,
version_id: None,
kind: DownloadKind::Large,
}
}
@@ -327,14 +295,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
Ok(combined)
}
async fn list_versions(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<VersionListing, DownloadError>;
/// Obtain metadata information about an object.
async fn head_object(
&self,
@@ -515,22 +475,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
// See [`RemoteStorage::list_versions`].
pub async fn list_versions<'a>(
&'a self,
prefix: Option<&'a RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &'a CancellationToken,
) -> Result<VersionListing, DownloadError> {
match self {
Self::LocalFs(s) => s.list_versions(prefix, mode, max_keys, cancel).await,
Self::AwsS3(s) => s.list_versions(prefix, mode, max_keys, cancel).await,
Self::AzureBlob(s) => s.list_versions(prefix, mode, max_keys, cancel).await,
Self::Unreliable(s) => s.list_versions(prefix, mode, max_keys, cancel).await,
}
}
// See [`RemoteStorage::head_object`].
pub async fn head_object(
&self,
@@ -783,7 +727,6 @@ impl ConcurrencyLimiter {
RequestKind::Copy => &self.write,
RequestKind::TimeTravel => &self.write,
RequestKind::Head => &self.read,
RequestKind::ListVersions => &self.read,
}
}

View File

@@ -445,16 +445,6 @@ impl RemoteStorage for LocalFs {
}
}
async fn list_versions(
&self,
_prefix: Option<&RemotePath>,
_mode: ListingMode,
_max_keys: Option<NonZeroU32>,
_cancel: &CancellationToken,
) -> Result<crate::VersionListing, DownloadError> {
unimplemented!()
}
async fn head_object(
&self,
key: &RemotePath,

View File

@@ -14,7 +14,6 @@ pub(crate) enum RequestKind {
Copy = 4,
TimeTravel = 5,
Head = 6,
ListVersions = 7,
}
use RequestKind::*;
@@ -30,7 +29,6 @@ impl RequestKind {
Copy => "copy_object",
TimeTravel => "time_travel_recover",
Head => "head_object",
ListVersions => "list_versions",
}
}
const fn as_index(&self) -> usize {
@@ -38,10 +36,7 @@ impl RequestKind {
}
}
const REQUEST_KIND_LIST: &[RequestKind] =
&[Get, Put, Delete, List, Copy, TimeTravel, Head, ListVersions];
const REQUEST_KIND_COUNT: usize = REQUEST_KIND_LIST.len();
const REQUEST_KIND_COUNT: usize = 7;
pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
impl<C> RequestTyped<C> {
@@ -50,11 +45,12 @@ impl<C> RequestTyped<C> {
}
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
let mut it = REQUEST_KIND_LIST.iter();
use RequestKind::*;
let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
let next = it.next().unwrap();
assert_eq!(index, next.as_index());
f(*next)
f(next)
});
if let Some(next) = it.next() {

View File

@@ -21,8 +21,9 @@ use aws_sdk_s3::config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep};
use aws_sdk_s3::error::SdkError;
use aws_sdk_s3::operation::get_object::GetObjectError;
use aws_sdk_s3::operation::head_object::HeadObjectError;
use aws_sdk_s3::types::{Delete, ObjectIdentifier, StorageClass};
use aws_sdk_s3::types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass};
use aws_smithy_async::rt::sleep::TokioSleep;
use aws_smithy_types::DateTime;
use aws_smithy_types::body::SdkBody;
use aws_smithy_types::byte_stream::ByteStream;
use aws_smithy_types::date_time::ConversionError;
@@ -45,7 +46,7 @@ use crate::support::PermitCarrying;
use crate::{
ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject,
MAX_KEYS_PER_DELETE_S3, REMOTE_STORAGE_PREFIX_SEPARATOR, RemotePath, RemoteStorage,
TimeTravelError, TimeoutOrCancel, Version, VersionId, VersionKind, VersionListing,
TimeTravelError, TimeoutOrCancel,
};
/// AWS S3 storage.
@@ -65,7 +66,6 @@ struct GetObjectRequest {
key: String,
etag: Option<String>,
range: Option<String>,
version_id: Option<String>,
}
impl S3Bucket {
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
@@ -251,7 +251,6 @@ impl S3Bucket {
.get_object()
.bucket(request.bucket)
.key(request.key)
.set_version_id(request.version_id)
.set_range(request.range);
if let Some(etag) = request.etag {
@@ -406,124 +405,6 @@ impl S3Bucket {
Ok(())
}
async fn list_versions_with_permit(
&self,
_permit: &tokio::sync::SemaphorePermit<'_>,
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<crate::VersionListing, DownloadError> {
// get the passed prefix or if it is not set use prefix_in_bucket value
let prefix = prefix
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| self.prefix_in_bucket.clone());
let warn_threshold = 3;
let max_retries = 10;
let is_permanent = |e: &_| matches!(e, DownloadError::Cancelled);
let mut key_marker = None;
let mut version_id_marker = None;
let mut versions_and_deletes = Vec::new();
loop {
let response = backoff::retry(
|| async {
let mut request = self
.client
.list_object_versions()
.bucket(self.bucket_name.clone())
.set_prefix(prefix.clone())
.set_key_marker(key_marker.clone())
.set_version_id_marker(version_id_marker.clone());
if let ListingMode::WithDelimiter = mode {
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
}
let op = request.send();
tokio::select! {
res = op => res.map_err(|e| DownloadError::Other(e.into())),
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
}
},
is_permanent,
warn_threshold,
max_retries,
"listing object versions",
cancel,
)
.await
.ok_or_else(|| DownloadError::Cancelled)
.and_then(|x| x)?;
tracing::trace!(
" Got List response version_id_marker={:?}, key_marker={:?}",
response.version_id_marker,
response.key_marker
);
let versions = response
.versions
.unwrap_or_default()
.into_iter()
.map(|version| {
let key = version.key.expect("response does not contain a key");
let key = self.s3_object_to_relative_path(&key);
let version_id = VersionId(version.version_id.expect("needing version id"));
let last_modified =
SystemTime::try_from(version.last_modified.expect("no last_modified"))?;
Ok(Version {
key,
last_modified,
kind: crate::VersionKind::Version(version_id),
})
});
let deletes = response
.delete_markers
.unwrap_or_default()
.into_iter()
.map(|version| {
let key = version.key.expect("response does not contain a key");
let key = self.s3_object_to_relative_path(&key);
let last_modified =
SystemTime::try_from(version.last_modified.expect("no last_modified"))?;
Ok(Version {
key,
last_modified,
kind: crate::VersionKind::DeletionMarker,
})
});
itertools::process_results(versions.chain(deletes), |n_vds| {
versions_and_deletes.extend(n_vds)
})
.map_err(DownloadError::Other)?;
fn none_if_empty(v: Option<String>) -> Option<String> {
v.filter(|v| !v.is_empty())
}
version_id_marker = none_if_empty(response.next_version_id_marker);
key_marker = none_if_empty(response.next_key_marker);
if version_id_marker.is_none() {
// The final response is not supposed to be truncated
if response.is_truncated.unwrap_or_default() {
return Err(DownloadError::Other(anyhow::anyhow!(
"Received truncated ListObjectVersions response for prefix={prefix:?}"
)));
}
break;
}
if let Some(max_keys) = max_keys {
if versions_and_deletes.len() >= max_keys.get().try_into().unwrap() {
return Err(DownloadError::Other(anyhow::anyhow!("too many versions")));
}
}
}
Ok(VersionListing {
versions: versions_and_deletes,
})
}
pub fn bucket_name(&self) -> &str {
&self.bucket_name
}
@@ -740,19 +621,6 @@ impl RemoteStorage for S3Bucket {
}
}
async fn list_versions(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<crate::VersionListing, DownloadError> {
let kind = RequestKind::ListVersions;
let permit = self.permit(kind, cancel).await?;
self.list_versions_with_permit(&permit, prefix, mode, max_keys, cancel)
.await
}
async fn head_object(
&self,
key: &RemotePath,
@@ -933,7 +801,6 @@ impl RemoteStorage for S3Bucket {
key: self.relative_path_to_s3_object(from),
etag: opts.etag.as_ref().map(|e| e.to_string()),
range: opts.byte_range_header(),
version_id: opts.version_id.as_ref().map(|v| v.0.to_owned()),
},
cancel,
)
@@ -978,25 +845,94 @@ impl RemoteStorage for S3Bucket {
let kind = RequestKind::TimeTravel;
let permit = self.permit(kind, cancel).await?;
let timestamp = DateTime::from(timestamp);
let done_if_after = DateTime::from(done_if_after);
tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
// Limit the number of versions deletions, mostly so that we don't
// keep requesting forever if the list is too long, as we'd put the
// list in RAM.
// Building a list of 100k entries that reaches the limit roughly takes
// 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
const COMPLEXITY_LIMIT: Option<NonZeroU32> = NonZeroU32::new(100_000);
// get the passed prefix or if it is not set use prefix_in_bucket value
let prefix = prefix
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| self.prefix_in_bucket.clone());
let mode = ListingMode::NoDelimiter;
let version_listing = self
.list_versions_with_permit(&permit, prefix, mode, COMPLEXITY_LIMIT, cancel)
let warn_threshold = 3;
let max_retries = 10;
let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);
let mut key_marker = None;
let mut version_id_marker = None;
let mut versions_and_deletes = Vec::new();
loop {
let response = backoff::retry(
|| async {
let op = self
.client
.list_object_versions()
.bucket(self.bucket_name.clone())
.set_prefix(prefix.clone())
.set_key_marker(key_marker.clone())
.set_version_id_marker(version_id_marker.clone())
.send();
tokio::select! {
res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
_ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
}
},
is_permanent,
warn_threshold,
max_retries,
"listing object versions for time_travel_recover",
cancel,
)
.await
.map_err(|err| match err {
DownloadError::Other(e) => TimeTravelError::Other(e),
DownloadError::Cancelled => TimeTravelError::Cancelled,
other => TimeTravelError::Other(other.into()),
})?;
let versions_and_deletes = version_listing.versions;
.ok_or_else(|| TimeTravelError::Cancelled)
.and_then(|x| x)?;
tracing::trace!(
" Got List response version_id_marker={:?}, key_marker={:?}",
response.version_id_marker,
response.key_marker
);
let versions = response
.versions
.unwrap_or_default()
.into_iter()
.map(VerOrDelete::from_version);
let deletes = response
.delete_markers
.unwrap_or_default()
.into_iter()
.map(VerOrDelete::from_delete_marker);
itertools::process_results(versions.chain(deletes), |n_vds| {
versions_and_deletes.extend(n_vds)
})
.map_err(TimeTravelError::Other)?;
fn none_if_empty(v: Option<String>) -> Option<String> {
v.filter(|v| !v.is_empty())
}
version_id_marker = none_if_empty(response.next_version_id_marker);
key_marker = none_if_empty(response.next_key_marker);
if version_id_marker.is_none() {
// The final response is not supposed to be truncated
if response.is_truncated.unwrap_or_default() {
return Err(TimeTravelError::Other(anyhow::anyhow!(
"Received truncated ListObjectVersions response for prefix={prefix:?}"
)));
}
break;
}
// Limit the number of versions deletions, mostly so that we don't
// keep requesting forever if the list is too long, as we'd put the
// list in RAM.
// Building a list of 100k entries that reaches the limit roughly takes
// 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
const COMPLEXITY_LIMIT: usize = 100_000;
if versions_and_deletes.len() >= COMPLEXITY_LIMIT {
return Err(TimeTravelError::TooManyVersions);
}
}
tracing::info!(
"Built list for time travel with {} versions and deletions",
@@ -1012,26 +948,24 @@ impl RemoteStorage for S3Bucket {
let mut vds_for_key = HashMap::<_, Vec<_>>::new();
for vd in &versions_and_deletes {
let Version { key, .. } = &vd;
let version_id = vd.version_id().map(|v| v.0.as_str());
if version_id == Some("null") {
let VerOrDelete {
version_id, key, ..
} = &vd;
if version_id == "null" {
return Err(TimeTravelError::Other(anyhow!(
"Received ListVersions response for key={key} with version_id='null', \
indicating either disabled versioning, or legacy objects with null version id values"
)));
}
tracing::trace!("Parsing version key={key} kind={:?}", vd.kind);
tracing::trace!(
"Parsing version key={key} version_id={version_id} kind={:?}",
vd.kind
);
vds_for_key.entry(key).or_default().push(vd);
}
let warn_threshold = 3;
let max_retries = 10;
let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);
for (key, versions) in vds_for_key {
let last_vd = versions.last().unwrap();
let key = self.relative_path_to_s3_object(key);
if last_vd.last_modified > done_if_after {
tracing::trace!("Key {key} has version later than done_if_after, skipping");
continue;
@@ -1056,11 +990,11 @@ impl RemoteStorage for S3Bucket {
do_delete = true;
} else {
match &versions[version_to_restore_to - 1] {
Version {
kind: VersionKind::Version(version_id),
VerOrDelete {
kind: VerOrDeleteKind::Version,
version_id,
..
} => {
let version_id = &version_id.0;
tracing::trace!("Copying old version {version_id} for {key}...");
// Restore the state to the last version by copying
let source_id =
@@ -1072,7 +1006,7 @@ impl RemoteStorage for S3Bucket {
.client
.copy_object()
.bucket(self.bucket_name.clone())
.key(&key)
.key(key)
.set_storage_class(self.upload_storage_class.clone())
.copy_source(&source_id)
.send();
@@ -1093,8 +1027,8 @@ impl RemoteStorage for S3Bucket {
.and_then(|x| x)?;
tracing::info!(%version_id, %key, "Copied old version in S3");
}
Version {
kind: VersionKind::DeletionMarker,
VerOrDelete {
kind: VerOrDeleteKind::DeleteMarker,
..
} => {
do_delete = true;
@@ -1102,7 +1036,7 @@ impl RemoteStorage for S3Bucket {
}
};
if do_delete {
if matches!(last_vd.kind, VersionKind::DeletionMarker) {
if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) {
// Key has since been deleted (but there was some history), no need to do anything
tracing::trace!("Key {key} already deleted, skipping.");
} else {
@@ -1130,6 +1064,62 @@ impl RemoteStorage for S3Bucket {
}
}
// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
struct VerOrDelete {
kind: VerOrDeleteKind,
last_modified: DateTime,
version_id: String,
key: String,
}
#[derive(Debug)]
enum VerOrDeleteKind {
Version,
DeleteMarker,
}
impl VerOrDelete {
fn with_kind(
kind: VerOrDeleteKind,
last_modified: Option<DateTime>,
version_id: Option<String>,
key: Option<String>,
) -> anyhow::Result<Self> {
let lvk = (last_modified, version_id, key);
let (Some(last_modified), Some(version_id), Some(key)) = lvk else {
anyhow::bail!(
"One (or more) of last_modified, key, and id is None. \
Is versioning enabled in the bucket? last_modified={:?}, version_id={:?}, key={:?}",
lvk.0,
lvk.1,
lvk.2,
);
};
Ok(Self {
kind,
last_modified,
version_id,
key,
})
}
fn from_version(v: ObjectVersion) -> anyhow::Result<Self> {
Self::with_kind(
VerOrDeleteKind::Version,
v.last_modified,
v.version_id,
v.key,
)
}
fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result<Self> {
Self::with_kind(
VerOrDeleteKind::DeleteMarker,
v.last_modified,
v.version_id,
v.key,
)
}
}
#[cfg(test)]
mod tests {
use std::num::NonZeroUsize;

View File

@@ -139,20 +139,6 @@ impl RemoteStorage for UnreliableWrapper {
self.inner.list(prefix, mode, max_keys, cancel).await
}
async fn list_versions(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<crate::VersionListing, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
.map_err(DownloadError::Other)?;
self.inner
.list_versions(prefix, mode, max_keys, cancel)
.await
}
async fn head_object(
&self,
key: &RemotePath,

View File

@@ -29,7 +29,6 @@ futures = { workspace = true }
jsonwebtoken.workspace = true
nix = { workspace = true, features = ["ioctl"] }
once_cell.workspace = true
pem.workspace = true
pin-project-lite.workspace = true
regex.workspace = true
serde.workspace = true

View File

@@ -11,8 +11,7 @@ use camino::Utf8Path;
use jsonwebtoken::{
Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, decode, encode,
};
use pem::Pem;
use serde::{Deserialize, Serialize, de::DeserializeOwned};
use serde::{Deserialize, Serialize};
use crate::id::TenantId;
@@ -74,10 +73,7 @@ impl SwappableJwtAuth {
pub fn swap(&self, jwt_auth: JwtAuth) {
self.0.swap(Arc::new(jwt_auth));
}
pub fn decode<D: DeserializeOwned>(
&self,
token: &str,
) -> std::result::Result<TokenData<D>, AuthError> {
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
self.0.load().decode(token)
}
}
@@ -152,10 +148,7 @@ impl JwtAuth {
/// The function tries the stored decoding keys in succession,
/// and returns the first yielding a successful result.
/// If there is no working decoding key, it returns the last error.
pub fn decode<D: DeserializeOwned>(
&self,
token: &str,
) -> std::result::Result<TokenData<D>, AuthError> {
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
let mut res = None;
for decoding_key in &self.decoding_keys {
res = Some(decode(token, decoding_key, &self.validation));
@@ -180,8 +173,8 @@ impl std::fmt::Debug for JwtAuth {
}
// this function is used only for testing purposes in CLI e g generate tokens during init
pub fn encode_from_key_file<S: Serialize>(claims: &S, pem: &Pem) -> Result<String> {
let key = EncodingKey::from_ed_der(pem.contents());
pub fn encode_from_key_file<S: Serialize>(claims: &S, key_data: &[u8]) -> Result<String> {
let key = EncodingKey::from_ed_pem(key_data)?;
Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?)
}
@@ -195,13 +188,13 @@ mod tests {
//
// openssl genpkey -algorithm ed25519 -out ed25519-priv.pem
// openssl pkey -in ed25519-priv.pem -pubout -out ed25519-pub.pem
const TEST_PUB_KEY_ED25519: &str = r#"
const TEST_PUB_KEY_ED25519: &[u8] = br#"
-----BEGIN PUBLIC KEY-----
MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w=
-----END PUBLIC KEY-----
"#;
const TEST_PRIV_KEY_ED25519: &str = r#"
const TEST_PRIV_KEY_ED25519: &[u8] = br#"
-----BEGIN PRIVATE KEY-----
MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
-----END PRIVATE KEY-----
@@ -229,9 +222,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
// Check it can be validated with the public key
let auth = JwtAuth::new(vec![
DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519.as_bytes()).unwrap(),
DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(),
]);
let claims_from_token: Claims = auth.decode(encoded_eddsa).unwrap().claims;
let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims;
assert_eq!(claims_from_token, expected_claims);
}
@@ -242,14 +235,13 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
scope: Scope::Tenant,
};
let pem = pem::parse(TEST_PRIV_KEY_ED25519).unwrap();
let encoded = encode_from_key_file(&claims, &pem).unwrap();
let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap();
// decode it back
let auth = JwtAuth::new(vec![
DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519.as_bytes()).unwrap(),
DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(),
]);
let decoded: TokenData<Claims> = auth.decode(&encoded).unwrap();
let decoded = auth.decode(&encoded).unwrap();
assert_eq!(decoded.claims, claims);
}

View File

@@ -1,5 +1,5 @@
[package]
name = "endpoint_storage"
name = "object_storage"
version = "0.0.1"
edition.workspace = true
license.workspace = true

View File

@@ -2,7 +2,7 @@ use anyhow::anyhow;
use axum::body::{Body, Bytes};
use axum::response::{IntoResponse, Response};
use axum::{Router, http::StatusCode};
use endpoint_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok};
use object_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok};
use remote_storage::TimeoutOrCancel;
use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, RemotePath};
use std::{sync::Arc, time::SystemTime, time::UNIX_EPOCH};
@@ -46,12 +46,12 @@ async fn metrics() -> Result {
async fn get(S3Path { path }: S3Path, state: State) -> Result {
info!(%path, "downloading");
let download_err = |err| {
if let DownloadError::NotFound = err {
info!(%path, %err, "downloading"); // 404 is not an issue of _this_ service
let download_err = |e| {
if let DownloadError::NotFound = e {
info!(%path, %e, "downloading"); // 404 is not an issue of _this_ service
return not_found(&path);
}
internal_error(err, &path, "downloading")
internal_error(e, &path, "downloading")
};
let cancel = state.cancel.clone();
let opts = &DownloadOpts::default();
@@ -249,7 +249,7 @@ mod tests {
};
let proxy = Storage {
auth: endpoint_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(),
auth: object_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(),
storage,
cancel: cancel.clone(),
max_upload_file_limit: usize::MAX,
@@ -343,14 +343,14 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
fn token() -> String {
let claims = endpoint_storage::Claims {
let claims = object_storage::Claims {
tenant_id: TENANT_ID,
timeline_id: TIMELINE_ID,
endpoint_id: ENDPOINT_ID.into(),
exp: u64::MAX,
};
let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
let header = jsonwebtoken::Header::new(endpoint_storage::VALIDATION_ALGO);
let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
jsonwebtoken::encode(&header, &claims, &key).unwrap()
}
@@ -364,10 +364,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
vec![TIMELINE_ID.to_string(), TimelineId::generate().to_string()],
vec![ENDPOINT_ID, "ep-ololo"]
)
// first one is fully valid path, second path is valid for GET as
// read paths may have different endpoint if tenant and timeline matches
// (needed for prewarming RO->RW replica)
.skip(2);
.skip(1);
for ((uri, method), (tenant, timeline, endpoint)) in iproduct!(routes(), args) {
info!(%uri, %method, %tenant, %timeline, %endpoint);
@@ -478,16 +475,6 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
requests_chain(chain.into_iter(), |_| token()).await;
}
#[testlog(tokio::test)]
async fn read_other_endpoint_data() {
let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/other_endpoint/key");
let chain = vec![
(uri.clone(), "GET", "", StatusCode::NOT_FOUND, false),
(uri.clone(), "PUT", "", StatusCode::UNAUTHORIZED, false),
];
requests_chain(chain.into_iter(), |_| token()).await;
}
fn delete_prefix_token(uri: &str) -> String {
use serde::Serialize;
let parts = uri.split("/").collect::<Vec<&str>>();
@@ -495,7 +482,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
struct PrefixClaims {
tenant_id: TenantId,
timeline_id: Option<TimelineId>,
endpoint_id: Option<endpoint_storage::EndpointId>,
endpoint_id: Option<object_storage::EndpointId>,
exp: u64,
}
let claims = PrefixClaims {
@@ -505,7 +492,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
exp: u64::MAX,
};
let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
let header = jsonwebtoken::Header::new(endpoint_storage::VALIDATION_ALGO);
let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
jsonwebtoken::encode(&header, &claims, &key).unwrap()
}

View File

@@ -169,19 +169,10 @@ impl FromRequestParts<Arc<Storage>> for S3Path {
.auth
.decode(bearer.token())
.map_err(|e| bad_request(e, "decoding token"))?;
// Read paths may have different endpoint ids. For readonly -> readwrite replica
// prewarming, endpoint must read other endpoint's data.
let endpoint_id = if parts.method == axum::http::Method::GET {
claims.endpoint_id.clone()
} else {
path.endpoint_id.clone()
};
let route = Claims {
tenant_id: path.tenant_id,
timeline_id: path.timeline_id,
endpoint_id,
endpoint_id: path.endpoint_id.clone(),
exp: claims.exp,
};
if route != claims {

View File

@@ -1,4 +1,4 @@
//! `endpoint_storage` is a service which provides API for uploading and downloading
//! `object_storage` is a service which provides API for uploading and downloading
//! files. It is used by compute and control plane for accessing LFC prewarm data.
//! This service is deployed either as a separate component or as part of compute image
//! for large computes.
@@ -33,7 +33,7 @@ async fn main() -> anyhow::Result<()> {
let config: String = std::env::args().skip(1).take(1).collect();
if config.is_empty() {
anyhow::bail!("Usage: endpoint_storage config.json")
anyhow::bail!("Usage: object_storage config.json")
}
info!("Reading config from {config}");
let config = std::fs::read_to_string(config.clone())?;
@@ -41,7 +41,7 @@ async fn main() -> anyhow::Result<()> {
info!("Reading pemfile from {}", config.pemfile.clone());
let pemfile = std::fs::read(config.pemfile.clone())?;
info!("Loading public key from {}", config.pemfile.clone());
let auth = endpoint_storage::JwtAuth::new(&pemfile)?;
let auth = object_storage::JwtAuth::new(&pemfile)?;
let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap();
info!("listening on {}", listener.local_addr().unwrap());
@@ -50,7 +50,7 @@ async fn main() -> anyhow::Result<()> {
let cancel = tokio_util::sync::CancellationToken::new();
app::check_storage_permissions(&storage, cancel.clone()).await?;
let proxy = std::sync::Arc::new(endpoint_storage::Storage {
let proxy = std::sync::Arc::new(object_storage::Storage {
auth,
storage,
cancel: cancel.clone(),

View File

@@ -10,8 +10,6 @@ default = []
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]
fuzz-read-path = ["testing"]
[dependencies]
anyhow.workspace = true
arc-swap.workspace = true
@@ -35,7 +33,6 @@ humantime.workspace = true
humantime-serde.workspace = true
hyper0.workspace = true
itertools.workspace = true
jsonwebtoken.workspace = true
md5.workspace = true
nix.workspace = true
# hack to get the number of worker threads tokio uses
@@ -78,7 +75,6 @@ metrics.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
pageserver_compaction.workspace = true
pem.workspace = true
postgres_connection.workspace = true
postgres_ffi.workspace = true
pq_proto.workspace = true

View File

@@ -11,7 +11,6 @@ use pageserver::task_mgr::TaskKind;
use pageserver::tenant::storage_layer::InMemoryLayer;
use pageserver::{page_cache, virtual_file};
use pageserver_api::key::Key;
use pageserver_api::models::virtual_file::IoMode;
use pageserver_api::shard::TenantShardId;
use pageserver_api::value::Value;
use tokio_util::sync::CancellationToken;
@@ -22,14 +21,13 @@ use wal_decoder::serialized_batch::SerializedValueBatch;
// A very cheap hash for generating non-sequential keys.
fn murmurhash32(mut h: u32) -> u32 {
h ^= h >> 16;
h h.wrapping_mul(0x85ebca6b);
h = h.wrapping_mul(0x85ebca6b);
h ^= h >> 13;
h = h.wrapping_mul(0xc2b2ae35);
h ^= h >> 16;
h
}
#[derive(serde::Serialize, Clone, Copy, Debug)]
enum KeyLayout {
/// Sequential unique keys
Sequential,
@@ -39,7 +37,6 @@ enum KeyLayout {
RandomReuse(u32),
}
#[derive(serde::Serialize, Clone, Copy, Debug)]
enum WriteDelta {
Yes,
No,
@@ -129,7 +126,7 @@ async fn ingest(
max_concurrency: NonZeroUsize::new(1).unwrap(),
});
let (_desc, path) = layer
.write_to_disk(&ctx, None, l0_flush_state.inner(), &gate, cancel.clone())
.write_to_disk(&ctx, None, l0_flush_state.inner())
.await?
.unwrap();
tokio::fs::remove_file(path).await?;
@@ -141,15 +138,12 @@ async fn ingest(
/// Wrapper to instantiate a tokio runtime
fn ingest_main(
conf: &'static PageServerConf,
io_mode: IoMode,
put_size: usize,
put_count: usize,
key_layout: KeyLayout,
write_delta: WriteDelta,
) {
pageserver::virtual_file::set_io_mode(io_mode);
let runtime = tokio::runtime::Builder::new_multi_thread()
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
@@ -180,207 +174,93 @@ fn criterion_benchmark(c: &mut Criterion) {
virtual_file::init(
16384,
virtual_file::io_engine_for_bench(),
// immaterial, each `ingest_main` invocation below overrides this
conf.virtual_file_io_mode,
// without actually doing syncs, buffered writes have an unfair advantage over direct IO writes
virtual_file::SyncMode::Sync,
);
page_cache::init(conf.page_cache_size);
#[derive(serde::Serialize)]
struct ExplodedParameters {
io_mode: IoMode,
volume_mib: usize,
key_size: usize,
key_layout: KeyLayout,
write_delta: WriteDelta,
}
#[derive(Clone)]
struct HandPickedParameters {
volume_mib: usize,
key_size: usize,
key_layout: KeyLayout,
write_delta: WriteDelta,
}
let expect = vec![
// Small values (100b) tests
HandPickedParameters {
volume_mib: 128,
key_size: 100,
key_layout: KeyLayout::Sequential,
write_delta: WriteDelta::Yes,
},
HandPickedParameters {
volume_mib: 128,
key_size: 100,
key_layout: KeyLayout::Random,
write_delta: WriteDelta::Yes,
},
HandPickedParameters {
volume_mib: 128,
key_size: 100,
key_layout: KeyLayout::RandomReuse(0x3ff),
write_delta: WriteDelta::Yes,
},
HandPickedParameters {
volume_mib: 128,
key_size: 100,
key_layout: KeyLayout::Sequential,
write_delta: WriteDelta::No,
},
// Large values (8k) tests
HandPickedParameters {
volume_mib: 128,
key_size: 8192,
key_layout: KeyLayout::Sequential,
write_delta: WriteDelta::Yes,
},
HandPickedParameters {
volume_mib: 128,
key_size: 8192,
key_layout: KeyLayout::Sequential,
write_delta: WriteDelta::No,
},
];
let exploded_parameters = {
let mut out = Vec::new();
for io_mode in [
IoMode::Buffered,
#[cfg(target_os = "linux")]
IoMode::Direct,
] {
for param in expect.clone() {
let HandPickedParameters {
volume_mib,
key_size,
key_layout,
write_delta,
} = param;
out.push(ExplodedParameters {
io_mode,
volume_mib,
key_size,
key_layout,
write_delta,
});
}
}
out
};
impl ExplodedParameters {
fn benchmark_id(&self) -> String {
let ExplodedParameters {
io_mode,
volume_mib,
key_size,
key_layout,
write_delta,
} = self;
format!(
"io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?}"
)
}
}
let mut group = c.benchmark_group("ingest");
for params in exploded_parameters {
let id = params.benchmark_id();
let ExplodedParameters {
io_mode,
volume_mib,
key_size,
key_layout,
write_delta,
} = params;
let put_count = volume_mib * 1024 * 1024 / key_size;
group.throughput(criterion::Throughput::Bytes((key_size * put_count) as u64));
{
let mut group = c.benchmark_group("ingest-small-values");
let put_size = 100usize;
let put_count = 128 * 1024 * 1024 / put_size;
group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
group.sample_size(10);
group.bench_function(id, |b| {
b.iter(|| ingest_main(conf, io_mode, key_size, put_count, key_layout, write_delta))
group.bench_function("ingest 128MB/100b seq", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/100b rand", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Random,
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/100b rand-1024keys", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::RandomReuse(0x3ff),
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/100b seq, no delta", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::No,
)
})
});
}
{
let mut group = c.benchmark_group("ingest-big-values");
let put_size = 8192usize;
let put_count = 128 * 1024 * 1024 / put_size;
group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
group.sample_size(10);
group.bench_function("ingest 128MB/8k seq", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/8k seq, no delta", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::No,
)
})
});
}
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
/*
cargo bench --bench bench_ingest
im4gn.2xlarge:
ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes
time: [1.8491 s 1.8540 s 1.8592 s]
thrpt: [68.847 MiB/s 69.039 MiB/s 69.222 MiB/s]
ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes
time: [2.6976 s 2.7123 s 2.7286 s]
thrpt: [46.911 MiB/s 47.193 MiB/s 47.450 MiB/s]
ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Y...
time: [1.7433 s 1.7510 s 1.7600 s]
thrpt: [72.729 MiB/s 73.099 MiB/s 73.423 MiB/s]
ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No
time: [499.63 ms 500.07 ms 500.46 ms]
thrpt: [255.77 MiB/s 255.96 MiB/s 256.19 MiB/s]
ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes
time: [456.97 ms 459.61 ms 461.92 ms]
thrpt: [277.11 MiB/s 278.50 MiB/s 280.11 MiB/s]
ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No
time: [158.82 ms 159.16 ms 159.56 ms]
thrpt: [802.22 MiB/s 804.24 MiB/s 805.93 MiB/s]
ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes
time: [1.8856 s 1.8997 s 1.9179 s]
thrpt: [66.740 MiB/s 67.380 MiB/s 67.882 MiB/s]
ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes
time: [2.7468 s 2.7625 s 2.7785 s]
thrpt: [46.068 MiB/s 46.335 MiB/s 46.600 MiB/s]
ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Yes
time: [1.7689 s 1.7726 s 1.7767 s]
thrpt: [72.045 MiB/s 72.208 MiB/s 72.363 MiB/s]
ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No
time: [497.64 ms 498.60 ms 499.67 ms]
thrpt: [256.17 MiB/s 256.72 MiB/s 257.21 MiB/s]
ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes
time: [493.72 ms 505.07 ms 518.03 ms]
thrpt: [247.09 MiB/s 253.43 MiB/s 259.26 MiB/s]
ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No
time: [267.76 ms 267.85 ms 267.96 ms]
thrpt: [477.69 MiB/s 477.88 MiB/s 478.03 MiB/s]
Hetzner AX102:
ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes
time: [1.0683 s 1.1006 s 1.1386 s]
thrpt: [112.42 MiB/s 116.30 MiB/s 119.82 MiB/s]
ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes
time: [1.5719 s 1.6012 s 1.6228 s]
thrpt: [78.877 MiB/s 79.938 MiB/s 81.430 MiB/s]
ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Y...
time: [1.1095 s 1.1331 s 1.1580 s]
thrpt: [110.53 MiB/s 112.97 MiB/s 115.37 MiB/s]
ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No
time: [303.20 ms 307.83 ms 311.90 ms]
thrpt: [410.39 MiB/s 415.81 MiB/s 422.16 MiB/s]
ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes
time: [406.34 ms 429.37 ms 451.63 ms]
thrpt: [283.42 MiB/s 298.11 MiB/s 315.00 MiB/s]
ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No
time: [134.01 ms 135.78 ms 137.48 ms]
thrpt: [931.03 MiB/s 942.68 MiB/s 955.12 MiB/s]
ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes
time: [1.0406 s 1.0580 s 1.0772 s]
thrpt: [118.83 MiB/s 120.98 MiB/s 123.00 MiB/s]
ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes
time: [1.5059 s 1.5339 s 1.5625 s]
thrpt: [81.920 MiB/s 83.448 MiB/s 84.999 MiB/s]
ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Yes
time: [1.0714 s 1.0934 s 1.1161 s]
thrpt: [114.69 MiB/s 117.06 MiB/s 119.47 MiB/s]
ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No
time: [262.68 ms 265.14 ms 267.71 ms]
thrpt: [478.13 MiB/s 482.76 MiB/s 487.29 MiB/s]
ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes
time: [375.19 ms 393.80 ms 411.40 ms]
thrpt: [311.14 MiB/s 325.04 MiB/s 341.16 MiB/s]
ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No
time: [123.02 ms 123.85 ms 124.66 ms]
thrpt: [1.0027 GiB/s 1.0093 GiB/s 1.0161 GiB/s]
*/

View File

@@ -419,23 +419,6 @@ impl Client {
}
}
pub async fn timeline_detail(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<TimelineInfo> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
self.mgmt_api_endpoint
);
self.request(Method::GET, &uri, ())
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn timeline_archival_config(
&self,
tenant_shard_id: TenantShardId,

View File

@@ -68,13 +68,6 @@ pub(crate) struct Args {
targets: Option<Vec<TenantTimelineId>>,
}
/// State shared by all clients
#[derive(Debug)]
struct SharedState {
start_work_barrier: tokio::sync::Barrier,
live_stats: LiveStats,
}
#[derive(Debug, Default)]
struct LiveStats {
completed_requests: AtomicU64,
@@ -247,26 +240,24 @@ async fn main_impl(
all_ranges
};
let live_stats = Arc::new(LiveStats::default());
let num_live_stats_dump = 1;
let num_work_sender_tasks = args.num_clients.get() * timelines.len();
let num_main_impl = 1;
let shared_state = Arc::new(SharedState {
start_work_barrier: tokio::sync::Barrier::new(
num_live_stats_dump + num_work_sender_tasks + num_main_impl,
),
live_stats: LiveStats::default(),
});
let cancel = CancellationToken::new();
let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
num_live_stats_dump + num_work_sender_tasks + num_main_impl,
));
let ss = shared_state.clone();
tokio::spawn({
let stats = Arc::clone(&live_stats);
let start_work_barrier = Arc::clone(&start_work_barrier);
async move {
ss.start_work_barrier.wait().await;
start_work_barrier.wait().await;
loop {
let start = std::time::Instant::now();
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
let stats = &ss.live_stats;
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
let missed = stats.missed.swap(0, Ordering::Relaxed);
let elapsed = start.elapsed();
@@ -279,12 +270,14 @@ async fn main_impl(
}
});
let cancel = CancellationToken::new();
let rps_period = args
.per_client_rate
.map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
let ss = shared_state.clone();
let cancel = cancel.clone();
let live_stats = live_stats.clone();
let start_work_barrier = start_work_barrier.clone();
let ranges: Vec<KeyRange> = all_ranges
.iter()
.filter(|r| r.timeline == worker_id.timeline)
@@ -294,8 +287,85 @@ async fn main_impl(
rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len()))
.unwrap();
let cancel = cancel.clone();
Box::pin(async move {
client_libpq(args, worker_id, ss, cancel, rps_period, ranges, weights).await
let client =
pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
.await
.unwrap();
let mut client = client
.pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
.await
.unwrap();
start_work_barrier.wait().await;
let client_start = Instant::now();
let mut ticks_processed = 0;
let mut inflight = VecDeque::new();
while !cancel.is_cancelled() {
// Detect if a request took longer than the RPS rate
if let Some(period) = &rps_period {
let periods_passed_until_now =
usize::try_from(client_start.elapsed().as_micros() / period.as_micros())
.unwrap();
if periods_passed_until_now > ticks_processed {
live_stats.missed((periods_passed_until_now - ticks_processed) as u64);
}
ticks_processed = periods_passed_until_now;
}
while inflight.len() < args.queue_depth.get() {
let start = Instant::now();
let req = {
let mut rng = rand::thread_rng();
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = Key::from_i128(key);
assert!(key.is_rel_block_key());
let (rel_tag, block_no) = key
.to_rel_block()
.expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
hdr: PagestreamRequest {
reqid: 0,
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
},
not_modified_since: r.timeline_lsn,
},
rel: rel_tag,
blkno: block_no,
}
};
client.getpage_send(req).await.unwrap();
inflight.push_back(start);
}
let start = inflight.pop_front().unwrap();
client.getpage_recv().await.unwrap();
let end = Instant::now();
live_stats.request_done();
ticks_processed += 1;
STATS.with(|stats| {
stats
.borrow()
.lock()
.unwrap()
.observe(end.duration_since(start))
.unwrap();
});
if let Some(period) = &rps_period {
let next_at = client_start
+ Duration::from_micros(
(ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
);
tokio::time::sleep_until(next_at.into()).await;
}
}
})
};
@@ -317,7 +387,7 @@ async fn main_impl(
};
info!("waiting for everything to become ready");
shared_state.start_work_barrier.wait().await;
start_work_barrier.wait().await;
info!("work started");
if let Some(runtime) = args.runtime {
tokio::time::sleep(runtime.into()).await;
@@ -346,91 +416,3 @@ async fn main_impl(
anyhow::Ok(())
}
async fn client_libpq(
args: &Args,
worker_id: WorkerId,
shared_state: Arc<SharedState>,
cancel: CancellationToken,
rps_period: Option<Duration>,
ranges: Vec<KeyRange>,
weights: rand::distributions::weighted::WeightedIndex<i128>,
) {
let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
.await
.unwrap();
let mut client = client
.pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
.await
.unwrap();
shared_state.start_work_barrier.wait().await;
let client_start = Instant::now();
let mut ticks_processed = 0;
let mut inflight = VecDeque::new();
while !cancel.is_cancelled() {
// Detect if a request took longer than the RPS rate
if let Some(period) = &rps_period {
let periods_passed_until_now =
usize::try_from(client_start.elapsed().as_micros() / period.as_micros()).unwrap();
if periods_passed_until_now > ticks_processed {
shared_state
.live_stats
.missed((periods_passed_until_now - ticks_processed) as u64);
}
ticks_processed = periods_passed_until_now;
}
while inflight.len() < args.queue_depth.get() {
let start = Instant::now();
let req = {
let mut rng = rand::thread_rng();
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = Key::from_i128(key);
assert!(key.is_rel_block_key());
let (rel_tag, block_no) = key
.to_rel_block()
.expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
hdr: PagestreamRequest {
reqid: 0,
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
},
not_modified_since: r.timeline_lsn,
},
rel: rel_tag,
blkno: block_no,
}
};
client.getpage_send(req).await.unwrap();
inflight.push_back(start);
}
let start = inflight.pop_front().unwrap();
client.getpage_recv().await.unwrap();
let end = Instant::now();
shared_state.live_stats.request_done();
ticks_processed += 1;
STATS.with(|stats| {
stats
.borrow()
.lock()
.unwrap()
.observe(end.duration_since(start))
.unwrap();
});
if let Some(period) = &rps_period {
let next_at = client_start
+ Duration::from_micros(
(ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
);
tokio::time::sleep_until(next_at.into()).await;
}
}
}

View File

@@ -34,7 +34,7 @@ use utils::lsn::Lsn;
use crate::context::RequestContext;
use crate::pgdatadir_mapping::Version;
use crate::tenant::storage_layer::IoConcurrency;
use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{PageReconstructError, Timeline};
#[derive(Debug, thiserror::Error)]
@@ -353,10 +353,9 @@ where
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
for part in slru_partitions.parts {
let query = VersionedKeySpaceQuery::uniform(part, self.lsn);
let blocks = self
.timeline
.get_vectored(query, self.io_concurrency.clone(), self.ctx)
.get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
.await?;
for (key, block) in blocks {

View File

@@ -9,7 +9,7 @@ use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use anyhow::{Context, anyhow};
use anyhow::{Context, anyhow, bail};
use camino::Utf8Path;
use clap::{Arg, ArgAction, Command};
use http_utils::tls_certs::ReloadingCertificateResolver;
@@ -79,6 +79,8 @@ fn main() -> anyhow::Result<()> {
return Ok(());
}
let dev_mode = arg_matches.get_flag("dev");
// Initialize up failpoints support
let scenario = failpoint_support::init();
@@ -99,6 +101,20 @@ fn main() -> anyhow::Result<()> {
let (conf, ignored) = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;
if !dev_mode {
if matches!(conf.http_auth_type, AuthType::Trust)
|| matches!(conf.pg_auth_type, AuthType::Trust)
{
bail!(
"Pageserver refuses to start with HTTP or PostgreSQL API authentication disabled.\n\
Run with --dev to allow running without authentication.\n\
This is insecure and should only be used in development environments."
);
}
} else {
warn!("Starting in dev mode: this may be an insecure configuration.");
}
// Initialize logging.
//
// It must be initialized before the custom panic hook is installed below.
@@ -416,18 +432,8 @@ fn start_pageserver(
// The storage_broker::connect call needs to happen inside a tokio runtime thread.
let broker_client = WALRECEIVER_RUNTIME
.block_on(async {
let tls_config = storage_broker::ClientTlsConfig::new().ca_certificates(
conf.ssl_ca_certs
.iter()
.map(pem::encode)
.map(storage_broker::Certificate::from_pem),
);
// Note: we do not attempt connecting here (but validate endpoints sanity).
storage_broker::connect(
conf.broker_endpoint.clone(),
conf.broker_keepalive_interval,
tls_config,
)
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)
})
.with_context(|| {
format!(
@@ -462,24 +468,6 @@ fn start_pageserver(
info!("Using auth for http API: {:#?}", conf.http_auth_type);
info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);
let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
{
let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new(
"main",
&conf.ssl_key_file,
&conf.ssl_cert_file,
conf.ssl_cert_reload_period,
))?;
let server_config = rustls::ServerConfig::builder()
.with_no_client_auth()
.with_cert_resolver(resolver);
Some(Arc::new(server_config))
} else {
None
};
match var("NEON_AUTH_TOKEN") {
Ok(v) => {
info!("Loaded JWT token for authentication with Safekeeper");
@@ -698,11 +686,17 @@ fn start_pageserver(
let https_task = match https_listener {
Some(https_listener) => {
let tls_server_config = tls_server_config
.clone()
.expect("tls_server_config is set earlier if https is enabled");
let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new(
&conf.ssl_key_file,
&conf.ssl_cert_file,
conf.ssl_cert_reload_period,
))?;
let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config);
let server_config = rustls::ServerConfig::builder()
.with_no_client_auth()
.with_cert_resolver(resolver);
let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
let server =
http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
@@ -758,11 +752,6 @@ fn start_pageserver(
tokio::net::TcpListener::from_std(pageserver_listener)
.context("create tokio listener")?
},
if conf.enable_tls_page_service_api {
tls_server_config
} else {
None
},
);
// All started up! Now just sit and wait for shutdown signal.
@@ -843,6 +832,12 @@ fn cli() -> Command {
.action(ArgAction::SetTrue)
.help("Show enabled compile time features"),
)
.arg(
Arg::new("dev")
.long("dev")
.action(ArgAction::SetTrue)
.help("Run in development mode (disables security checks)"),
)
}
#[test]

View File

@@ -17,10 +17,9 @@ use once_cell::sync::OnceCell;
use pageserver_api::config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes};
use pageserver_api::models::ImageCompressionAlgorithm;
use pageserver_api::shard::TenantShardId;
use pem::Pem;
use postgres_backend::AuthType;
use remote_storage::{RemotePath, RemoteStorageConfig};
use reqwest::Url;
use reqwest::{Certificate, Url};
use storage_broker::Uri;
use utils::id::{NodeId, TimelineId};
use utils::logging::{LogFormat, SecretString};
@@ -68,8 +67,8 @@ pub struct PageServerConf {
/// Period to reload certificate and private key from files.
/// Default: 60s.
pub ssl_cert_reload_period: Duration,
/// Trusted root CA certificates to use in https APIs in PEM format.
pub ssl_ca_certs: Vec<Pem>,
/// Trusted root CA certificates to use in https APIs.
pub ssl_ca_certs: Vec<Certificate>,
/// Current availability zone. Used for traffic metrics.
pub availability_zone: Option<String>,
@@ -119,13 +118,13 @@ pub struct PageServerConf {
/// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
pub concurrent_tenant_warmup: ConfigurableSemaphore,
/// Number of concurrent [`TenantShard::gather_size_inputs`](crate::tenant::TenantShard::gather_size_inputs) allowed.
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
/// Limit of concurrent [`TenantShard::gather_size_inputs`] issued by module `eviction_task`.
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
/// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
/// See the comment in `eviction_task` for details.
///
/// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,
// How often to collect metrics and send them to the metrics endpoint.
@@ -220,16 +219,6 @@ pub struct PageServerConf {
pub generate_unarchival_heatmap: bool,
pub tracing: Option<pageserver_api::config::Tracing>,
/// Enable TLS in page service API.
/// Does not force TLS: the client negotiates TLS usage during the handshake.
/// Uses key and certificate from ssl_key_file/ssl_cert_file.
pub enable_tls_page_service_api: bool,
/// Run in development mode, which disables certain safety checks
/// such as authentication requirements for HTTP and PostgreSQL APIs.
/// This is insecure and should only be used in development environments.
pub dev_mode: bool,
}
/// Token for authentication to safekeepers
@@ -402,8 +391,6 @@ impl PageServerConf {
load_previous_heatmap,
generate_unarchival_heatmap,
tracing,
enable_tls_page_service_api,
dev_mode,
} = config_toml;
let mut conf = PageServerConf {
@@ -454,8 +441,6 @@ impl PageServerConf {
page_service_pipelining,
get_vectored_concurrent_io,
tracing,
enable_tls_page_service_api,
dev_mode,
// ------------------------------------------------------------
// fields that require additional validation or custom handling
@@ -505,10 +490,7 @@ impl PageServerConf {
ssl_ca_certs: match ssl_ca_file {
Some(ssl_ca_file) => {
let buf = std::fs::read(ssl_ca_file)?;
pem::parse_many(&buf)?
.into_iter()
.filter(|pem| pem.tag() == "CERTIFICATE")
.collect()
Certificate::from_pem_bundle(&buf)?
}
None => Vec::new(),
},
@@ -599,10 +581,10 @@ impl ConfigurableSemaphore {
/// Initializse using a non-zero amount of permits.
///
/// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
/// feature such as [`TenantShard::gather_size_inputs`]. Otherwise any semaphore using future will
/// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
/// behave like [`futures::future::pending`], just waiting until new permits are added.
///
/// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
pub fn new(initial_permits: NonZeroUsize) -> Self {
ConfigurableSemaphore {
initial_permits,

View File

@@ -24,7 +24,7 @@ use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind};
use crate::tenant::mgr::TenantManager;
use crate::tenant::size::CalculateSyntheticSizeError;
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::{LogicalSizeCalculationCause, TenantShard};
use crate::tenant::{LogicalSizeCalculationCause, Tenant};
mod disk_cache;
mod metrics;
@@ -428,7 +428,7 @@ async fn calculate_synthetic_size_worker(
}
}
async fn calculate_and_log(tenant: &TenantShard, cancel: &CancellationToken, ctx: &RequestContext) {
async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
const CAUSE: LogicalSizeCalculationCause =
LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

View File

@@ -175,9 +175,9 @@ impl MetricsKey {
.absolute_values()
}
/// [`TenantShard::remote_size`]
/// [`Tenant::remote_size`]
///
/// [`TenantShard::remote_size`]: crate::tenant::TenantShard::remote_size
/// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
@@ -199,9 +199,9 @@ impl MetricsKey {
.absolute_values()
}
/// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
/// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
///
/// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size
/// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
/// [`calculate_synthetic_size_worker`]: super::calculate_synthetic_size_worker
const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
@@ -254,7 +254,7 @@ pub(super) async fn collect_all_metrics(
async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<NewRawMetric>
where
S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::TenantShard>)>,
S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::Tenant>)>,
{
let mut current_metrics: Vec<NewRawMetric> = Vec::new();
@@ -263,9 +263,7 @@ where
while let Some((tenant_id, tenant)) = tenants.next().await {
let mut tenant_resident_size = 0;
let timelines = tenant.list_timelines();
let timelines_len = timelines.len();
for timeline in timelines {
for timeline in tenant.list_timelines() {
let timeline_id = timeline.timeline_id;
match TimelineSnapshot::collect(&timeline, ctx) {
@@ -291,11 +289,6 @@ where
tenant_resident_size += timeline.resident_physical_size();
}
if timelines_len == 0 {
// Force set it to 1 byte to avoid not being reported -- all timelines are offloaded.
tenant_resident_size = 1;
}
let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);
snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics);
}
@@ -315,7 +308,7 @@ impl TenantSnapshot {
///
/// `resident_size` is calculated of the timelines we had access to for other metrics, so we
/// cannot just list timelines here.
fn collect(t: &Arc<crate::tenant::TenantShard>, resident_size: u64) -> Self {
fn collect(t: &Arc<crate::tenant::Tenant>, resident_size: u64) -> Self {
TenantSnapshot {
resident_size,
remote_size: t.remote_size(),

View File

@@ -3,19 +3,17 @@ use std::collections::HashMap;
use futures::Future;
use pageserver_api::config::NodeMetadata;
use pageserver_api::controller_api::{AvailabilityZone, NodeRegisterRequest};
use pageserver_api::models::ShardImportStatus;
use pageserver_api::shard::TenantShardId;
use pageserver_api::upcall_api::{
PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant,
ValidateRequest, ValidateRequestTenant, ValidateResponse,
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
ValidateRequestTenant, ValidateResponse,
};
use reqwest::Certificate;
use serde::Serialize;
use serde::de::DeserializeOwned;
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::generation::Generation;
use utils::id::{NodeId, TimelineId};
use utils::id::NodeId;
use utils::{backoff, failpoint_support};
use crate::config::PageServerConf;
@@ -47,12 +45,6 @@ pub trait StorageControllerUpcallApi {
&self,
tenants: Vec<(TenantShardId, Generation)>,
) -> impl Future<Output = Result<HashMap<TenantShardId, bool>, RetryForeverError>> + Send;
fn put_timeline_import_status(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
status: ShardImportStatus,
) -> impl Future<Output = Result<(), RetryForeverError>> + Send;
}
impl StorageControllerUpcallClient {
@@ -84,8 +76,8 @@ impl StorageControllerUpcallClient {
client = client.default_headers(headers);
}
for cert in &conf.ssl_ca_certs {
client = client.add_root_certificate(Certificate::from_der(cert.contents())?);
for ssl_ca_cert in &conf.ssl_ca_certs {
client = client.add_root_certificate(ssl_ca_cert.clone());
}
Ok(Some(Self {
@@ -280,30 +272,4 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
Ok(result.into_iter().collect())
}
/// Send a shard import status to the storage controller
///
/// The implementation must have at-least-once delivery semantics.
/// To this end, we retry the request until it succeeds. If the pageserver
/// restarts or crashes, the shard import will start again from the beggining.
#[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
async fn put_timeline_import_status(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
status: ShardImportStatus,
) -> Result<(), RetryForeverError> {
let url = self
.base_url
.join("timeline_import_status")
.expect("Failed to build path");
let request = PutTimelineImportStatusRequest {
tenant_shard_id,
timeline_id,
status,
};
self.retry_http_forever(&url, request).await
}
}

View File

@@ -787,15 +787,6 @@ mod test {
Ok(result)
}
async fn put_timeline_import_status(
&self,
_tenant_shard_id: TenantShardId,
_timeline_id: TimelineId,
_status: pageserver_api::models::ShardImportStatus,
) -> Result<(), RetryForeverError> {
unimplemented!()
}
}
async fn setup(test_name: &str) -> anyhow::Result<TestSetup> {

View File

@@ -1873,7 +1873,7 @@ async fn update_tenant_config_handler(
&ShardParameters::default(),
);
crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
@@ -1917,7 +1917,7 @@ async fn patch_tenant_config_handler(
&ShardParameters::default(),
);
crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
@@ -3253,7 +3253,7 @@ async fn ingest_aux_files(
modification
.put_file(&fname, content.as_bytes(), &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
.map_err(ApiError::InternalServerError)?;
}
modification
.commit(&ctx)

View File

@@ -27,7 +27,7 @@ use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::Timeline;
use crate::walingest::{WalIngest, WalIngestErrorKind};
use crate::walingest::WalIngest;
// Returns checkpoint LSN from controlfile
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
@@ -157,9 +157,9 @@ async fn import_rel(
.put_rel_creation(rel, nblocks as u32, ctx)
.await
{
match e.kind {
WalIngestErrorKind::RelationAlreadyExists(rel) => {
debug!("Relation {rel} already exists. We must be extending it.")
match e {
RelationError::AlreadyExists => {
debug!("Relation {} already exist. We must be extending it.", rel)
}
_ => return Err(e.into()),
}

View File

@@ -49,7 +49,7 @@ use tracing::{info, info_span};
/// backwards-compatible changes to the metadata format.
pub const STORAGE_FORMAT_VERSION: u16 = 3;
pub const DEFAULT_PG_VERSION: u32 = 17;
pub const DEFAULT_PG_VERSION: u32 = 16;
// Magic constants used to identify different kinds of files
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;

View File

@@ -17,7 +17,7 @@ use metrics::{
use once_cell::sync::Lazy;
use pageserver_api::config::{
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
PageServiceProtocolPipelinedExecutionStrategy,
};
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
@@ -1086,7 +1086,7 @@ pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy<UIntGauge> = Lazy::new(|| {
.expect("Failed to register metric")
});
/// Metrics related to the lifecycle of a [`crate::tenant::TenantShard`] object: things
/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
/// like how long it took to load.
///
/// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant
@@ -1714,28 +1714,6 @@ pub enum SmgrQueryType {
Test,
}
#[derive(
Debug,
Clone,
Copy,
IntoStaticStr,
strum_macros::EnumCount,
strum_macros::EnumIter,
strum_macros::FromRepr,
enum_map::Enum,
)]
#[strum(serialize_all = "snake_case")]
pub enum GetPageBatchBreakReason {
BatchFull,
NonBatchableRequest,
NonUniformLsn,
SamePageAtDifferentLsn,
NonUniformTimeline,
ExecutorSteal,
#[cfg(feature = "testing")]
NonUniformKey,
}
pub(crate) struct SmgrQueryTimePerTimeline {
global_started: [IntCounter; SmgrQueryType::COUNT],
global_latency: [Histogram; SmgrQueryType::COUNT],
@@ -1747,8 +1725,6 @@ pub(crate) struct SmgrQueryTimePerTimeline {
per_timeline_flush_in_progress_micros: IntCounter,
global_batch_wait_time: Histogram,
per_timeline_batch_wait_time: Histogram,
global_batch_break_reason: [IntCounter; GetPageBatchBreakReason::COUNT],
per_timeline_batch_break_reason: GetPageBatchBreakReasonTimelineMetrics,
throttling: Arc<tenant_throttling::Pagestream>,
}
@@ -1882,55 +1858,12 @@ static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::n
.expect("failed to define a metric")
});
static PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
// it's a counter, but, name is prepared to extend it to a histogram of queue depth
"pageserver_page_service_batch_break_reason_global",
"Reason for breaking batches of get page requests",
&["reason"],
)
.expect("failed to define a metric")
});
struct GetPageBatchBreakReasonTimelineMetrics {
map: EnumMap<GetPageBatchBreakReason, IntCounter>,
}
impl GetPageBatchBreakReasonTimelineMetrics {
fn new(tenant_id: &str, shard_slug: &str, timeline_id: &str) -> Self {
GetPageBatchBreakReasonTimelineMetrics {
map: EnumMap::from_array(std::array::from_fn(|reason_idx| {
let reason = GetPageBatchBreakReason::from_usize(reason_idx);
PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.with_label_values(&[
tenant_id,
shard_slug,
timeline_id,
reason.into(),
])
})),
}
}
fn inc(&self, reason: GetPageBatchBreakReason) {
self.map[reason].inc()
}
}
static PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_service_batch_break_reason",
"Reason for breaking batches of get page requests",
&["tenant_id", "shard_id", "timeline_id", "reason"],
)
.expect("failed to define a metric")
});
pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_page_service_config_max_batch_size",
"Configured maximum batch size for the server-side batching functionality of page_service. \
Labels expose more of the configuration parameters.",
&["mode", "execution", "batching"]
&["mode", "execution"]
)
.expect("failed to define a metric")
});
@@ -1938,11 +1871,10 @@ pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::
fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset();
let (label_values, value) = match conf {
PageServicePipeliningConfig::Serial => (["serial", "-", "-"], 1),
PageServicePipeliningConfig::Serial => (["serial", "-"], 1),
PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
max_batch_size,
execution,
batching,
}) => {
let mode = "pipelined";
let execution = match execution {
@@ -1951,12 +1883,7 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
}
PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks",
};
let batching = match batching {
PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => "uniform-lsn",
PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => "scattered-lsn",
};
([mode, execution, batching], max_batch_size.get())
([mode, execution], max_batch_size.get())
}
};
PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE
@@ -2052,15 +1979,6 @@ impl SmgrQueryTimePerTimeline {
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
.unwrap();
let global_batch_break_reason = std::array::from_fn(|i| {
let reason = GetPageBatchBreakReason::from_usize(i);
PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL
.get_metric_with_label_values(&[reason.into()])
.unwrap()
});
let per_timeline_batch_break_reason =
GetPageBatchBreakReasonTimelineMetrics::new(&tenant_id, &shard_slug, &timeline_id);
let global_flush_in_progress_micros =
PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
@@ -2078,8 +1996,6 @@ impl SmgrQueryTimePerTimeline {
per_timeline_flush_in_progress_micros,
global_batch_wait_time,
per_timeline_batch_wait_time,
global_batch_break_reason,
per_timeline_batch_break_reason,
throttling: pagestream_throttle_metrics,
}
}
@@ -2108,16 +2024,9 @@ impl SmgrQueryTimePerTimeline {
}
/// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer
pub(crate) fn observe_getpage_batch_start(
&self,
batch_size: usize,
break_reason: GetPageBatchBreakReason,
) {
pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
self.global_batch_size.observe(batch_size as f64);
self.per_timeline_batch_size.observe(batch_size as f64);
self.global_batch_break_reason[break_reason.into_usize()].inc();
self.per_timeline_batch_break_reason.inc(break_reason);
}
}
@@ -3483,15 +3392,6 @@ impl TimelineMetrics {
shard_id,
timeline_id,
]);
for reason in GetPageBatchBreakReason::iter() {
let _ = PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
reason.into(),
]);
}
}
}
@@ -4370,7 +4270,6 @@ pub fn preinitialize_metrics(
[
&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
&SMGR_QUERY_STARTED_GLOBAL,
&PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL,
]
.into_iter()
.for_each(|c| {

View File

@@ -15,11 +15,10 @@ use async_compression::tokio::write::GzipEncoder;
use bytes::Buf;
use futures::FutureExt;
use itertools::Itertools;
use jsonwebtoken::TokenData;
use once_cell::sync::OnceCell;
use pageserver_api::config::{
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
PageServiceProtocolPipelinedExecutionStrategy,
};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::models::{
@@ -59,8 +58,8 @@ use crate::context::{
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
};
use crate::metrics::{
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
SmgrOpTimer, TimelineMetrics,
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer,
TimelineMetrics,
};
use crate::pgdatadir_mapping::Version;
use crate::span::{
@@ -76,7 +75,7 @@ use crate::tenant::timeline::{self, WaitLsnError};
use crate::tenant::{GetTimelineError, PageReconstructError, Timeline};
use crate::{basebackup, timed_after_cancellation};
/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::TenantShard`] which
/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which
/// is not yet in state [`TenantState::Active`].
///
/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
@@ -106,7 +105,6 @@ pub fn spawn(
pg_auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
tcp_listener: tokio::net::TcpListener,
tls_config: Option<Arc<rustls::ServerConfig>>,
) -> Listener {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
@@ -126,7 +124,6 @@ pub fn spawn(
perf_trace_dispatch,
tcp_listener,
conf.pg_auth_type,
tls_config,
conf.page_service_pipelining.clone(),
libpq_ctx,
cancel.clone(),
@@ -184,7 +181,6 @@ pub async fn libpq_listener_main(
perf_trace_dispatch: Option<Dispatch>,
listener: tokio::net::TcpListener,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
listener_ctx: RequestContext,
listener_cancel: CancellationToken,
@@ -227,7 +223,6 @@ pub async fn libpq_listener_main(
local_auth,
socket,
auth_type,
tls_config.clone(),
pipelining_config.clone(),
connection_ctx,
connections_cancel.child_token(),
@@ -269,7 +264,6 @@ async fn page_service_conn_main(
auth: Option<Arc<SwappableJwtAuth>>,
socket: tokio::net::TcpStream,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
connection_ctx: RequestContext,
cancel: CancellationToken,
@@ -340,8 +334,7 @@ async fn page_service_conn_main(
cancel.clone(),
gate_guard,
);
let pgbackend =
PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?;
let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
match pgbackend.run(&mut conn_handler, &cancel).await {
Ok(()) => {
@@ -642,7 +635,6 @@ impl std::fmt::Display for BatchedPageStreamError {
struct BatchedGetPageRequest {
req: PagestreamGetPageRequest,
timer: SmgrOpTimer,
effective_request_lsn: Lsn,
ctx: RequestContext,
}
@@ -672,8 +664,8 @@ enum BatchedFeMessage {
GetPage {
span: Span,
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
effective_request_lsn: Lsn,
pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
batch_break_reason: GetPageBatchBreakReason,
},
DbSize {
span: Span,
@@ -726,119 +718,6 @@ impl BatchedFeMessage {
BatchedFeMessage::RespondError { .. } => {}
}
}
fn should_break_batch(
&self,
other: &BatchedFeMessage,
max_batch_size: NonZeroUsize,
batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
) -> Option<GetPageBatchBreakReason> {
match (self, other) {
(
BatchedFeMessage::GetPage {
shard: accum_shard,
pages: accum_pages,
..
},
BatchedFeMessage::GetPage {
shard: this_shard,
pages: this_pages,
..
},
) => {
assert_eq!(this_pages.len(), 1);
if accum_pages.len() >= max_batch_size.get() {
trace!(%max_batch_size, "stopping batching because of batch size");
assert_eq!(accum_pages.len(), max_batch_size.get());
return Some(GetPageBatchBreakReason::BatchFull);
}
if !accum_shard.is_same_handle_as(this_shard) {
trace!("stopping batching because timeline object mismatch");
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
// But the current logic for keeping responses in order does not support that.
return Some(GetPageBatchBreakReason::NonUniformTimeline);
}
match batching_strategy {
PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
if let Some(last_in_batch) = accum_pages.last() {
if last_in_batch.effective_request_lsn
!= this_pages[0].effective_request_lsn
{
trace!(
accum_lsn = %last_in_batch.effective_request_lsn,
this_lsn = %this_pages[0].effective_request_lsn,
"stopping batching because LSN changed"
);
return Some(GetPageBatchBreakReason::NonUniformLsn);
}
}
}
PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => {
// The read path doesn't curently support serving the same page at different LSNs.
// While technically possible, it's uncertain if the complexity is worth it.
// Break the batch if such a case is encountered.
let same_page_different_lsn = accum_pages.iter().any(|batched| {
batched.req.rel == this_pages[0].req.rel
&& batched.req.blkno == this_pages[0].req.blkno
&& batched.effective_request_lsn
!= this_pages[0].effective_request_lsn
});
if same_page_different_lsn {
trace!(
rel=%this_pages[0].req.rel,
blkno=%this_pages[0].req.blkno,
lsn=%this_pages[0].effective_request_lsn,
"stopping batching because same page was requested at different LSNs"
);
return Some(GetPageBatchBreakReason::SamePageAtDifferentLsn);
}
}
}
None
}
#[cfg(feature = "testing")]
(
BatchedFeMessage::Test {
shard: accum_shard,
requests: accum_requests,
..
},
BatchedFeMessage::Test {
shard: this_shard,
requests: this_requests,
..
},
) => {
assert!(this_requests.len() == 1);
if accum_requests.len() >= max_batch_size.get() {
trace!(%max_batch_size, "stopping batching because of batch size");
assert_eq!(accum_requests.len(), max_batch_size.get());
return Some(GetPageBatchBreakReason::BatchFull);
}
if !accum_shard.is_same_handle_as(this_shard) {
trace!("stopping batching because timeline object mismatch");
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
// But the current logic for keeping responses in order does not support that.
return Some(GetPageBatchBreakReason::NonUniformTimeline);
}
let this_batch_key = this_requests[0].req.batch_key;
let accum_batch_key = accum_requests[0].req.batch_key;
if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
return Some(GetPageBatchBreakReason::NonUniformKey);
}
None
}
(_, _) => Some(GetPageBatchBreakReason::NonBatchableRequest),
}
}
}
impl PageServerHandler {
@@ -1140,32 +1019,34 @@ impl PageServerHandler {
.await?;
// We're holding the Handle
let effective_request_lsn = match Self::effective_request_lsn(
// TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
let res = Self::wait_or_get_last_lsn(
&shard,
shard.get_last_record_lsn(),
req.hdr.request_lsn,
req.hdr.not_modified_since,
&shard.get_applied_gc_cutoff_lsn(),
) {
&ctx,
)
.maybe_perf_instrument(&ctx, |current_perf_span| {
info_span!(
target: PERF_TRACE_TARGET,
parent: current_perf_span,
"WAIT_LSN",
)
})
.await;
let effective_request_lsn = match res {
Ok(lsn) => lsn,
Err(e) => {
return respond_error!(span, e);
}
};
BatchedFeMessage::GetPage {
span,
shard: shard.downgrade(),
pages: smallvec::smallvec![BatchedGetPageRequest {
req,
timer,
effective_request_lsn,
ctx,
}],
// The executor grabs the batch when it becomes idle.
// Hence, [`GetPageBatchBreakReason::ExecutorSteal`] is the
// default reason for breaking the batch.
batch_break_reason: GetPageBatchBreakReason::ExecutorSteal,
effective_request_lsn,
pages: smallvec::smallvec![BatchedGetPageRequest { req, timer, ctx }],
}
}
#[cfg(feature = "testing")]
@@ -1191,7 +1072,6 @@ impl PageServerHandler {
#[instrument(skip_all, level = tracing::Level::TRACE)]
#[allow(clippy::boxed_local)]
fn pagestream_do_batch(
batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
max_batch_size: NonZeroUsize,
batch: &mut Result<BatchedFeMessage, QueryError>,
this_msg: Result<BatchedFeMessage, QueryError>,
@@ -1203,58 +1083,89 @@ impl PageServerHandler {
Err(e) => return Err(Err(e)),
};
let eligible_batch = match batch {
Ok(b) => b,
Err(_) => {
return Err(Ok(this_msg));
}
};
let batch_break =
eligible_batch.should_break_batch(&this_msg, max_batch_size, batching_strategy);
match batch_break {
Some(reason) => {
if let BatchedFeMessage::GetPage {
batch_break_reason, ..
} = eligible_batch
{
*batch_break_reason = reason;
match (&mut *batch, this_msg) {
// something batched already, let's see if we can add this message to the batch
(
Ok(BatchedFeMessage::GetPage {
span: _,
shard: accum_shard,
pages: accum_pages,
effective_request_lsn: accum_lsn,
}),
BatchedFeMessage::GetPage {
span: _,
shard: this_shard,
pages: this_pages,
effective_request_lsn: this_lsn,
},
) if (|| {
assert_eq!(this_pages.len(), 1);
if accum_pages.len() >= max_batch_size.get() {
trace!(%accum_lsn, %this_lsn, %max_batch_size, "stopping batching because of batch size");
assert_eq!(accum_pages.len(), max_batch_size.get());
return false;
}
Err(Ok(this_msg))
}
None => {
if !accum_shard.is_same_handle_as(&this_shard) {
trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch");
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
// But the current logic for keeping responses in order does not support that.
return false;
}
// the vectored get currently only supports a single LSN, so, bounce as soon
// as the effective request_lsn changes
if *accum_lsn != this_lsn {
trace!(%accum_lsn, %this_lsn, "stopping batching because LSN changed");
return false;
}
true
})() =>
{
// ok to batch
match (eligible_batch, this_msg) {
(
BatchedFeMessage::GetPage {
pages: accum_pages, ..
},
BatchedFeMessage::GetPage {
pages: this_pages, ..
},
) => {
accum_pages.extend(this_pages);
Ok(())
}
#[cfg(feature = "testing")]
(
BatchedFeMessage::Test {
requests: accum_requests,
..
},
BatchedFeMessage::Test {
requests: this_requests,
..
},
) => {
accum_requests.extend(this_requests);
Ok(())
}
// Shape guaranteed by [`BatchedFeMessage::should_break_batch`]
_ => unreachable!(),
accum_pages.extend(this_pages);
Ok(())
}
#[cfg(feature = "testing")]
(
Ok(BatchedFeMessage::Test {
shard: accum_shard,
requests: accum_requests,
..
}),
BatchedFeMessage::Test {
shard: this_shard,
requests: this_requests,
..
},
) if (|| {
assert!(this_requests.len() == 1);
if accum_requests.len() >= max_batch_size.get() {
trace!(%max_batch_size, "stopping batching because of batch size");
assert_eq!(accum_requests.len(), max_batch_size.get());
return false;
}
if !accum_shard.is_same_handle_as(&this_shard) {
trace!("stopping batching because timeline object mismatch");
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
// But the current logic for keeping responses in order does not support that.
return false;
}
let this_batch_key = this_requests[0].req.batch_key;
let accum_batch_key = accum_requests[0].req.batch_key;
if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
return false;
}
true
})() =>
{
// ok to batch
accum_requests.extend(this_requests);
Ok(())
}
// something batched already but this message is unbatchable
(_, this_msg) => {
// by default, don't continue batching
Err(Ok(this_msg))
}
}
}
@@ -1476,8 +1387,8 @@ impl PageServerHandler {
BatchedFeMessage::GetPage {
span,
shard,
effective_request_lsn,
pages,
batch_break_reason,
} => {
fail::fail_point!("ps::handle-pagerequest-message::getpage");
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
@@ -1488,9 +1399,9 @@ impl PageServerHandler {
let res = self
.handle_get_page_at_lsn_request_batched(
&shard,
effective_request_lsn,
pages,
io_concurrency,
batch_break_reason,
&ctx,
)
.instrument(span.clone())
@@ -1807,7 +1718,6 @@ impl PageServerHandler {
let PageServicePipeliningConfigPipelined {
max_batch_size,
execution,
batching: batching_strategy,
} = pipelining_config;
// Macro to _define_ a pipeline stage.
@@ -1859,7 +1769,7 @@ impl PageServerHandler {
exit |= read_res.is_err();
let could_send = batch_tx
.send(read_res, |batch, res| {
Self::pagestream_do_batch(batching_strategy, max_batch_size, batch, res)
Self::pagestream_do_batch(max_batch_size, batch, res)
})
.await;
exit |= could_send.is_err();
@@ -1955,39 +1865,7 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<Lsn, PageStreamError> {
let last_record_lsn = timeline.get_last_record_lsn();
let effective_request_lsn = Self::effective_request_lsn(
timeline,
last_record_lsn,
request_lsn,
not_modified_since,
latest_gc_cutoff_lsn,
)?;
if effective_request_lsn > last_record_lsn {
timeline
.wait_lsn(
not_modified_since,
crate::tenant::timeline::WaitLsnWaiter::PageService,
timeline::WaitLsnTimeout::Default,
ctx,
)
.await?;
// Since we waited for 'effective_request_lsn' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the last-record LSN can
// advance immediately after we return anyway)
}
Ok(effective_request_lsn)
}
fn effective_request_lsn(
timeline: &Timeline,
last_record_lsn: Lsn,
request_lsn: Lsn,
not_modified_since: Lsn,
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
) -> Result<Lsn, PageStreamError> {
// Sanity check the request
if request_lsn < not_modified_since {
return Err(PageStreamError::BadRequest(
@@ -2022,7 +1900,19 @@ impl PageServerHandler {
}
}
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
if not_modified_since > last_record_lsn {
timeline
.wait_lsn(
not_modified_since,
crate::tenant::timeline::WaitLsnWaiter::PageService,
timeline::WaitLsnTimeout::Default,
ctx,
)
.await?;
// Since we waited for 'not_modified_since' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the last-record LSN can
// advance immediately after we return anyway)
Ok(not_modified_since)
} else {
// It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
@@ -2177,16 +2067,16 @@ impl PageServerHandler {
async fn handle_get_page_at_lsn_request_batched(
&mut self,
timeline: &Timeline,
effective_lsn: Lsn,
requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
io_concurrency: IoConcurrency,
batch_break_reason: GetPageBatchBreakReason,
ctx: &RequestContext,
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
debug_assert_current_span_has_tenant_and_timeline_id();
timeline
.query_metrics
.observe_getpage_batch_start(requests.len(), batch_break_reason);
.observe_getpage_batch_start(requests.len());
// If a page trace is running, submit an event for this request.
if let Some(page_trace) = timeline.page_trace.load().as_ref() {
@@ -2196,81 +2086,20 @@ impl PageServerHandler {
// Ignore error (trace buffer may be full or tracer may have disconnected).
_ = page_trace.try_send(PageTraceEvent {
key,
effective_lsn: batch.effective_request_lsn,
effective_lsn,
time,
});
}
}
// If any request in the batch needs to wait for LSN, then do so now.
let mut perf_instrument = false;
let max_effective_lsn = requests
.iter()
.map(|req| {
if req.ctx.has_perf_span() {
perf_instrument = true;
}
req.effective_request_lsn
})
.max()
.expect("batch is never empty");
let ctx = match perf_instrument {
true => RequestContextBuilder::from(ctx)
.root_perf_span(|| {
info_span!(
target: PERF_TRACE_TARGET,
"GET_VECTORED",
tenant_id = %timeline.tenant_shard_id.tenant_id,
timeline_id = %timeline.timeline_id,
shard = %timeline.tenant_shard_id.shard_slug(),
%max_effective_lsn
)
})
.attached_child(),
false => ctx.attached_child(),
};
let last_record_lsn = timeline.get_last_record_lsn();
if max_effective_lsn > last_record_lsn {
if let Err(e) = timeline
.wait_lsn(
max_effective_lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
timeline::WaitLsnTimeout::Default,
&ctx,
)
.maybe_perf_instrument(&ctx, |current_perf_span| {
info_span!(
target: PERF_TRACE_TARGET,
parent: current_perf_span,
"WAIT_LSN",
)
})
.await
{
return Vec::from_iter(requests.into_iter().map(|req| {
Err(BatchedPageStreamError {
err: PageStreamError::from(e.clone()),
req: req.req.hdr,
})
}));
}
}
let results = timeline
.get_rel_page_at_lsn_batched(
requests.iter().map(|p| {
(
&p.req.rel,
&p.req.blkno,
p.effective_request_lsn,
p.ctx.attached_child(),
)
}),
requests
.iter()
.map(|p| (&p.req.rel, &p.req.blkno, p.ctx.attached_child())),
effective_lsn,
io_concurrency,
&ctx,
ctx,
)
.await;
assert_eq!(results.len(), requests.len());
@@ -2838,7 +2667,7 @@ where
) -> Result<(), QueryError> {
// this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
// which requires auth to be present
let data: TokenData<Claims> = self
let data = self
.auth
.as_ref()
.unwrap()

Some files were not shown because too many files have changed in this diff Show More