Compare commits

..

3 Commits

Author SHA1 Message Date
Vlad Lazar
fd5aa06e28 wal_decoder: add a comment to clarify the intention 2024-10-31 18:14:33 +01:00
Vlad Lazar
1cdebae470 wal_decoder: no-op on RM_BTREE_ID 2024-10-31 18:01:09 +01:00
Vlad Lazar
dc7004ec10 wal_decoder: add rate limited log for unexpecte rm ids
Problem

We would like to handle the tightening of unexpected WAL record
types, but don't know if this happens in the field.

Summary of Changes

Add a rate-limited WAL log on such events. The rate limiting is to
play it safe and guard against spewing in prod.
2024-10-31 13:49:48 +01:00
173 changed files with 2992 additions and 5949 deletions

View File

@@ -221,8 +221,6 @@ runs:
REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
const { REPORT_URL, COMMIT_SHA } = process.env

View File

@@ -0,0 +1,36 @@
name: "Set custom docker config directory"
description: "Create a directory for docker config and set DOCKER_CONFIG"
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
runs:
using: "composite"
steps:
- name: Show warning on GitHub-hosted runners
if: runner.environment == 'github-hosted'
shell: bash -euo pipefail {0}
run: |
# Using the following environment variables to find a path to the workflow file
# ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch
# ${GITHUB_REPOSITORY} - octocat/hello-world
# ${GITHUB_REF} - refs/heads/my_branch
# From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables
filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"}
filename=${filename_with_ref%"@$GITHUB_REF"}
# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message
title='Unnecessary usage of `.github/actions/set-docker-config-dir`'
message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners'
echo "::warning file=${filename},title=${title}::${message}"
- uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7
env:
DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }}
with:
main: |
mkdir -p "${DOCKER_CONFIG}"
echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV
post: |
if [ -d "${DOCKER_CONFIG}" ]; then
rm -r "${DOCKER_CONFIG}"
fi

View File

@@ -1,37 +0,0 @@
name: Check Codestyle Python
on:
workflow_call:
inputs:
build-tools-image:
description: 'build-tools image'
required: true
type: string
defaults:
run:
shell: bash -euxo pipefail {0}
jobs:
check-codestyle-python:
runs-on: [ self-hosted, small ]
container:
image: ${{ inputs.build-tools-image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/cache@v4
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}
- run: ./scripts/pysync
- run: poetry run ruff check .
- run: poetry run ruff format --check .
- run: poetry run mypy .

View File

@@ -64,7 +64,7 @@ jobs:
- uses: actions/checkout@v4
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
- uses: ./.github/actions/set-docker-config-dir
- uses: docker/setup-buildx-action@v3
with:
cache-binary: false

View File

@@ -90,10 +90,35 @@ jobs:
check-codestyle-python:
needs: [ check-permissions, build-build-tools-image ]
uses: ./.github/workflows/_check-codestyle-python.yml
with:
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
secrets: inherit
runs-on: [ self-hosted, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Cache poetry deps
uses: actions/cache@v4
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
run: ./scripts/pysync
- name: Run `ruff check` to ensure code format
run: poetry run ruff check .
- name: Run `ruff format` to ensure code format
run: poetry run ruff format --check .
- name: Run mypy to check types
run: poetry run mypy .
check-codestyle-jsonnet:
needs: [ check-permissions, build-build-tools-image ]
@@ -116,7 +141,6 @@ jobs:
# Check that the vendor/postgres-* submodules point to the
# corresponding REL_*_STABLE_neon branches.
check-submodules:
needs: [ check-permissions ]
runs-on: ubuntu-22.04
steps:
- name: Checkout
@@ -497,8 +521,6 @@ jobs:
REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
const { REPORT_URL_NEW, COMMIT_SHA } = process.env
@@ -530,7 +552,7 @@ jobs:
with:
submodules: true
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
- uses: ./.github/actions/set-docker-config-dir
- uses: docker/setup-buildx-action@v3
with:
cache-binary: false
@@ -621,7 +643,7 @@ jobs:
with:
submodules: true
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
- uses: ./.github/actions/set-docker-config-dir
- uses: docker/setup-buildx-action@v3
with:
cache-binary: false
@@ -802,7 +824,7 @@ jobs:
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
chmod +x vm-builder
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
- uses: ./.github/actions/set-docker-config-dir
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
@@ -838,7 +860,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
- uses: ./.github/actions/set-docker-config-dir
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}

View File

@@ -201,8 +201,6 @@ jobs:
REPORT_URL: ${{ steps.upload-stats.outputs.report-url }}
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
const { REPORT_URL, SHA } = process.env

View File

@@ -1,94 +0,0 @@
name: Pre-merge checks
on:
merge_group:
branches:
- main
defaults:
run:
shell: bash -euxo pipefail {0}
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
jobs:
get-changed-files:
runs-on: ubuntu-22.04
outputs:
python-changed: ${{ steps.python-src.outputs.any_changed }}
steps:
- uses: actions/checkout@v4
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
id: python-src
with:
files: |
.github/workflows/pre-merge-checks.yml
**/**.py
poetry.lock
pyproject.toml
- name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES
env:
PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }}
run: |
echo "${PYTHON_CHANGED_FILES}"
check-build-tools-image:
if: needs.get-changed-files.outputs.python-changed == 'true'
needs: [ get-changed-files ]
uses: ./.github/workflows/check-build-tools-image.yml
build-build-tools-image:
needs: [ check-build-tools-image ]
uses: ./.github/workflows/build-build-tools-image.yml
with:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
secrets: inherit
check-codestyle-python:
if: needs.get-changed-files.outputs.python-changed == 'true'
needs: [ get-changed-files, build-build-tools-image ]
uses: ./.github/workflows/_check-codestyle-python.yml
with:
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
secrets: inherit
# To get items from the merge queue merged into main we need to satisfy "Status checks that are required".
# Currently we require 2 jobs (checks with exact name):
# - conclusion
# - neon-cloud-e2e
conclusion:
if: always()
permissions:
statuses: write # for `github.repos.createCommitStatus(...)`
needs:
- get-changed-files
- check-codestyle-python
runs-on: ubuntu-22.04
steps:
- name: Create fake `neon-cloud-e2e` check
uses: actions/github-script@v7
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
const { repo, owner } = context.repo;
const targetUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`;
await github.rest.repos.createCommitStatus({
owner: owner,
repo: repo,
sha: context.sha,
context: `neon-cloud-e2e`,
state: `success`,
target_url: targetUrl,
description: `fake check for merge queue`,
});
- name: Fail the job if any of the dependencies do not succeed or skipped
run: exit 1
if: |
(contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true')
|| contains(needs.*.result, 'failure')
|| contains(needs.*.result, 'cancelled')

View File

@@ -23,7 +23,6 @@ on:
- Test Postgres client libraries
- Trigger E2E Tests
- cleanup caches by a branch
- Pre-merge checks
types: [completed]
jobs:

159
Cargo.lock generated
View File

@@ -310,6 +310,33 @@ dependencies = [
"zeroize",
]
[[package]]
name = "aws-lc-rs"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f95446d919226d587817a7d21379e6eb099b97b45110a7f272a444ca5c54070"
dependencies = [
"aws-lc-sys",
"mirai-annotations",
"paste",
"zeroize",
]
[[package]]
name = "aws-lc-sys"
version = "0.21.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3ddc4a5b231dd6958b140ff3151b6412b3f4321fab354f399eec8f14b06df62"
dependencies = [
"bindgen 0.69.5",
"cc",
"cmake",
"dunce",
"fs_extra",
"libc",
"paste",
]
[[package]]
name = "aws-runtime"
version = "1.4.3"
@@ -915,6 +942,29 @@ dependencies = [
"serde",
]
[[package]]
name = "bindgen"
version = "0.69.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
dependencies = [
"bitflags 2.4.1",
"cexpr",
"clang-sys",
"itertools 0.10.5",
"lazy_static",
"lazycell",
"log",
"prettyplease",
"proc-macro2",
"quote",
"regex",
"rustc-hash",
"shlex",
"syn 2.0.52",
"which",
]
[[package]]
name = "bindgen"
version = "0.70.1"
@@ -924,7 +974,7 @@ dependencies = [
"bitflags 2.4.1",
"cexpr",
"clang-sys",
"itertools 0.12.1",
"itertools 0.10.5",
"log",
"prettyplease",
"proc-macro2",
@@ -1170,6 +1220,15 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
[[package]]
name = "cmake"
version = "0.1.51"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a"
dependencies = [
"cc",
]
[[package]]
name = "colorchoice"
version = "1.0.0"
@@ -1270,9 +1329,9 @@ dependencies = [
[[package]]
name = "const-oid"
version = "0.9.6"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"
checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f"
[[package]]
name = "const-random"
@@ -1756,6 +1815,12 @@ dependencies = [
"syn 2.0.52",
]
[[package]]
name = "dunce"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
[[package]]
name = "dyn-clone"
version = "1.0.14"
@@ -2060,6 +2125,12 @@ dependencies = [
"tokio-util",
]
[[package]]
name = "fs_extra"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fsevent-sys"
version = "4.1.0"
@@ -2413,6 +2484,15 @@ dependencies = [
"digest",
]
[[package]]
name = "home"
version = "0.5.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5"
dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "hostname"
version = "0.4.0"
@@ -2908,6 +2988,12 @@ dependencies = [
"spin",
]
[[package]]
name = "lazycell"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "libc"
version = "0.2.150"
@@ -3138,6 +3224,12 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "mirai-annotations"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1"
[[package]]
name = "multimap"
version = "0.8.3"
@@ -4055,7 +4147,7 @@ dependencies = [
"bytes",
"once_cell",
"pq_proto",
"rustls 0.23.16",
"rustls 0.23.7",
"rustls-pemfile 2.1.1",
"serde",
"thiserror",
@@ -4084,7 +4176,7 @@ name = "postgres_ffi"
version = "0.1.0"
dependencies = [
"anyhow",
"bindgen",
"bindgen 0.70.1",
"bytes",
"crc32c",
"env_logger",
@@ -4222,7 +4314,7 @@ checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
dependencies = [
"bytes",
"heck 0.5.0",
"itertools 0.12.1",
"itertools 0.10.5",
"log",
"multimap",
"once_cell",
@@ -4242,7 +4334,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5"
dependencies = [
"anyhow",
"itertools 0.12.1",
"itertools 0.10.5",
"proc-macro2",
"quote",
"syn 2.0.52",
@@ -4330,7 +4422,7 @@ dependencies = [
"rsa",
"rstest",
"rustc-hash",
"rustls 0.23.16",
"rustls 0.23.7",
"rustls-native-certs 0.8.0",
"rustls-pemfile 2.1.1",
"scopeguard",
@@ -4341,8 +4433,6 @@ dependencies = [
"smallvec",
"smol_str",
"socket2",
"strum",
"strum_macros",
"subtle",
"thiserror",
"tikv-jemalloc-ctl",
@@ -4365,7 +4455,6 @@ dependencies = [
"walkdir",
"workspace_hack",
"x509-parser",
"zerocopy",
]
[[package]]
@@ -4743,7 +4832,6 @@ dependencies = [
"percent-encoding",
"pin-project-lite",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"rustls-pemfile 2.1.1",
"rustls-pki-types",
"serde",
@@ -5018,22 +5106,23 @@ dependencies = [
"log",
"ring",
"rustls-pki-types",
"rustls-webpki 0.102.8",
"rustls-webpki 0.102.2",
"subtle",
"zeroize",
]
[[package]]
name = "rustls"
version = "0.23.16"
version = "0.23.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e"
checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b"
dependencies = [
"aws-lc-rs",
"log",
"once_cell",
"ring",
"rustls-pki-types",
"rustls-webpki 0.102.8",
"rustls-webpki 0.102.2",
"subtle",
"zeroize",
]
@@ -5113,10 +5202,11 @@ dependencies = [
[[package]]
name = "rustls-webpki"
version = "0.102.8"
version = "0.102.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610"
dependencies = [
"aws-lc-rs",
"ring",
"rustls-pki-types",
"untrusted",
@@ -5147,7 +5237,6 @@ dependencies = [
"chrono",
"clap",
"crc32c",
"criterion",
"desim",
"fail",
"futures",
@@ -5155,7 +5244,6 @@ dependencies = [
"http 1.1.0",
"humantime",
"hyper 0.14.30",
"itertools 0.10.5",
"metrics",
"once_cell",
"parking_lot 0.12.1",
@@ -5735,7 +5823,6 @@ dependencies = [
"once_cell",
"parking_lot 0.12.1",
"prost",
"rustls 0.23.16",
"tokio",
"tonic",
"tonic-build",
@@ -5818,7 +5905,7 @@ dependencies = [
"postgres_ffi",
"remote_storage",
"reqwest 0.12.4",
"rustls 0.23.16",
"rustls 0.23.7",
"rustls-native-certs 0.8.0",
"serde",
"serde_json",
@@ -6251,7 +6338,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab"
dependencies = [
"ring",
"rustls 0.23.16",
"rustls 0.23.7",
"tokio",
"tokio-postgres",
"tokio-rustls 0.26.0",
@@ -6285,7 +6372,7 @@ version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4"
dependencies = [
"rustls 0.23.16",
"rustls 0.23.7",
"rustls-pki-types",
"tokio",
]
@@ -6694,7 +6781,7 @@ dependencies = [
"base64 0.22.1",
"log",
"once_cell",
"rustls 0.23.16",
"rustls 0.23.7",
"rustls-pki-types",
"url",
"webpki-roots 0.26.1",
@@ -6875,6 +6962,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"bytes",
"once_cell",
"pageserver_api",
"postgres_ffi",
"serde",
@@ -6898,7 +6986,7 @@ name = "walproposer"
version = "0.1.0"
dependencies = [
"anyhow",
"bindgen",
"bindgen 0.70.1",
"postgres_ffi",
"utils",
]
@@ -7073,6 +7161,18 @@ dependencies = [
"rustls-pki-types",
]
[[package]]
name = "which"
version = "4.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
dependencies = [
"either",
"home",
"once_cell",
"rustix",
]
[[package]]
name = "whoami"
version = "1.5.1"
@@ -7332,7 +7432,7 @@ dependencies = [
"hyper-util",
"indexmap 1.9.3",
"indexmap 2.0.1",
"itertools 0.12.1",
"itertools 0.10.5",
"lazy_static",
"libc",
"log",
@@ -7353,7 +7453,8 @@ dependencies = [
"regex-automata 0.4.3",
"regex-syntax 0.8.2",
"reqwest 0.12.4",
"rustls 0.23.16",
"rustls 0.23.7",
"rustls-webpki 0.102.2",
"scopeguard",
"serde",
"serde_json",
@@ -7378,7 +7479,6 @@ dependencies = [
"tracing",
"tracing-core",
"url",
"zerocopy",
"zeroize",
"zstd",
"zstd-safe",
@@ -7451,7 +7551,6 @@ version = "0.7.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d"
dependencies = [
"byteorder",
"zerocopy-derive",
]

View File

@@ -143,7 +143,7 @@ reqwest-retry = "0.5"
routerify = "3"
rpds = "0.13"
rustc-hash = "1.1.0"
rustls = { version = "0.23.16", default-features = false }
rustls = "0.23"
rustls-pemfile = "2"
scopeguard = "1.1"
sysinfo = "0.29.2"
@@ -174,7 +174,7 @@ tokio = { version = "1.17", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.12.0"
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
tokio-rustls = "0.26"
tokio-stream = "0.1"
tokio-tar = "0.3"
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
@@ -196,7 +196,6 @@ walkdir = "2.3.2"
rustls-native-certs = "0.8"
x509-parser = "0.16"
whoami = "1.5.1"
zerocopy = { version = "0.7", features = ["derive"] }
## TODO replace this with tracing
env_logger = "0.10"

View File

@@ -1,66 +1,12 @@
ARG DEBIAN_VERSION=bullseye
FROM debian:bookworm-slim AS pgcopydb_builder
ARG DEBIAN_VERSION
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
set -e && \
apt update && \
apt install -y --no-install-recommends \
ca-certificates wget gpg && \
wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor -o /usr/share/keyrings/postgresql-keyring.gpg && \
echo "deb [signed-by=/usr/share/keyrings/postgresql-keyring.gpg] http://apt.postgresql.org/pub/repos/apt bookworm-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \
apt-get update && \
apt install -y --no-install-recommends \
build-essential \
autotools-dev \
libedit-dev \
libgc-dev \
libpam0g-dev \
libreadline-dev \
libselinux1-dev \
libxslt1-dev \
libssl-dev \
libkrb5-dev \
zlib1g-dev \
liblz4-dev \
libpq5 \
libpq-dev \
libzstd-dev \
postgresql-16 \
postgresql-server-dev-16 \
postgresql-common \
python3-sphinx && \
wget -O /tmp/pgcopydb.tar.gz https://github.com/dimitri/pgcopydb/archive/refs/tags/v0.17.tar.gz && \
mkdir /tmp/pgcopydb && \
tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \
cd /tmp/pgcopydb && \
make -s clean && \
make -s -j12 install && \
libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \
mkdir -p /pgcopydb/lib && \
cp "$libpq_path" /pgcopydb/lib/; \
else \
# copy command below will fail if we don't have dummy files, so we create them for other debian versions
mkdir -p /usr/lib/postgresql/16/bin && touch /usr/lib/postgresql/16/bin/pgcopydb && \
mkdir -p mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \
fi
FROM debian:${DEBIAN_VERSION}-slim AS build_tools
FROM debian:${DEBIAN_VERSION}-slim
ARG DEBIAN_VERSION
# Add nonroot user
RUN useradd -ms /bin/bash nonroot -b /home
SHELL ["/bin/bash", "-c"]
RUN mkdir -p /pgcopydb/bin && \
mkdir -p /pgcopydb/lib && \
chmod -R 755 /pgcopydb && \
chown -R nonroot:nonroot /pgcopydb
COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb
COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5
# System deps
#
# 'gdb' is included so that we get backtraces of core dumps produced in
@@ -92,7 +38,7 @@ RUN set -e \
libseccomp-dev \
libsqlite3-dev \
libssl-dev \
$([[ "${DEBIAN_VERSION}" = "bullseye" ]] && echo libstdc++-10-dev || echo libstdc++-11-dev) \
$([[ "${DEBIAN_VERSION}" = "bullseye" ]] && libstdc++-10-dev || libstdc++-11-dev) \
libtool \
libxml2-dev \
libxmlsec1-dev \
@@ -289,13 +235,7 @@ RUN whoami \
&& cargo --version --verbose \
&& rustup --version --verbose \
&& rustc --version --verbose \
&& clang --version
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
LD_LIBRARY_PATH=/pgcopydb/lib /pgcopydb/bin/pgcopydb --version; \
else \
echo "pgcopydb is not available for ${DEBIAN_VERSION}"; \
fi
&& clang --version
# Set following flag to check in Makefile if its running in Docker
RUN touch /home/nonroot/.docker_build

View File

@@ -431,11 +431,14 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY compute/patches/rum.patch /rum.patch
# supports v17 since https://github.com/postgrespro/rum/commit/cb1edffc57736cd2a4455f8d0feab0d69928da25
# doesn't use releases since 1.3.13 - Sep 19, 2022
# use latest commit from the master branch
RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0feab0d69928da25.tar.gz -O rum.tar.gz && \
echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \
# maybe version-specific
# support for v17 is unknown
# last release 1.3.13 - Sep 19, 2022
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
patch -p1 < /rum.patch && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -559,8 +562,8 @@ RUN case "${PG_VERSION}" in \
export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
;; \
"v17") \
export TIMESCALEDB_VERSION=2.17.1 \
export TIMESCALEDB_CHECKSUM=6277cf43f5695e23dae1c5cfeba00474d730b66ed53665a84b787a6bb1a57e28 \
export TIMESCALEDB_VERSION=2.17.0 \
export TIMESCALEDB_CHECKSUM=155bf64391d3558c42f31ca0e523cfc6252921974f75298c9039ccad1c89811a \
;; \
esac && \
wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
@@ -956,31 +959,21 @@ RUN apt-get install -y protobuf-compiler && \
#
#########################################################################################
FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build
FROM rust-extensions-build AS pg-jsonschema-pg-build
ARG PG_VERSION
# version 0.3.3 supports v17
# last release v0.3.3 - Oct 16, 2024
#
# there were no breaking changes
# so we can use the same version for all postgres versions
RUN case "${PG_VERSION}" in \
"v14" | "v15" | "v16" | "v17") \
export PG_JSONSCHEMA_VERSION=0.3.3 \
export PG_JSONSCHEMA_CHECKSUM=40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
RUN case "${PG_VERSION}" in "v17") \
echo "pg_jsonschema does not yet have a release that supports pg17" && exit 0;; \
esac && \
wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v${PG_JSONSCHEMA_VERSION}.tar.gz -O pg_jsonschema.tar.gz && \
echo "${PG_JSONSCHEMA_CHECKSUM} pg_jsonschema.tar.gz" | sha256sum --check && \
wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \
echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
# see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8
# `unsafe-postgres` feature allows to build pgx extensions
# against postgres forks that decided to change their ABI name (like us).
# With that we can build extensions without forking them and using stock
# pgx. As this feature is new few manual version bumps were required.
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
@@ -991,27 +984,16 @@ RUN case "${PG_VERSION}" in \
#
#########################################################################################
FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build
FROM rust-extensions-build AS pg-graphql-pg-build
ARG PG_VERSION
# version 1.5.9 supports v17
# last release v1.5.9 - Oct 16, 2024
#
# there were no breaking changes
# so we can use the same version for all postgres versions
RUN case "${PG_VERSION}" in \
"v14" | "v15" | "v16" | "v17") \
export PG_GRAPHQL_VERSION=1.5.9 \
export PG_GRAPHQL_CHECKSUM=cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
RUN case "${PG_VERSION}" in "v17") \
echo "pg_graphql does not yet have a release that supports pg17 as of now" && exit 0;; \
esac && \
wget https://github.com/supabase/pg_graphql/archive/refs/tags/v${PG_GRAPHQL_VERSION}.tar.gz -O pg_graphql.tar.gz && \
echo "${PG_GRAPHQL_CHECKSUM} pg_graphql.tar.gz" | sha256sum --check && \
wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \
echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \
mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
# it's needed to enable extension because it uses untrusted C language
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
@@ -1024,13 +1006,15 @@ RUN case "${PG_VERSION}" in \
#
#########################################################################################
FROM rust-extensions-build-pgrx12 AS pg-tiktoken-pg-build
FROM rust-extensions-build AS pg-tiktoken-pg-build
ARG PG_VERSION
# doesn't use releases
# 9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7 - on Oct 29, 2024
RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \
echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \
# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
RUN case "${PG_VERSION}" in "v17") \
echo "pg_tiktoken does not have versions, nor support for pg17" && exit 0;; \
esac && \
wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
# TODO update pgrx version in the pg_tiktoken repo and remove this line
sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \
@@ -1048,8 +1032,6 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04
FROM rust-extensions-build AS pg-pgx-ulid-build
ARG PG_VERSION
# doesn't support v17 yet
# https://github.com/pksunkara/pgx_ulid/pull/52
RUN case "${PG_VERSION}" in "v17") \
echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \
esac && \
@@ -1067,16 +1049,16 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM rust-extensions-build-pgrx12 AS pg-session-jwt-build
FROM rust-extensions-build AS pg-session-jwt-build
ARG PG_VERSION
# NOTE: local_proxy depends on the version of pg_session_jwt
# Do not update without approve from proxy team
# Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \
echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \
RUN case "${PG_VERSION}" in "v17") \
echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \
esac && \
wget https://github.com/neondatabase/pg_session_jwt/archive/e1310b08ba51377a19e0559e4d1194883b9b2ba2.tar.gz -O pg_session_jwt.tar.gz && \
echo "837932a077888d5545fd54b0abcc79e5f8e37017c2769a930afc2f5c94df6f4e pg_session_jwt.tar.gz" | sha256sum --check && \
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release
#########################################################################################
@@ -1151,8 +1133,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# The topmost commit in the `neon` branch at the time of writing this
# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/
# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af
ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af
# https://github.com/Mooncake-Labs/pg_mooncake/commit/568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d
ENV PG_MOONCAKE_VERSION=568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \

View File

@@ -3,7 +3,7 @@
metrics: [
import 'sql_exporter/checkpoints_req.libsonnet',
import 'sql_exporter/checkpoints_timed.libsonnet',
import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet',
import 'sql_exporter/compute_backpressure_throttling_ms.libsonnet',
import 'sql_exporter/compute_current_lsn.libsonnet',
import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
import 'sql_exporter/compute_receive_lsn.libsonnet',

View File

@@ -1,10 +1,10 @@
{
metric_name: 'compute_backpressure_throttling_seconds',
metric_name: 'compute_backpressure_throttling_ms',
type: 'gauge',
help: 'Time compute has spent throttled',
key_labels: null,
values: [
'throttled',
],
query: importstr 'sql_exporter/compute_backpressure_throttling_seconds.sql',
query: importstr 'sql_exporter/compute_backpressure_throttling_ms.sql',
}

View File

@@ -0,0 +1 @@
SELECT neon.backpressure_throttling_time() AS throttled;

View File

@@ -1 +0,0 @@
SELECT neon.backpressure_throttling_time()::float8 / 1000 AS throttled;

View File

@@ -364,29 +364,11 @@ impl ComputeNode {
let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
let basebackup_cmd = match lsn {
Lsn(0) => {
if spec.spec.mode != ComputeMode::Primary {
format!(
"basebackup {} {} --gzip --replica",
spec.tenant_id, spec.timeline_id
)
} else {
format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id)
}
}
_ => {
if spec.spec.mode != ComputeMode::Primary {
format!(
"basebackup {} {} {} --gzip --replica",
spec.tenant_id, spec.timeline_id, lsn
)
} else {
format!(
"basebackup {} {} {} --gzip",
spec.tenant_id, spec.timeline_id, lsn
)
}
}
Lsn(0) => format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id),
_ => format!(
"basebackup {} {} {} --gzip",
spec.tenant_id, spec.timeline_id, lsn
),
};
let copyreader = client.copy_out(basebackup_cmd.as_str())?;

View File

@@ -73,12 +73,6 @@ pub fn write_postgres_conf(
)?;
}
// Locales
writeln!(file, "lc_messages='C.UTF-8'")?;
writeln!(file, "lc_monetary='C.UTF-8'")?;
writeln!(file, "lc_time='C.UTF-8'")?;
writeln!(file, "lc_numeric='C.UTF-8'")?;
match spec.mode {
ComputeMode::Primary => {}
ComputeMode::Static(lsn) => {

View File

@@ -944,9 +944,6 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
other: Default::default(),
// Typical developer machines use disks with slow fsync, and we don't care
// about data integrity: disable disk syncs.
no_sync: true,
}
})
.collect(),

View File

@@ -225,7 +225,6 @@ pub struct PageServerConf {
pub listen_http_addr: String,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
pub no_sync: bool,
}
impl Default for PageServerConf {
@@ -236,7 +235,6 @@ impl Default for PageServerConf {
listen_http_addr: String::new(),
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
no_sync: false,
}
}
}
@@ -251,8 +249,6 @@ pub struct NeonLocalInitPageserverConf {
pub listen_http_addr: String,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub no_sync: bool,
#[serde(flatten)]
pub other: HashMap<String, toml::Value>,
}
@@ -265,7 +261,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
listen_http_addr,
pg_auth_type,
http_auth_type,
no_sync,
other: _,
} = conf;
Self {
@@ -274,7 +269,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
listen_http_addr: listen_http_addr.clone(),
pg_auth_type: *pg_auth_type,
http_auth_type: *http_auth_type,
no_sync: *no_sync,
}
}
}
@@ -575,8 +569,6 @@ impl LocalEnv {
listen_http_addr: String,
pg_auth_type: AuthType,
http_auth_type: AuthType,
#[serde(default)]
no_sync: bool,
}
let config_toml_path = dentry.path().join("pageserver.toml");
let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
@@ -599,7 +591,6 @@ impl LocalEnv {
listen_http_addr,
pg_auth_type,
http_auth_type,
no_sync,
} = config_toml;
let IdentityTomlSubset {
id: identity_toml_id,
@@ -616,7 +607,6 @@ impl LocalEnv {
listen_http_addr,
pg_auth_type,
http_auth_type,
no_sync,
};
pageservers.push(conf);
}

View File

@@ -273,7 +273,6 @@ impl PageServerNode {
)
})?;
let args = vec!["-D", datadir_path_str];
background_process::start_process(
"pageserver",
&datadir,
@@ -335,20 +334,17 @@ impl PageServerNode {
checkpoint_distance: settings
.remove("checkpoint_distance")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'checkpoint_distance' as an integer")?,
.transpose()?,
checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()),
compaction_target_size: settings
.remove("compaction_target_size")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'compaction_target_size' as an integer")?,
.transpose()?,
compaction_period: settings.remove("compaction_period").map(|x| x.to_string()),
compaction_threshold: settings
.remove("compaction_threshold")
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'compaction_threshold' as an integer")?,
.transpose()?,
compaction_algorithm: settings
.remove("compaction_algorithm")
.map(serde_json::from_str)
@@ -357,19 +353,16 @@ impl PageServerNode {
gc_horizon: settings
.remove("gc_horizon")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'gc_horizon' as an integer")?,
.transpose()?,
gc_period: settings.remove("gc_period").map(|x| x.to_string()),
image_creation_threshold: settings
.remove("image_creation_threshold")
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
.transpose()?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
.transpose()?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout")
@@ -410,11 +403,6 @@ impl PageServerNode {
lsn_lease_length_for_ts: settings
.remove("lsn_lease_length_for_ts")
.map(|x| x.to_string()),
timeline_offloading: settings
.remove("timeline_offloading")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'timeline_offloading' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -426,9 +414,97 @@ impl PageServerNode {
pub async fn tenant_config(
&self,
tenant_id: TenantId,
settings: HashMap<&str, &str>,
mut settings: HashMap<&str, &str>,
) -> anyhow::Result<()> {
let config = Self::parse_config(settings)?;
let config = {
// Braces to make the diff easier to read
models::TenantConfig {
checkpoint_distance: settings
.remove("checkpoint_distance")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'checkpoint_distance' as an integer")?,
checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()),
compaction_target_size: settings
.remove("compaction_target_size")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'compaction_target_size' as an integer")?,
compaction_period: settings.remove("compaction_period").map(|x| x.to_string()),
compaction_threshold: settings
.remove("compaction_threshold")
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'compaction_threshold' as an integer")?,
compaction_algorithm: settings
.remove("compactin_algorithm")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
gc_horizon: settings
.remove("gc_horizon")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'gc_horizon' as an integer")?,
gc_period: settings.remove("gc_period").map(|x| x.to_string()),
image_creation_threshold: settings
.remove("image_creation_threshold")
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout")
.map(|x| x.to_string()),
lagging_wal_timeout: settings
.remove("lagging_wal_timeout")
.map(|x| x.to_string()),
max_lsn_wal_lag: settings
.remove("max_lsn_wal_lag")
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
eviction_policy: settings
.remove("eviction_policy")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'eviction_policy' json")?,
min_resident_size_override: settings
.remove("min_resident_size_override")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'min_resident_size_override' as an integer")?,
evictions_low_residence_duration_metric_threshold: settings
.remove("evictions_low_residence_duration_metric_threshold")
.map(|x| x.to_string()),
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'lazy_slru_download' as bool")?,
timeline_get_throttle: settings
.remove("timeline_get_throttle")
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
lsn_lease_length_for_ts: settings
.remove("lsn_lease_length_for_ts")
.map(|x| x.to_string()),
}
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
}
self.http_client
.tenant_config(&models::TenantConfigRequest { tenant_id, config })
.await?;

View File

@@ -91,7 +91,7 @@ generating the basebackup by scanning the `REPL_ORIGIN_KEY_PREFIX` keyspace.
There are two places we need to read the aux files from the pageserver:
* On the write path, when the compute node adds an aux file to the pageserver, we will retrieve the key from the storage, append the file to the hashed key, and write it back. The current `get` API already supports that.
* We use the vectored get API to retrieve all aux files during generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API used to always attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error. Furthermore, as aux file reads usually need all layer files intersecting with that key range within the branch and cover a big keyspace, it incurs large overhead for tracking keyspaces that have not been read. Therefore, for sparse keyspaces, we [do not track](https://github.com/neondatabase/neon/pull/9631) `ummapped_keyspace`.
* We use the vectored get API to retrieve all aux files during generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API will attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error.
## Compaction and Image Layer Generation

View File

@@ -110,23 +110,6 @@ static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
pub const DISK_FSYNC_SECONDS_BUCKETS: &[f64] =
&[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0];
/// Constructs histogram buckets that are powers of two starting at 1 (i.e. 2^0), covering the end
/// points. For example, passing start=5,end=20 yields 4,8,16,32 as does start=4,end=32.
pub fn pow2_buckets(start: usize, end: usize) -> Vec<f64> {
assert_ne!(start, 0);
assert!(start <= end);
let start = match start.checked_next_power_of_two() {
Some(n) if n == start => n, // start already power of two
Some(n) => n >> 1, // power of two below start
None => panic!("start too large"),
};
let end = end.checked_next_power_of_two().expect("end too large");
std::iter::successors(Some(start), |n| n.checked_mul(2))
.take_while(|n| n <= &end)
.map(|n| n as f64)
.collect()
}
pub struct BuildInfo {
pub revision: &'static str,
pub build_tag: &'static str,
@@ -612,67 +595,3 @@ where
self.dec.collect_into(metadata, labels, name, &mut enc.0)
}
}
#[cfg(test)]
mod tests {
use super::*;
const POW2_BUCKETS_MAX: usize = 1 << (usize::BITS - 1);
#[test]
fn pow2_buckets_cases() {
assert_eq!(pow2_buckets(1, 1), vec![1.0]);
assert_eq!(pow2_buckets(1, 2), vec![1.0, 2.0]);
assert_eq!(pow2_buckets(1, 3), vec![1.0, 2.0, 4.0]);
assert_eq!(pow2_buckets(1, 4), vec![1.0, 2.0, 4.0]);
assert_eq!(pow2_buckets(1, 5), vec![1.0, 2.0, 4.0, 8.0]);
assert_eq!(pow2_buckets(1, 6), vec![1.0, 2.0, 4.0, 8.0]);
assert_eq!(pow2_buckets(1, 7), vec![1.0, 2.0, 4.0, 8.0]);
assert_eq!(pow2_buckets(1, 8), vec![1.0, 2.0, 4.0, 8.0]);
assert_eq!(
pow2_buckets(1, 200),
vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0]
);
assert_eq!(pow2_buckets(1, 8), vec![1.0, 2.0, 4.0, 8.0]);
assert_eq!(pow2_buckets(2, 8), vec![2.0, 4.0, 8.0]);
assert_eq!(pow2_buckets(3, 8), vec![2.0, 4.0, 8.0]);
assert_eq!(pow2_buckets(4, 8), vec![4.0, 8.0]);
assert_eq!(pow2_buckets(5, 8), vec![4.0, 8.0]);
assert_eq!(pow2_buckets(6, 8), vec![4.0, 8.0]);
assert_eq!(pow2_buckets(7, 8), vec![4.0, 8.0]);
assert_eq!(pow2_buckets(8, 8), vec![8.0]);
assert_eq!(pow2_buckets(20, 200), vec![16.0, 32.0, 64.0, 128.0, 256.0]);
// Largest valid values.
assert_eq!(
pow2_buckets(1, POW2_BUCKETS_MAX).len(),
usize::BITS as usize
);
assert_eq!(pow2_buckets(POW2_BUCKETS_MAX, POW2_BUCKETS_MAX).len(), 1);
}
#[test]
#[should_panic]
fn pow2_buckets_zero_start() {
pow2_buckets(0, 1);
}
#[test]
#[should_panic]
fn pow2_buckets_end_lt_start() {
pow2_buckets(2, 1);
}
#[test]
#[should_panic]
fn pow2_buckets_end_overflow_min() {
pow2_buckets(1, POW2_BUCKETS_MAX + 1);
}
#[test]
#[should_panic]
fn pow2_buckets_end_overflow_max() {
pow2_buckets(1, usize::MAX);
}
}

View File

@@ -64,7 +64,6 @@ pub struct ConfigToml {
#[serde(with = "humantime_serde")]
pub wal_redo_timeout: Duration,
pub superuser: String,
pub locale: String,
pub page_cache_size: usize,
pub max_file_descriptors: usize,
pub pg_distrib_dir: Option<Utf8PathBuf>,
@@ -107,8 +106,6 @@ pub struct ConfigToml {
pub ephemeral_bytes_per_memory_kb: usize,
pub l0_flush: Option<crate::models::L0FlushConfig>,
pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
#[serde(skip_serializing_if = "Option::is_none")]
pub no_sync: Option<bool>,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -262,10 +259,6 @@ pub struct TenantConfigToml {
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
#[serde(with = "humantime_serde")]
pub lsn_lease_length_for_ts: Duration,
/// Enable auto-offloading of timelines.
/// (either this flag or the pageserver-global one need to be set)
pub timeline_offloading: bool,
}
pub mod defaults {
@@ -277,7 +270,6 @@ pub mod defaults {
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
pub const DEFAULT_LOCALE: &str = "C.UTF-8";
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
@@ -328,7 +320,6 @@ impl Default for ConfigToml {
wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
.expect("cannot parse default wal redo timeout")),
superuser: (DEFAULT_SUPERUSER.to_string()),
locale: DEFAULT_LOCALE.to_string(),
page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir()
@@ -394,7 +385,6 @@ impl Default for ConfigToml {
l0_flush: None,
virtual_file_io_mode: None,
tenant_config: TenantConfigToml::default(),
no_sync: None,
}
}
}
@@ -481,7 +471,6 @@ impl Default for TenantConfigToml {
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
timeline_offloading: false,
}
}
}

View File

@@ -310,7 +310,6 @@ pub struct TenantConfig {
pub image_layer_creation_check_threshold: Option<u8>,
pub lsn_lease_length: Option<String>,
pub lsn_lease_length_for_ts: Option<String>,
pub timeline_offloading: Option<bool>,
}
/// The policy for the aux file storage.

View File

@@ -2,7 +2,7 @@
use once_cell::sync::Lazy;
use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
use pq_proto::{BeMessage, RowDescriptor};
use rustls::crypto::ring;
use rustls::crypto::aws_lc_rs;
use std::io::Cursor;
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
@@ -94,7 +94,7 @@ async fn simple_select_ssl() {
let (client_sock, server_sock) = make_tcp_pair().await;
let server_cfg =
rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider()))
rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
.with_safe_default_protocol_versions()
.expect("aws_lc_rs should support the default protocol versions")
.with_no_client_auth()
@@ -110,7 +110,7 @@ async fn simple_select_ssl() {
});
let client_cfg =
rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider()))
rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
.with_safe_default_protocol_versions()
.expect("aws_lc_rs should support the default protocol versions")
.with_root_certificates({

View File

@@ -170,6 +170,7 @@ pub const RM_RELMAP_ID: u8 = 7;
pub const RM_STANDBY_ID: u8 = 8;
pub const RM_HEAP2_ID: u8 = 9;
pub const RM_HEAP_ID: u8 = 10;
pub const RM_BTREE_ID: u8 = 11;
pub const RM_REPLORIGIN_ID: u8 = 19;
pub const RM_LOGICALMSG_ID: u8 = 21;

View File

@@ -1,10 +1,10 @@
use std::ffi::{CStr, CString};
use std::ffi::CStr;
use bytes::{Bytes, BytesMut};
use crc32c::crc32c_append;
use utils::lsn::Lsn;
use super::bindings::{RmgrId, XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC};
use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC};
use super::xlog_utils::{
XlLogicalMessage, XLOG_RECORD_CRC_OFFS, XLOG_SIZE_OF_XLOG_RECORD, XLP_BKP_REMOVABLE,
XLP_FIRST_IS_CONTRECORD,
@@ -16,65 +16,11 @@ use crate::pg_constants::{
};
use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
/// A WAL record payload. Will be prefixed by an XLogRecord header when encoded.
pub struct Record {
pub rmid: RmgrId,
pub info: u8,
pub data: Bytes,
}
impl Record {
/// Encodes the WAL record including an XLogRecord header. prev_lsn is the start position of
/// the previous record in the WAL -- this is ignored by the Safekeeper, but not Postgres.
pub fn encode(&self, prev_lsn: Lsn) -> Bytes {
// Prefix data with block ID and length.
let data_header = Bytes::from(match self.data.len() {
0 => vec![],
1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, self.data.len() as u8],
256.. => {
let len_bytes = (self.data.len() as u32).to_le_bytes();
[&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat()
}
});
// Construct the WAL record header.
let mut header = XLogRecord {
xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + self.data.len()) as u32,
xl_xid: 0,
xl_prev: prev_lsn.into(),
xl_info: self.info,
xl_rmid: self.rmid,
__bindgen_padding_0: [0; 2],
xl_crc: 0, // see below
};
// Compute the CRC checksum for the data, and the header up to the CRC field.
let mut crc = 0;
crc = crc32c_append(crc, &data_header);
crc = crc32c_append(crc, &self.data);
crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]);
header.xl_crc = crc;
// Encode the final header and record.
let header = header.encode().unwrap();
[header, data_header, self.data.clone()].concat().into()
}
}
/// Generates WAL record payloads.
///
/// TODO: currently only provides LogicalMessageGenerator for trivial noop messages. Add a generator
/// that creates a table and inserts rows.
pub trait RecordGenerator: Iterator<Item = Record> {}
impl<I: Iterator<Item = Record>> RecordGenerator for I {}
/// Generates binary WAL for use in tests and benchmarks. The provided record generator constructs
/// the WAL records. It is used as an iterator which yields encoded bytes for a single WAL record,
/// including internal page headers if it spans pages. Concatenating the bytes will yield a
/// complete, well-formed WAL, which can be chunked at segment boundaries if desired. Not optimized
/// for performance.
/// Generates binary WAL records for use in tests and benchmarks. Currently only generates logical
/// messages (effectively noops) with a fixed payload. It is used as an iterator which yields
/// encoded bytes for a single WAL record, including internal page headers if it spans pages.
/// Concatenating the bytes will yield a complete, well-formed WAL, which can be chunked at segment
/// boundaries if desired. Not optimized for performance.
///
/// The WAL format is version-dependant (see e.g. `XLOG_PAGE_MAGIC`), so make sure to import this
/// for the appropriate Postgres version (e.g. `postgres_ffi::v17::wal_generator::WalGenerator`).
@@ -85,10 +31,10 @@ impl<I: Iterator<Item = Record>> RecordGenerator for I {}
/// | Segment 1 | Segment 2 | Segment 3 |
/// | Page 1 | Page 2 | Page 3 | Page 4 | Page 5 | Page 6 | Page 7 | Page 8 | Page 9 |
/// | R1 | R2 |R3| R4 | R5 | R6 | R7 | R8 |
///
/// TODO: support generating actual tables and rows.
#[derive(Default)]
pub struct WalGenerator<R: RecordGenerator> {
/// Generates record payloads for the WAL.
pub record_generator: R,
pub struct WalGenerator {
/// Current LSN to append the next record at.
///
/// Callers can modify this (and prev_lsn) to restart generation at a different LSN, but should
@@ -100,35 +46,73 @@ pub struct WalGenerator<R: RecordGenerator> {
pub prev_lsn: Lsn,
}
impl<R: RecordGenerator> WalGenerator<R> {
// Hardcode the sys and timeline ID. We can make them configurable if we care about them.
impl WalGenerator {
// For now, hardcode the message payload.
// TODO: support specifying the payload size.
const PREFIX: &CStr = c"prefix";
const MESSAGE: &[u8] = b"message";
// Hardcode the sys, timeline, and DB IDs. We can make them configurable if we care about them.
const SYS_ID: u64 = 0;
const TIMELINE_ID: u32 = 1;
const DB_ID: u32 = 0;
/// Creates a new WAL generator with the given record generator.
pub fn new(record_generator: R) -> WalGenerator<R> {
Self {
record_generator,
lsn: Lsn(0),
prev_lsn: Lsn(0),
}
/// Creates a new WAL generator, which emits logical message records (noops).
pub fn new() -> Self {
Self::default()
}
/// Appends a record with an arbitrary payload at the current LSN, then increments the LSN.
/// Returns the WAL bytes for the record, including page headers and padding, and the start LSN.
fn append_record(&mut self, record: Record) -> (Lsn, Bytes) {
let record = record.encode(self.prev_lsn);
let record = Self::insert_pages(record, self.lsn);
let record = Self::pad_record(record, self.lsn);
let lsn = self.lsn;
self.prev_lsn = self.lsn;
self.lsn += record.len() as u64;
(lsn, record)
/// Encodes a logical message (basically a noop), with the given prefix and message.
pub(crate) fn encode_logical_message(prefix: &CStr, message: &[u8]) -> Bytes {
let prefix = prefix.to_bytes_with_nul();
let header = XlLogicalMessage {
db_id: Self::DB_ID,
transactional: 0,
prefix_size: prefix.len() as u64,
message_size: message.len() as u64,
};
[&header.encode(), prefix, message].concat().into()
}
/// Inserts page headers on 8KB page boundaries. Takes the current LSN position where the record
/// Encode a WAL record with the given payload data (e.g. a logical message).
pub(crate) fn encode_record(data: Bytes, rmid: u8, info: u8, prev_lsn: Lsn) -> Bytes {
// Prefix data with block ID and length.
let data_header = Bytes::from(match data.len() {
0 => vec![],
1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, data.len() as u8],
256.. => {
let len_bytes = (data.len() as u32).to_le_bytes();
[&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat()
}
});
// Construct the WAL record header.
let mut header = XLogRecord {
xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + data.len()) as u32,
xl_xid: 0,
xl_prev: prev_lsn.into(),
xl_info: info,
xl_rmid: rmid,
__bindgen_padding_0: [0; 2],
xl_crc: 0, // see below
};
// Compute the CRC checksum for the data, and the header up to the CRC field.
let mut crc = 0;
crc = crc32c_append(crc, &data_header);
crc = crc32c_append(crc, &data);
crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]);
header.xl_crc = crc;
// Encode the final header and record.
let header = header.encode().unwrap();
[header, data_header, data].concat().into()
}
/// Injects page headers on 8KB page boundaries. Takes the current LSN position where the record
/// is to be appended.
fn insert_pages(record: Bytes, mut lsn: Lsn) -> Bytes {
fn encode_pages(record: Bytes, mut lsn: Lsn) -> Bytes {
// Fast path: record fits in current page, and the page already has a header.
if lsn.remaining_in_block() as usize >= record.len() && lsn.block_offset() > 0 {
return record;
@@ -189,71 +173,31 @@ impl<R: RecordGenerator> WalGenerator<R> {
}
[record, Bytes::from(vec![0; padding])].concat().into()
}
/// Generates a record with an arbitrary payload at the current LSN, then increments the LSN.
pub fn generate_record(&mut self, data: Bytes, rmid: u8, info: u8) -> Bytes {
let record = Self::encode_record(data, rmid, info, self.prev_lsn);
let record = Self::encode_pages(record, self.lsn);
let record = Self::pad_record(record, self.lsn);
self.prev_lsn = self.lsn;
self.lsn += record.len() as u64;
record
}
/// Generates a logical message at the current LSN. Can be used to construct arbitrary messages.
pub fn generate_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> Bytes {
let data = Self::encode_logical_message(prefix, message);
self.generate_record(data, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE)
}
}
/// Generates WAL records as an iterator.
impl<R: RecordGenerator> Iterator for WalGenerator<R> {
/// Generate WAL records as an iterator.
impl Iterator for WalGenerator {
type Item = (Lsn, Bytes);
fn next(&mut self) -> Option<Self::Item> {
let record = self.record_generator.next()?;
Some(self.append_record(record))
}
}
/// Generates logical message records (effectively noops) with a fixed message.
pub struct LogicalMessageGenerator {
prefix: CString,
message: Vec<u8>,
}
impl LogicalMessageGenerator {
const DB_ID: u32 = 0; // hardcoded for now
const RM_ID: RmgrId = RM_LOGICALMSG_ID;
const INFO: u8 = XLOG_LOGICAL_MESSAGE;
/// Creates a new LogicalMessageGenerator.
pub fn new(prefix: &CStr, message: &[u8]) -> Self {
Self {
prefix: prefix.to_owned(),
message: message.to_owned(),
}
}
/// Encodes a logical message.
fn encode(prefix: &CStr, message: &[u8]) -> Bytes {
let prefix = prefix.to_bytes_with_nul();
let header = XlLogicalMessage {
db_id: Self::DB_ID,
transactional: 0,
prefix_size: prefix.len() as u64,
message_size: message.len() as u64,
};
[&header.encode(), prefix, message].concat().into()
}
}
impl Iterator for LogicalMessageGenerator {
type Item = Record;
fn next(&mut self) -> Option<Self::Item> {
Some(Record {
rmid: Self::RM_ID,
info: Self::INFO,
data: Self::encode(&self.prefix, &self.message),
})
}
}
impl WalGenerator<LogicalMessageGenerator> {
/// Convenience method for appending a WAL record with an arbitrary logical message at the
/// current WAL LSN position. Returns the start LSN and resulting WAL bytes.
pub fn append_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> (Lsn, Bytes) {
let record = Record {
rmid: LogicalMessageGenerator::RM_ID,
info: LogicalMessageGenerator::INFO,
data: LogicalMessageGenerator::encode(prefix, message),
};
self.append_record(record)
let lsn = self.lsn;
let record = self.generate_logical_message(Self::PREFIX, Self::MESSAGE);
Some((lsn, record))
}
}

View File

@@ -12,9 +12,9 @@ use super::bindings::{
CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz,
XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
};
use super::wal_generator::LogicalMessageGenerator;
use super::wal_generator::WalGenerator;
use super::PG_MAJORVERSION;
use crate::pg_constants;
use crate::pg_constants::{self, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE};
use crate::PG_TLI;
use crate::{uint32, uint64, Oid};
use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
@@ -493,10 +493,12 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Bytes {
// This function can take untrusted input, so discard any NUL bytes in the prefix string.
let prefix = CString::new(prefix.replace('\0', "")).expect("no NULs");
let message = message.as_bytes();
LogicalMessageGenerator::new(&prefix, message)
.next()
.unwrap()
.encode(Lsn(0))
WalGenerator::encode_record(
WalGenerator::encode_logical_message(&prefix, message),
RM_LOGICALMSG_ID,
XLOG_LOGICAL_MESSAGE,
Lsn(0),
)
}
#[cfg(test)]

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -euxo pipefail
@@ -6,44 +6,9 @@ PG_BIN=$1
WAL_PATH=$2
DATA_DIR=$3
PORT=$4
PG_VERSION=$5
SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
# The way that initdb is invoked must match how the pageserver runs initdb.
function initdb_with_args {
local cmd=(
"$PG_BIN"/initdb
-E utf8
-U cloud_admin
-D "$DATA_DIR"
--locale 'C.UTF-8'
--lc-collate 'C.UTF-8'
--lc-ctype 'C.UTF-8'
--lc-messages 'C.UTF-8'
--lc-monetary 'C.UTF-8'
--lc-numeric 'C.UTF-8'
--lc-time 'C.UTF-8'
--sysid="$SYSID"
)
case "$PG_VERSION" in
14)
# Postgres 14 and below didn't support --locale-provider
;;
15 | 16)
cmd+=(--locale-provider 'libc')
;;
*)
# Postgres 17 added the builtin provider
cmd+=(--locale-provider 'builtin')
;;
esac
eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}"
}
rm -fr "$DATA_DIR"
initdb_with_args
env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)

View File

@@ -40,11 +40,6 @@ pub enum Scope {
/// Allows access to storage controller APIs used by the scrubber, to interrogate the state
/// of a tenant & post scrub results.
Scrubber,
/// This scope is used for communication with other storage controller instances.
/// At the time of writing, this is only used for the step down request.
#[serde(rename = "controller_peer")]
ControllerPeer,
}
/// JWT payload. See docs/authentication.md for the format

View File

@@ -5,11 +5,12 @@ edition.workspace = true
license.workspace = true
[features]
testing = ["pageserver_api/testing"]
testing = []
[dependencies]
anyhow.workspace = true
bytes.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
postgres_ffi.workspace = true
serde.workspace = true

View File

@@ -2,13 +2,15 @@
//! raw bytes which represent a raw Postgres WAL record.
use crate::models::*;
use crate::serialized_batch::SerializedValueBatch;
use bytes::{Buf, Bytes};
use bytes::{Buf, Bytes, BytesMut};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::pg_constants;
use pageserver_api::value::Value;
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
use postgres_ffi::walrecord::*;
use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ};
use utils::lsn::Lsn;
impl InterpretedWalRecord {
@@ -19,12 +21,11 @@ impl InterpretedWalRecord {
pub fn from_bytes_filtered(
buf: Bytes,
shard: &ShardIdentity,
record_end_lsn: Lsn,
lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<InterpretedWalRecord> {
let mut decoded = DecodedWALRecord::default();
decode_wal_record(buf, &mut decoded, pg_version)?;
let xid = decoded.xl_xid;
let flush_uncommitted = if decoded.is_dbase_create_copy(pg_version) {
FlushUncommittedRecords::Yes
@@ -32,20 +33,96 @@ impl InterpretedWalRecord {
FlushUncommittedRecords::No
};
let metadata_record = MetadataRecord::from_decoded(&decoded, record_end_lsn, pg_version)?;
let batch = SerializedValueBatch::from_decoded_filtered(
decoded,
shard,
record_end_lsn,
pg_version,
)?;
let metadata_record = MetadataRecord::from_decoded(&decoded, lsn, pg_version)?;
let mut blocks = Vec::default();
for blk in decoded.blocks.iter() {
let rel = RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum,
};
let key = rel_block_to_key(rel, blk.blkno);
if !key.is_valid_key_on_write_path() {
anyhow::bail!("Unsupported key decoded at LSN {}: {}", lsn, key);
}
let key_is_local = shard.is_key_local(&key);
tracing::debug!(
lsn=%lsn,
key=%key,
"ingest: shard decision {}",
if !key_is_local { "drop" } else { "keep" },
);
if !key_is_local {
if shard.is_shard_zero() {
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
// its blkno in case it implicitly extends a relation.
blocks.push((key.to_compact(), None));
}
continue;
}
// Instead of storing full-page-image WAL record,
// it is better to store extracted image: we can skip wal-redo
// in this case. Also some FPI records may contain multiple (up to 32) pages,
// so them have to be copied multiple times.
//
let value = if blk.apply_image
&& blk.has_image
&& decoded.xl_rmid == pg_constants::RM_XLOG_ID
&& (decoded.xl_info == pg_constants::XLOG_FPI
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
// compression of WAL is not yet supported: fall back to storing the original WAL record
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)
// do not materialize null pages because them most likely be soon replaced with real data
&& blk.bimg_len != 0
{
// Extract page image from FPI record
let img_len = blk.bimg_len as usize;
let img_offs = blk.bimg_offset as usize;
let mut image = BytesMut::with_capacity(BLCKSZ as usize);
// TODO(vlad): skip the copy
image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
if blk.hole_length != 0 {
let tail = image.split_off(blk.hole_offset as usize);
image.resize(image.len() + blk.hole_length as usize, 0u8);
image.unsplit(tail);
}
//
// Match the logic of XLogReadBufferForRedoExtended:
// The page may be uninitialized. If so, we can't set the LSN because
// that would corrupt the page.
//
if !page_is_new(&image) {
page_set_lsn(&mut image, lsn)
}
assert_eq!(image.len(), BLCKSZ as usize);
Value::Image(image.freeze())
} else {
Value::WalRecord(NeonWalRecord::Postgres {
will_init: blk.will_init || blk.apply_image,
rec: decoded.record.clone(),
})
};
blocks.push((key.to_compact(), Some(value)));
}
Ok(InterpretedWalRecord {
metadata_record,
batch,
end_lsn: record_end_lsn,
blocks,
lsn,
flush_uncommitted,
xid,
xid: decoded.xl_xid,
})
}
}
@@ -53,7 +130,7 @@ impl InterpretedWalRecord {
impl MetadataRecord {
fn from_decoded(
decoded: &DecodedWALRecord,
record_end_lsn: Lsn,
lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<Option<MetadataRecord>> {
// Note: this doesn't actually copy the bytes since
@@ -74,7 +151,7 @@ impl MetadataRecord {
Ok(None)
}
pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version),
pg_constants::RM_XACT_ID => Self::decode_xact_record(&mut buf, decoded, record_end_lsn),
pg_constants::RM_XACT_ID => Self::decode_xact_record(&mut buf, decoded, lsn),
pg_constants::RM_MULTIXACT_ID => {
Self::decode_multixact_record(&mut buf, decoded, pg_version)
}
@@ -86,15 +163,35 @@ impl MetadataRecord {
//
// Alternatively, one can make the checkpoint part of the subscription protocol
// to the pageserver. This should work fine, but can be done at a later point.
pg_constants::RM_XLOG_ID => Self::decode_xlog_record(&mut buf, decoded, record_end_lsn),
pg_constants::RM_XLOG_ID => Self::decode_xlog_record(&mut buf, decoded, lsn),
pg_constants::RM_LOGICALMSG_ID => {
Self::decode_logical_message_record(&mut buf, decoded)
}
pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded),
pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded),
_unexpected => {
pg_constants::RM_BTREE_ID => {
// No special handling required for these record types.
// We just ingest the blocks that come with it.
Ok(None)
}
unexpected => {
// TODO: consider failing here instead of blindly doing something without
// understanding the protocol
use once_cell::sync::Lazy;
use std::sync::Mutex;
use std::time::Duration;
use utils::rate_limit::RateLimit;
static LOGGED: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
let mut rate_limit = LOGGED.try_lock().unwrap();
rate_limit.call(|| {
tracing::warn!(
"Unexpected resource manager id in PG WAL record at LSN {}: {}",
lsn,
unexpected
);
});
Ok(None)
}
}

View File

@@ -1,3 +1,2 @@
pub mod decoder;
pub mod models;
pub mod serialized_batch;

View File

@@ -2,8 +2,7 @@
//! ready for the pageserver to interpret. They are derived from the original
//! WAL records, so that each struct corresponds closely to one WAL record of
//! a specific kind. They contain the same information as the original WAL records,
//! but the values are already serialized in a [`SerializedValueBatch`], which
//! is the format that the pageserver is expecting them in.
//! just decoded into structs and fields for easier access.
//!
//! The ingestion code uses these structs to help with parsing the WAL records,
//! and it splits them into a stream of modifications to the key-value pairs that
@@ -26,7 +25,9 @@
//! |--> write to KV store within the pageserver
use bytes::Bytes;
use pageserver_api::key::CompactKey;
use pageserver_api::reltag::{RelTag, SlruKind};
use pageserver_api::value::Value;
use postgres_ffi::walrecord::{
XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet,
XlSmgrTruncate, XlXactParsedRecord,
@@ -34,8 +35,6 @@ use postgres_ffi::walrecord::{
use postgres_ffi::{Oid, TransactionId};
use utils::lsn::Lsn;
use crate::serialized_batch::SerializedValueBatch;
pub enum FlushUncommittedRecords {
Yes,
No,
@@ -46,11 +45,12 @@ pub struct InterpretedWalRecord {
/// Optional metadata record - may cause writes to metadata keys
/// in the storage engine
pub metadata_record: Option<MetadataRecord>,
/// A pre-serialized batch along with the required metadata for ingestion
/// by the pageserver
pub batch: SerializedValueBatch,
/// Images or deltas for blocks modified in the original WAL record.
/// The [`Value`] is optional to avoid sending superfluous data to
/// shard 0 for relation size tracking.
pub blocks: Vec<(CompactKey, Option<Value>)>,
/// Byte offset within WAL for the end of the original PG WAL record
pub end_lsn: Lsn,
pub lsn: Lsn,
/// Whether to flush all uncommitted modifications to the storage engine
/// before ingesting this record. This is currently only used for legacy PG
/// database creations which read pages from a template database. Such WAL

View File

@@ -1,862 +0,0 @@
//! This module implements batch type for serialized [`pageserver_api::value::Value`]
//! instances. Each batch contains a raw buffer (serialized values)
//! and a list of metadata for each (key, LSN) tuple present in the batch.
//!
//! Such batches are created from decoded PG wal records and ingested
//! by the pageserver by writing directly to the ephemeral file.
use std::collections::BTreeSet;
use bytes::{Bytes, BytesMut};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::{key::CompactKey, value::Value};
use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord};
use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ};
use utils::bin_ser::BeSer;
use utils::lsn::Lsn;
use pageserver_api::key::Key;
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
/// Accompanying metadata for the batch
/// A value may be serialized and stored into the batch or just "observed".
/// Shard 0 currently "observes" all values in order to accurately track
/// relation sizes. In the case of "observed" values, we only need to know
/// the key and LSN, so two types of metadata are supported to save on network
/// bandwidth.
pub enum ValueMeta {
Serialized(SerializedValueMeta),
Observed(ObservedValueMeta),
}
impl ValueMeta {
pub fn key(&self) -> CompactKey {
match self {
Self::Serialized(ser) => ser.key,
Self::Observed(obs) => obs.key,
}
}
pub fn lsn(&self) -> Lsn {
match self {
Self::Serialized(ser) => ser.lsn,
Self::Observed(obs) => obs.lsn,
}
}
}
/// Wrapper around [`ValueMeta`] that implements ordering by
/// (key, LSN) tuples
struct OrderedValueMeta(ValueMeta);
impl Ord for OrderedValueMeta {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
(self.0.key(), self.0.lsn()).cmp(&(other.0.key(), other.0.lsn()))
}
}
impl PartialOrd for OrderedValueMeta {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for OrderedValueMeta {
fn eq(&self, other: &Self) -> bool {
(self.0.key(), self.0.lsn()) == (other.0.key(), other.0.lsn())
}
}
impl Eq for OrderedValueMeta {}
/// Metadata for a [`Value`] serialized into the batch.
pub struct SerializedValueMeta {
pub key: CompactKey,
pub lsn: Lsn,
/// Starting offset of the value for the (key, LSN) tuple
/// in [`SerializedValueBatch::raw`]
pub batch_offset: u64,
pub len: usize,
pub will_init: bool,
}
/// Metadata for a [`Value`] observed by the batch
pub struct ObservedValueMeta {
pub key: CompactKey,
pub lsn: Lsn,
}
/// Batch of serialized [`Value`]s.
pub struct SerializedValueBatch {
/// [`Value`]s serialized in EphemeralFile's native format,
/// ready for disk write by the pageserver
pub raw: Vec<u8>,
/// Metadata to make sense of the bytes in [`Self::raw`]
/// and represent "observed" values.
///
/// Invariant: Metadata entries for any given key are ordered
/// by LSN. Note that entries for a key do not have to be contiguous.
pub metadata: Vec<ValueMeta>,
/// The highest LSN of any value in the batch
pub max_lsn: Lsn,
/// Number of values encoded by [`Self::raw`]
pub len: usize,
}
impl Default for SerializedValueBatch {
fn default() -> Self {
Self {
raw: Default::default(),
metadata: Default::default(),
max_lsn: Lsn(0),
len: 0,
}
}
}
impl SerializedValueBatch {
/// Build a batch of serialized values from a decoded PG WAL record
///
/// The batch will only contain values for keys targeting the specifiec
/// shard. Shard 0 is a special case, where any keys that don't belong to
/// it are "observed" by the batch (i.e. present in [`SerializedValueBatch::metadata`],
/// but absent from the raw buffer [`SerializedValueBatch::raw`]).
pub(crate) fn from_decoded_filtered(
decoded: DecodedWALRecord,
shard: &ShardIdentity,
record_end_lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<SerializedValueBatch> {
// First determine how big the buffer needs to be and allocate it up-front.
// This duplicates some of the work below, but it's empirically much faster.
let estimated_buffer_size = Self::estimate_buffer_size(&decoded, shard, pg_version);
let mut buf = Vec::<u8>::with_capacity(estimated_buffer_size);
let mut metadata: Vec<ValueMeta> = Vec::with_capacity(decoded.blocks.len());
let mut max_lsn: Lsn = Lsn(0);
let mut len: usize = 0;
for blk in decoded.blocks.iter() {
let relative_off = buf.len() as u64;
let rel = RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum,
};
let key = rel_block_to_key(rel, blk.blkno);
if !key.is_valid_key_on_write_path() {
anyhow::bail!("Unsupported key decoded at LSN {}: {}", record_end_lsn, key);
}
let key_is_local = shard.is_key_local(&key);
tracing::debug!(
lsn=%record_end_lsn,
key=%key,
"ingest: shard decision {}",
if !key_is_local { "drop" } else { "keep" },
);
if !key_is_local {
if shard.is_shard_zero() {
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
// its blkno in case it implicitly extends a relation.
metadata.push(ValueMeta::Observed(ObservedValueMeta {
key: key.to_compact(),
lsn: record_end_lsn,
}))
}
continue;
}
// Instead of storing full-page-image WAL record,
// it is better to store extracted image: we can skip wal-redo
// in this case. Also some FPI records may contain multiple (up to 32) pages,
// so them have to be copied multiple times.
//
let val = if Self::block_is_image(&decoded, blk, pg_version) {
// Extract page image from FPI record
let img_len = blk.bimg_len as usize;
let img_offs = blk.bimg_offset as usize;
let mut image = BytesMut::with_capacity(BLCKSZ as usize);
// TODO(vlad): skip the copy
image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
if blk.hole_length != 0 {
let tail = image.split_off(blk.hole_offset as usize);
image.resize(image.len() + blk.hole_length as usize, 0u8);
image.unsplit(tail);
}
//
// Match the logic of XLogReadBufferForRedoExtended:
// The page may be uninitialized. If so, we can't set the LSN because
// that would corrupt the page.
//
if !page_is_new(&image) {
page_set_lsn(&mut image, record_end_lsn)
}
assert_eq!(image.len(), BLCKSZ as usize);
Value::Image(image.freeze())
} else {
Value::WalRecord(NeonWalRecord::Postgres {
will_init: blk.will_init || blk.apply_image,
rec: decoded.record.clone(),
})
};
val.ser_into(&mut buf)
.expect("Writing into in-memory buffer is infallible");
let val_ser_size = buf.len() - relative_off as usize;
metadata.push(ValueMeta::Serialized(SerializedValueMeta {
key: key.to_compact(),
lsn: record_end_lsn,
batch_offset: relative_off,
len: val_ser_size,
will_init: val.will_init(),
}));
max_lsn = std::cmp::max(max_lsn, record_end_lsn);
len += 1;
}
if cfg!(any(debug_assertions, test)) {
let batch = Self {
raw: buf,
metadata,
max_lsn,
len,
};
batch.validate_lsn_order();
return Ok(batch);
}
Ok(Self {
raw: buf,
metadata,
max_lsn,
len,
})
}
/// Look into the decoded PG WAL record and determine
/// roughly how large the buffer for serialized values needs to be.
fn estimate_buffer_size(
decoded: &DecodedWALRecord,
shard: &ShardIdentity,
pg_version: u32,
) -> usize {
let mut estimate: usize = 0;
for blk in decoded.blocks.iter() {
let rel = RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum,
};
let key = rel_block_to_key(rel, blk.blkno);
if !shard.is_key_local(&key) {
continue;
}
if Self::block_is_image(decoded, blk, pg_version) {
// 4 bytes for the Value::Image discriminator
// 8 bytes for encoding the size of the buffer
// BLCKSZ for the raw image
estimate += (4 + 8 + BLCKSZ) as usize;
} else {
// 4 bytes for the Value::WalRecord discriminator
// 4 bytes for the NeonWalRecord::Postgres discriminator
// 1 bytes for NeonWalRecord::Postgres::will_init
// 8 bytes for encoding the size of the buffer
// length of the raw record
estimate += 8 + 1 + 8 + decoded.record.len();
}
}
estimate
}
fn block_is_image(decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, pg_version: u32) -> bool {
blk.apply_image
&& blk.has_image
&& decoded.xl_rmid == pg_constants::RM_XLOG_ID
&& (decoded.xl_info == pg_constants::XLOG_FPI
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
// compression of WAL is not yet supported: fall back to storing the original WAL record
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)
// do not materialize null pages because them most likely be soon replaced with real data
&& blk.bimg_len != 0
}
/// Encode a list of values and metadata into a serialized batch
///
/// This is used by the pageserver ingest code to conveniently generate
/// batches for metadata writes.
pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self {
// Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
// [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
let buffer_size = batch.iter().map(|i| i.2).sum::<usize>();
let mut buf = Vec::<u8>::with_capacity(buffer_size);
let mut metadata: Vec<ValueMeta> = Vec::with_capacity(batch.len());
let mut max_lsn: Lsn = Lsn(0);
let len = batch.len();
for (key, lsn, val_ser_size, val) in batch {
let relative_off = buf.len() as u64;
val.ser_into(&mut buf)
.expect("Writing into in-memory buffer is infallible");
metadata.push(ValueMeta::Serialized(SerializedValueMeta {
key,
lsn,
batch_offset: relative_off,
len: val_ser_size,
will_init: val.will_init(),
}));
max_lsn = std::cmp::max(max_lsn, lsn);
}
// Assert that we didn't do any extra allocations while building buffer.
debug_assert!(buf.len() <= buffer_size);
if cfg!(any(debug_assertions, test)) {
let batch = Self {
raw: buf,
metadata,
max_lsn,
len,
};
batch.validate_lsn_order();
return batch;
}
Self {
raw: buf,
metadata,
max_lsn,
len,
}
}
/// Add one value to the batch
///
/// This is used by the pageserver ingest code to include metadata block
/// updates for a single key.
pub fn put(&mut self, key: CompactKey, value: Value, lsn: Lsn) {
let relative_off = self.raw.len() as u64;
value.ser_into(&mut self.raw).unwrap();
let val_ser_size = self.raw.len() - relative_off as usize;
self.metadata
.push(ValueMeta::Serialized(SerializedValueMeta {
key,
lsn,
batch_offset: relative_off,
len: val_ser_size,
will_init: value.will_init(),
}));
self.max_lsn = std::cmp::max(self.max_lsn, lsn);
self.len += 1;
if cfg!(any(debug_assertions, test)) {
self.validate_lsn_order();
}
}
/// Extend with the contents of another batch
///
/// One batch is generated for each decoded PG WAL record.
/// They are then merged to accumulate reasonably sized writes.
pub fn extend(&mut self, mut other: SerializedValueBatch) {
let extend_batch_start_offset = self.raw.len() as u64;
self.raw.extend(other.raw);
// Shift the offsets in the batch we are extending with
other.metadata.iter_mut().for_each(|meta| match meta {
ValueMeta::Serialized(ser) => {
ser.batch_offset += extend_batch_start_offset;
if cfg!(debug_assertions) {
let value_end = ser.batch_offset + ser.len as u64;
assert!((value_end as usize) <= self.raw.len());
}
}
ValueMeta::Observed(_) => {}
});
self.metadata.extend(other.metadata);
self.max_lsn = std::cmp::max(self.max_lsn, other.max_lsn);
self.len += other.len;
if cfg!(any(debug_assertions, test)) {
self.validate_lsn_order();
}
}
/// Add zero images for the (key, LSN) tuples specified
///
/// PG versions below 16 do not zero out pages before extending
/// a relation and may leave gaps. Such gaps need to be identified
/// by the pageserver ingest logic and get patched up here.
///
/// Note that this function does not validate that the gaps have been
/// identified correctly (it does not know relation sizes), so it's up
/// to the call-site to do it properly.
pub fn zero_gaps(&mut self, gaps: Vec<(KeySpace, Lsn)>) {
// Implementation note:
//
// Values within [`SerializedValueBatch::raw`] do not have any ordering requirements,
// but the metadata entries should be ordered properly (see
// [`SerializedValueBatch::metadata`]).
//
// Exploiting this observation we do:
// 1. Drain all the metadata entries into an ordered set.
// The use of a BTreeSet keyed by (Key, Lsn) relies on the observation that Postgres never
// includes more than one update to the same block in the same WAL record.
// 2. For each (key, LSN) gap tuple, append a zero image to the raw buffer
// and add an index entry to the ordered metadata set.
// 3. Drain the ordered set back into a metadata vector
let mut ordered_metas = self
.metadata
.drain(..)
.map(OrderedValueMeta)
.collect::<BTreeSet<_>>();
for (keyspace, lsn) in gaps {
self.max_lsn = std::cmp::max(self.max_lsn, lsn);
for gap_range in keyspace.ranges {
let mut key = gap_range.start;
while key != gap_range.end {
let relative_off = self.raw.len() as u64;
// TODO(vlad): Can we be cheeky and write only one zero image, and
// make all index entries requiring a zero page point to it?
// Alternatively, we can change the index entry format to represent zero pages
// without writing them at all.
Value::Image(ZERO_PAGE.clone())
.ser_into(&mut self.raw)
.unwrap();
let val_ser_size = self.raw.len() - relative_off as usize;
ordered_metas.insert(OrderedValueMeta(ValueMeta::Serialized(
SerializedValueMeta {
key: key.to_compact(),
lsn,
batch_offset: relative_off,
len: val_ser_size,
will_init: true,
},
)));
self.len += 1;
key = key.next();
}
}
}
self.metadata = ordered_metas.into_iter().map(|ord| ord.0).collect();
if cfg!(any(debug_assertions, test)) {
self.validate_lsn_order();
}
}
/// Checks if the batch is empty
///
/// A batch is empty when it contains no serialized values.
/// Note that it may still contain observed values.
pub fn is_empty(&self) -> bool {
let empty = self.raw.is_empty();
if cfg!(debug_assertions) && empty {
assert!(self
.metadata
.iter()
.all(|meta| matches!(meta, ValueMeta::Observed(_))));
}
empty
}
/// Returns the number of values serialized in the batch
pub fn len(&self) -> usize {
self.len
}
/// Returns the size of the buffer wrapped by the batch
pub fn buffer_size(&self) -> usize {
self.raw.len()
}
pub fn updates_key(&self, key: &Key) -> bool {
self.metadata.iter().any(|meta| match meta {
ValueMeta::Serialized(ser) => key.to_compact() == ser.key,
ValueMeta::Observed(_) => false,
})
}
pub fn validate_lsn_order(&self) {
use std::collections::HashMap;
let mut last_seen_lsn_per_key: HashMap<CompactKey, Lsn> = HashMap::default();
for meta in self.metadata.iter() {
let lsn = meta.lsn();
let key = meta.key();
if let Some(prev_lsn) = last_seen_lsn_per_key.insert(key, lsn) {
assert!(
lsn >= prev_lsn,
"Ordering violated by {}: {} < {}",
Key::from_compact(key),
lsn,
prev_lsn
);
}
}
}
}
#[cfg(all(test, feature = "testing"))]
mod tests {
use super::*;
fn validate_batch(
batch: &SerializedValueBatch,
values: &[(CompactKey, Lsn, usize, Value)],
gaps: Option<&Vec<(KeySpace, Lsn)>>,
) {
// Invariant 1: The metadata for a given entry in the batch
// is correct and can be used to deserialize back to the original value.
for (key, lsn, size, value) in values.iter() {
let meta = batch
.metadata
.iter()
.find(|meta| (meta.key(), meta.lsn()) == (*key, *lsn))
.unwrap();
let meta = match meta {
ValueMeta::Serialized(ser) => ser,
ValueMeta::Observed(_) => unreachable!(),
};
assert_eq!(meta.len, *size);
assert_eq!(meta.will_init, value.will_init());
let start = meta.batch_offset as usize;
let end = meta.batch_offset as usize + meta.len;
let value_from_batch = Value::des(&batch.raw[start..end]).unwrap();
assert_eq!(&value_from_batch, value);
}
let mut expected_buffer_size: usize = values.iter().map(|(_, _, size, _)| size).sum();
let mut gap_pages_count: usize = 0;
// Invariant 2: Zero pages were added for identified gaps and their metadata
// is correct.
if let Some(gaps) = gaps {
for (gap_keyspace, lsn) in gaps {
for gap_range in &gap_keyspace.ranges {
let mut gap_key = gap_range.start;
while gap_key != gap_range.end {
let meta = batch
.metadata
.iter()
.find(|meta| (meta.key(), meta.lsn()) == (gap_key.to_compact(), *lsn))
.unwrap();
let meta = match meta {
ValueMeta::Serialized(ser) => ser,
ValueMeta::Observed(_) => unreachable!(),
};
let zero_value = Value::Image(ZERO_PAGE.clone());
let zero_value_size = zero_value.serialized_size().unwrap() as usize;
assert_eq!(meta.len, zero_value_size);
assert_eq!(meta.will_init, zero_value.will_init());
let start = meta.batch_offset as usize;
let end = meta.batch_offset as usize + meta.len;
let value_from_batch = Value::des(&batch.raw[start..end]).unwrap();
assert_eq!(value_from_batch, zero_value);
gap_pages_count += 1;
expected_buffer_size += zero_value_size;
gap_key = gap_key.next();
}
}
}
}
// Invariant 3: The length of the batch is equal to the number
// of values inserted, plus the number of gap pages. This extends
// to the raw buffer size.
assert_eq!(batch.len(), values.len() + gap_pages_count);
assert_eq!(expected_buffer_size, batch.buffer_size());
// Invariant 4: Metadata entries for any given key are sorted in LSN order.
batch.validate_lsn_order();
}
#[test]
fn test_creation_from_values() {
const LSN: Lsn = Lsn(0x10);
let key = Key::from_hex("110000000033333333444444445500000001").unwrap();
let values = vec![
(
key.to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("foo")),
),
(
key.next().to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("bar")),
),
(
key.to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("baz")),
),
(
key.next().next().to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("taz")),
),
];
let values = values
.into_iter()
.map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
.collect::<Vec<_>>();
let batch = SerializedValueBatch::from_values(values.clone());
validate_batch(&batch, &values, None);
assert!(!batch.is_empty());
}
#[test]
fn test_put() {
const LSN: Lsn = Lsn(0x10);
let key = Key::from_hex("110000000033333333444444445500000001").unwrap();
let values = vec![
(
key.to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("foo")),
),
(
key.next().to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("bar")),
),
];
let mut values = values
.into_iter()
.map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
.collect::<Vec<_>>();
let mut batch = SerializedValueBatch::from_values(values.clone());
validate_batch(&batch, &values, None);
let value = (
key.to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("baz")),
);
let serialized_size = value.2.serialized_size().unwrap() as usize;
let value = (value.0, value.1, serialized_size, value.2);
values.push(value.clone());
batch.put(value.0, value.3, value.1);
validate_batch(&batch, &values, None);
let value = (
key.next().next().to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("taz")),
);
let serialized_size = value.2.serialized_size().unwrap() as usize;
let value = (value.0, value.1, serialized_size, value.2);
values.push(value.clone());
batch.put(value.0, value.3, value.1);
validate_batch(&batch, &values, None);
}
#[test]
fn test_extension() {
const LSN: Lsn = Lsn(0x10);
let key = Key::from_hex("110000000033333333444444445500000001").unwrap();
let values = vec![
(
key.to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("foo")),
),
(
key.next().to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("bar")),
),
(
key.next().next().to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("taz")),
),
];
let mut values = values
.into_iter()
.map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
.collect::<Vec<_>>();
let mut batch = SerializedValueBatch::from_values(values.clone());
let other_values = vec![
(
key.to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("foo")),
),
(
key.next().to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("bar")),
),
(
key.next().next().to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("taz")),
),
];
let other_values = other_values
.into_iter()
.map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
.collect::<Vec<_>>();
let other_batch = SerializedValueBatch::from_values(other_values.clone());
values.extend(other_values);
batch.extend(other_batch);
validate_batch(&batch, &values, None);
}
#[test]
fn test_gap_zeroing() {
const LSN: Lsn = Lsn(0x10);
let rel_foo_base_key = Key::from_hex("110000000033333333444444445500000001").unwrap();
let rel_bar_base_key = {
let mut key = rel_foo_base_key;
key.field4 += 1;
key
};
let values = vec![
(
rel_foo_base_key.to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("foo1")),
),
(
rel_foo_base_key.add(1).to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("foo2")),
),
(
rel_foo_base_key.add(5).to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("foo3")),
),
(
rel_foo_base_key.add(1).to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("foo4")),
),
(
rel_foo_base_key.add(10).to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("foo5")),
),
(
rel_foo_base_key.add(11).to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("foo6")),
),
(
rel_foo_base_key.add(12).to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("foo7")),
),
(
rel_bar_base_key.to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("bar1")),
),
(
rel_bar_base_key.add(4).to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("bar2")),
),
];
let values = values
.into_iter()
.map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
.collect::<Vec<_>>();
let mut batch = SerializedValueBatch::from_values(values.clone());
let gaps = vec![
(
KeySpace {
ranges: vec![
rel_foo_base_key.add(2)..rel_foo_base_key.add(5),
rel_bar_base_key.add(1)..rel_bar_base_key.add(4),
],
},
LSN,
),
(
KeySpace {
ranges: vec![rel_foo_base_key.add(6)..rel_foo_base_key.add(10)],
},
Lsn(LSN.0 + 0x10),
),
];
batch.zero_gaps(gaps.clone());
validate_batch(&batch, &values, Some(&gaps));
}
}

View File

@@ -9,6 +9,7 @@ use pageserver::{
l0_flush::{L0FlushConfig, L0FlushGlobalState},
page_cache,
task_mgr::TaskKind,
tenant::storage_layer::inmemory_layer::SerializedBatch,
tenant::storage_layer::InMemoryLayer,
virtual_file,
};
@@ -17,7 +18,6 @@ use utils::{
bin_ser::BeSer,
id::{TenantId, TimelineId},
};
use wal_decoder::serialized_batch::SerializedValueBatch;
// A very cheap hash for generating non-sequential keys.
fn murmurhash32(mut h: u32) -> u32 {
@@ -102,13 +102,13 @@ async fn ingest(
batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
if batch.len() >= BATCH_SIZE {
let this_batch = std::mem::take(&mut batch);
let serialized = SerializedValueBatch::from_values(this_batch);
let serialized = SerializedBatch::from_values(this_batch).unwrap();
layer.put_batch(serialized, &ctx).await?;
}
}
if !batch.is_empty() {
let this_batch = std::mem::take(&mut batch);
let serialized = SerializedValueBatch::from_values(this_batch);
let serialized = SerializedBatch::from_values(this_batch).unwrap();
layer.put_batch(serialized, &ctx).await?;
}
layer.freeze(lsn + 1).await;

View File

@@ -19,8 +19,7 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
| Scope::SafekeeperData
| Scope::GenerationsApi
| Scope::Infra
| Scope::Scrubber
| Scope::ControllerPeer,
| Scope::Scrubber,
_,
) => Err(AuthError(
format!(

View File

@@ -154,17 +154,13 @@ fn main() -> anyhow::Result<()> {
},
};
if conf.no_sync {
info!("Skipping syncfs on startup");
} else {
let started = Instant::now();
syncfs(dirfd)?;
let elapsed = started.elapsed();
info!(
elapsed_ms = elapsed.as_millis(),
"made tenant directory contents durable"
);
}
let started = Instant::now();
syncfs(dirfd)?;
let elapsed = started.elapsed();
info!(
elapsed_ms = elapsed.as_millis(),
"made tenant directory contents durable"
);
}
// Initialize up failpoints support
@@ -402,7 +398,9 @@ fn start_pageserver(
ControllerUpcallClient::new(conf, &shutdown_pageserver),
conf,
);
deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
if let Some(deletion_workers) = deletion_workers {
deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
}
// Up to this point no significant I/O has been done: this should have been fast. Record
// duration prior to starting I/O intensive phase of startup.

View File

@@ -69,7 +69,6 @@ pub struct PageServerConf {
pub wal_redo_timeout: Duration,
pub superuser: String,
pub locale: String,
pub page_cache_size: usize,
pub max_file_descriptors: usize,
@@ -179,9 +178,6 @@ pub struct PageServerConf {
/// Direct IO settings
pub virtual_file_io_mode: virtual_file::IoMode,
/// Optionally disable disk syncs (unsafe!)
pub no_sync: bool,
}
/// Token for authentication to safekeepers
@@ -302,7 +298,6 @@ impl PageServerConf {
wait_lsn_timeout,
wal_redo_timeout,
superuser,
locale,
page_cache_size,
max_file_descriptors,
pg_distrib_dir,
@@ -337,7 +332,6 @@ impl PageServerConf {
concurrent_tenant_size_logical_size_queries,
virtual_file_io_engine,
tenant_config,
no_sync,
} = config_toml;
let mut conf = PageServerConf {
@@ -350,7 +344,6 @@ impl PageServerConf {
wait_lsn_timeout,
wal_redo_timeout,
superuser,
locale,
page_cache_size,
max_file_descriptors,
http_auth_type,
@@ -416,7 +409,6 @@ impl PageServerConf {
.map(crate::l0_flush::L0FlushConfig::from)
.unwrap_or_default(),
virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
no_sync: no_sync.unwrap_or(false),
};
// ------------------------------------------------------------

View File

@@ -618,11 +618,13 @@ impl DeletionQueue {
/// Caller may use the returned object to construct clients with new_client.
/// Caller should tokio::spawn the background() members of the two worker objects returned:
/// we don't spawn those inside new() so that the caller can use their runtime/spans of choice.
///
/// If remote_storage is None, then the returned workers will also be None.
pub fn new<C>(
remote_storage: GenericRemoteStorage,
controller_upcall_client: Option<C>,
conf: &'static PageServerConf,
) -> (Self, DeletionQueueWorkers<C>)
) -> (Self, Option<DeletionQueueWorkers<C>>)
where
C: ControlPlaneGenerationsApi + Send + Sync,
{
@@ -654,7 +656,7 @@ impl DeletionQueue {
},
cancel: cancel.clone(),
},
DeletionQueueWorkers {
Some(DeletionQueueWorkers {
frontend: ListWriter::new(conf, rx, backend_tx, cancel.clone()),
backend: Validator::new(
conf,
@@ -665,7 +667,7 @@ impl DeletionQueue {
cancel.clone(),
),
executor: Deleter::new(remote_storage, executor_rx, cancel.clone()),
},
}),
)
}
@@ -740,7 +742,9 @@ mod test {
);
tracing::debug!("Spawning worker for new queue queue");
let worker_join = workers.spawn_with(&tokio::runtime::Handle::current());
let worker_join = workers
.unwrap()
.spawn_with(&tokio::runtime::Handle::current());
let old_worker_join = std::mem::replace(&mut self.worker_join, worker_join);
let old_deletion_queue = std::mem::replace(&mut self.deletion_queue, deletion_queue);
@@ -851,6 +855,7 @@ mod test {
harness.conf,
);
let worker = worker.unwrap();
let worker_join = worker.spawn_with(&tokio::runtime::Handle::current());
Ok(TestSetup {

View File

@@ -37,7 +37,6 @@ use pageserver_api::models::TenantShardLocation;
use pageserver_api::models::TenantShardSplitRequest;
use pageserver_api::models::TenantShardSplitResponse;
use pageserver_api::models::TenantSorting;
use pageserver_api::models::TenantState;
use pageserver_api::models::TimelineArchivalConfigRequest;
use pageserver_api::models::TimelineCreateRequestMode;
use pageserver_api::models::TimelinesInfoAndOffloaded;
@@ -81,7 +80,6 @@ use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::timeline::offload::offload_timeline;
use crate::tenant::timeline::offload::OffloadError;
use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::CompactionError;
use crate::tenant::timeline::Timeline;
@@ -296,9 +294,6 @@ impl From<GetActiveTenantError> for ApiError {
GetActiveTenantError::Broken(reason) => {
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
}
GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
ApiError::ShuttingDown
}
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
GetActiveTenantError::NotFound(gte) => gte.into(),
@@ -2002,19 +1997,14 @@ async fn timeline_offload_handler(
"timeline has attached children".into(),
));
}
if let (false, reason) = timeline.can_offload() {
if !timeline.can_offload() {
return Err(ApiError::PreconditionFailed(
format!("Timeline::can_offload() check failed: {}", reason) .into(),
"Timeline::can_offload() returned false".into(),
));
}
offload_timeline(&tenant, &timeline)
.await
.map_err(|e| {
match e {
OffloadError::Cancelled => ApiError::ResourceUnavailable("Timeline shutting down".into()),
_ => ApiError::InternalServerError(anyhow!(e))
}
})?;
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
@@ -2070,7 +2060,6 @@ async fn timeline_checkpoint_handler(
.map_err(|e|
match e {
CompactionError::ShuttingDown => ApiError::ShuttingDown,
CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
CompactionError::Other(e) => ApiError::InternalServerError(e)
}
)?;
@@ -2169,21 +2158,6 @@ async fn timeline_detach_ancestor_handler(
let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
let ctx = &ctx;
// Flush the upload queues of all timelines before detaching ancestor. We do the same thing again
// during shutdown. This early upload ensures the pageserver does not need to upload too many
// things and creates downtime during timeline reloads.
for timeline in tenant.list_timelines() {
timeline
.remote_client
.wait_completion()
.await
.map_err(|e| {
ApiError::PreconditionFailed(format!("cannot drain upload queue: {e}").into())
})?;
}
tracing::info!("all timeline upload queues are drained");
let timeline = tenant.get_timeline(timeline_id, true)?;
let progress = timeline

View File

@@ -1,11 +1,10 @@
//! The Page Service listens for client connections and serves their GetPage@LSN
//! requests.
use anyhow::{bail, Context};
use anyhow::Context;
use async_compression::tokio::write::GzipEncoder;
use bytes::Buf;
use futures::FutureExt;
use itertools::Itertools;
use once_cell::sync::OnceCell;
use pageserver_api::models::TenantState;
use pageserver_api::models::{
@@ -1222,222 +1221,6 @@ impl PageServerHandler {
}
}
/// `basebackup tenant timeline [lsn] [--gzip] [--replica]`
#[derive(Debug, Clone, Eq, PartialEq)]
struct BaseBackupCmd {
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Option<Lsn>,
gzip: bool,
replica: bool,
}
/// `fullbackup tenant timeline [lsn] [prev_lsn]`
#[derive(Debug, Clone, Eq, PartialEq)]
struct FullBackupCmd {
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Option<Lsn>,
prev_lsn: Option<Lsn>,
}
/// `pagestream_v2 tenant timeline`
#[derive(Debug, Clone, Eq, PartialEq)]
struct PageStreamCmd {
tenant_id: TenantId,
timeline_id: TimelineId,
}
/// `lease lsn tenant timeline lsn`
#[derive(Debug, Clone, Eq, PartialEq)]
struct LeaseLsnCmd {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
}
#[derive(Debug, Clone, Eq, PartialEq)]
enum PageServiceCmd {
Set,
PageStream(PageStreamCmd),
BaseBackup(BaseBackupCmd),
FullBackup(FullBackupCmd),
LeaseLsn(LeaseLsnCmd),
}
impl PageStreamCmd {
fn parse(query: &str) -> anyhow::Result<Self> {
let parameters = query.split_whitespace().collect_vec();
if parameters.len() != 2 {
bail!(
"invalid number of parameters for pagestream command: {}",
query
);
}
let tenant_id = TenantId::from_str(parameters[0])
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
let timeline_id = TimelineId::from_str(parameters[1])
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
Ok(Self {
tenant_id,
timeline_id,
})
}
}
impl FullBackupCmd {
fn parse(query: &str) -> anyhow::Result<Self> {
let parameters = query.split_whitespace().collect_vec();
if parameters.len() < 2 || parameters.len() > 4 {
bail!(
"invalid number of parameters for basebackup command: {}",
query
);
}
let tenant_id = TenantId::from_str(parameters[0])
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
let timeline_id = TimelineId::from_str(parameters[1])
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
// The caller is responsible for providing correct lsn and prev_lsn.
let lsn = if let Some(lsn_str) = parameters.get(2) {
Some(
Lsn::from_str(lsn_str)
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
)
} else {
None
};
let prev_lsn = if let Some(prev_lsn_str) = parameters.get(3) {
Some(
Lsn::from_str(prev_lsn_str)
.with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
)
} else {
None
};
Ok(Self {
tenant_id,
timeline_id,
lsn,
prev_lsn,
})
}
}
impl BaseBackupCmd {
fn parse(query: &str) -> anyhow::Result<Self> {
let parameters = query.split_whitespace().collect_vec();
if parameters.len() < 2 {
bail!(
"invalid number of parameters for basebackup command: {}",
query
);
}
let tenant_id = TenantId::from_str(parameters[0])
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
let timeline_id = TimelineId::from_str(parameters[1])
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
let lsn;
let flags_parse_from;
if let Some(maybe_lsn) = parameters.get(2) {
if *maybe_lsn == "latest" {
lsn = None;
flags_parse_from = 3;
} else if maybe_lsn.starts_with("--") {
lsn = None;
flags_parse_from = 2;
} else {
lsn = Some(
Lsn::from_str(maybe_lsn)
.with_context(|| format!("Failed to parse lsn from {maybe_lsn}"))?,
);
flags_parse_from = 3;
}
} else {
lsn = None;
flags_parse_from = 2;
}
let mut gzip = false;
let mut replica = false;
for &param in &parameters[flags_parse_from..] {
match param {
"--gzip" => {
if gzip {
bail!("duplicate parameter for basebackup command: {param}")
}
gzip = true
}
"--replica" => {
if replica {
bail!("duplicate parameter for basebackup command: {param}")
}
replica = true
}
_ => bail!("invalid parameter for basebackup command: {param}"),
}
}
Ok(Self {
tenant_id,
timeline_id,
lsn,
gzip,
replica,
})
}
}
impl LeaseLsnCmd {
fn parse(query: &str) -> anyhow::Result<Self> {
let parameters = query.split_whitespace().collect_vec();
if parameters.len() != 3 {
bail!(
"invalid number of parameters for lease lsn command: {}",
query
);
}
let tenant_shard_id = TenantShardId::from_str(parameters[0])
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
let timeline_id = TimelineId::from_str(parameters[1])
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
let lsn = Lsn::from_str(parameters[2])
.with_context(|| format!("Failed to parse lsn from {}", parameters[2]))?;
Ok(Self {
tenant_shard_id,
timeline_id,
lsn,
})
}
}
impl PageServiceCmd {
fn parse(query: &str) -> anyhow::Result<Self> {
let query = query.trim();
let Some((cmd, other)) = query.split_once(' ') else {
bail!("cannot parse query: {query}")
};
match cmd.to_ascii_lowercase().as_str() {
"pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(other)?)),
"basebackup" => Ok(Self::BaseBackup(BaseBackupCmd::parse(other)?)),
"fullbackup" => Ok(Self::FullBackup(FullBackupCmd::parse(other)?)),
"lease" => {
let Some((cmd2, other)) = other.split_once(' ') else {
bail!("invalid lease command: {cmd}");
};
let cmd2 = cmd2.to_ascii_lowercase();
if cmd2 == "lsn" {
Ok(Self::LeaseLsn(LeaseLsnCmd::parse(other)?))
} else {
bail!("invalid lease command: {cmd}");
}
}
"set" => Ok(Self::Set),
_ => Err(anyhow::anyhow!("unsupported command {cmd} in {query}")),
}
}
}
impl<IO> postgres_backend::Handler<IO> for PageServerHandler
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -1494,137 +1277,206 @@ where
fail::fail_point!("ps::connection-start::process-query");
let ctx = self.connection_ctx.attached_child();
debug!("process query {query_string}");
let query = PageServiceCmd::parse(query_string)?;
match query {
PageServiceCmd::PageStream(PageStreamCmd {
tenant_id,
timeline_id,
}) => {
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::PageStreamV2)
.inc();
self.handle_pagerequests(
pgb,
tenant_id,
timeline_id,
PagestreamProtocolVersion::V2,
ctx,
)
.await?;
debug!("process query {query_string:?}");
let parts = query_string.split_whitespace().collect::<Vec<_>>();
if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for pagestream command"
)));
}
PageServiceCmd::BaseBackup(BaseBackupCmd {
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::PageStreamV2)
.inc();
self.handle_pagerequests(
pgb,
tenant_id,
timeline_id,
lsn,
gzip,
replica,
}) => {
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
PagestreamProtocolVersion::V2,
ctx,
)
.await?;
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for basebackup command"
)));
}
self.check_permission(Some(tenant_id))?;
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Basebackup)
.inc();
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
let res = async {
self.handle_basebackup_request(
pgb,
tenant_id,
timeline_id,
lsn,
None,
false,
gzip,
replica,
&ctx,
)
.await?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
Result::<(), QueryError>::Ok(())
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Basebackup)
.inc();
let mut lsn = None;
let mut replica = false;
let mut gzip = false;
for param in &params[2..] {
if param.starts_with("--") {
match *param {
"--gzip" => gzip = true,
"--replica" => replica = true,
_ => {
return Err(QueryError::Other(anyhow::anyhow!(
"Unknown parameter {param}",
)))
}
}
} else {
lsn = Some(
Lsn::from_str(param)
.with_context(|| format!("Failed to parse Lsn from {param}"))?,
);
}
.await;
metric_recording.observe(&res);
res?;
}
// same as basebackup, but result includes relational data as well
PageServiceCmd::FullBackup(FullBackupCmd {
tenant_id,
timeline_id,
lsn,
prev_lsn,
}) => {
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Fullbackup)
.inc();
// Check that the timeline exists
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
let res = async {
self.handle_basebackup_request(
pgb,
tenant_id,
timeline_id,
lsn,
prev_lsn,
true,
false,
None,
false,
gzip,
replica,
&ctx,
)
.await?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
Result::<(), QueryError>::Ok(())
}
PageServiceCmd::Set => {
// important because psycopg2 executes "SET datestyle TO 'ISO'"
// on connect
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
.await;
metric_recording.observe(&res);
res?;
}
// same as basebackup, but result includes relational data as well
else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for fullbackup command"
)));
}
PageServiceCmd::LeaseLsn(LeaseLsnCmd {
tenant_shard_id,
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
// The caller is responsible for providing correct lsn and prev_lsn.
let lsn = if let Some(lsn_str) = params.get(2) {
Some(
Lsn::from_str(lsn_str)
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
)
} else {
None
};
let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
Some(
Lsn::from_str(prev_lsn_str)
.with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
)
} else {
None
};
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Fullbackup)
.inc();
// Check that the timeline exists
self.handle_basebackup_request(
pgb,
tenant_id,
timeline_id,
lsn,
}) => {
tracing::Span::current()
.record("tenant_id", field::display(tenant_shard_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_shard_id.tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::LeaseLsn)
.inc();
match self
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
.await
{
Ok(()) => {
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?
}
Err(e) => {
error!("error obtaining lsn lease for {lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
prev_lsn,
true,
false,
false,
&ctx,
)
.await?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.to_ascii_lowercase().starts_with("set ") {
// important because psycopg2 executes "SET datestyle TO 'ISO'"
// on connect
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("lease lsn ") {
let params = &parts[2..];
if params.len() != 3 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number {} for lease lsn command",
params.len()
)));
}
let tenant_shard_id = TenantShardId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_shard_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_shard_id.tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::LeaseLsn)
.inc();
// The caller is responsible for providing correct lsn.
let lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
match self
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
.await
{
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error obtaining lsn lease for {lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
} else {
return Err(QueryError::Other(anyhow::anyhow!(
"unknown command {query_string}"
)));
}
Ok(())
@@ -1673,181 +1525,3 @@ fn set_tracing_field_shard_id(timeline: &Timeline) {
);
debug_assert_current_span_has_tenant_and_timeline_id();
}
#[cfg(test)]
mod tests {
use utils::shard::ShardCount;
use super::*;
#[test]
fn pageservice_cmd_parse() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let cmd =
PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id} {timeline_id}")).unwrap();
assert_eq!(
cmd,
PageServiceCmd::PageStream(PageStreamCmd {
tenant_id,
timeline_id
})
);
let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id}")).unwrap();
assert_eq!(
cmd,
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
timeline_id,
lsn: None,
gzip: false,
replica: false
})
);
let cmd =
PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} --gzip")).unwrap();
assert_eq!(
cmd,
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
timeline_id,
lsn: None,
gzip: true,
replica: false
})
);
let cmd =
PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} latest")).unwrap();
assert_eq!(
cmd,
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
timeline_id,
lsn: None,
gzip: false,
replica: false
})
);
let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} 0/16ABCDE"))
.unwrap();
assert_eq!(
cmd,
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
timeline_id,
lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
gzip: false,
replica: false
})
);
let cmd = PageServiceCmd::parse(&format!(
"basebackup {tenant_id} {timeline_id} --replica --gzip"
))
.unwrap();
assert_eq!(
cmd,
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
timeline_id,
lsn: None,
gzip: true,
replica: true
})
);
let cmd = PageServiceCmd::parse(&format!(
"basebackup {tenant_id} {timeline_id} 0/16ABCDE --replica --gzip"
))
.unwrap();
assert_eq!(
cmd,
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
timeline_id,
lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
gzip: true,
replica: true
})
);
let cmd = PageServiceCmd::parse(&format!("fullbackup {tenant_id} {timeline_id}")).unwrap();
assert_eq!(
cmd,
PageServiceCmd::FullBackup(FullBackupCmd {
tenant_id,
timeline_id,
lsn: None,
prev_lsn: None
})
);
let cmd = PageServiceCmd::parse(&format!(
"fullbackup {tenant_id} {timeline_id} 0/16ABCDE 0/16ABCDF"
))
.unwrap();
assert_eq!(
cmd,
PageServiceCmd::FullBackup(FullBackupCmd {
tenant_id,
timeline_id,
lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
prev_lsn: Some(Lsn::from_str("0/16ABCDF").unwrap()),
})
);
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let cmd = PageServiceCmd::parse(&format!(
"lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE"
))
.unwrap();
assert_eq!(
cmd,
PageServiceCmd::LeaseLsn(LeaseLsnCmd {
tenant_shard_id,
timeline_id,
lsn: Lsn::from_str("0/16ABCDE").unwrap(),
})
);
let tenant_shard_id = TenantShardId::split(&tenant_shard_id, ShardCount(8))[1];
let cmd = PageServiceCmd::parse(&format!(
"lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE"
))
.unwrap();
assert_eq!(
cmd,
PageServiceCmd::LeaseLsn(LeaseLsnCmd {
tenant_shard_id,
timeline_id,
lsn: Lsn::from_str("0/16ABCDE").unwrap(),
})
);
let cmd = PageServiceCmd::parse("set a = b").unwrap();
assert_eq!(cmd, PageServiceCmd::Set);
let cmd = PageServiceCmd::parse("SET foo").unwrap();
assert_eq!(cmd, PageServiceCmd::Set);
}
#[test]
fn pageservice_cmd_err_handling() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let cmd = PageServiceCmd::parse("unknown_command");
assert!(cmd.is_err());
let cmd = PageServiceCmd::parse("pagestream_v2");
assert!(cmd.is_err());
let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx"));
assert!(cmd.is_err());
let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx {timeline_id}xxx"));
assert!(cmd.is_err());
let cmd = PageServiceCmd::parse(&format!(
"basebackup {tenant_id} {timeline_id} --gzip --gzip"
));
assert!(cmd.is_err());
let cmd = PageServiceCmd::parse(&format!(
"basebackup {tenant_id} {timeline_id} --gzip --unknown"
));
assert!(cmd.is_err());
let cmd = PageServiceCmd::parse(&format!(
"basebackup {tenant_id} {timeline_id} --gzip 0/16ABCDE"
));
assert!(cmd.is_err());
let cmd = PageServiceCmd::parse(&format!("lease {tenant_id} {timeline_id} gzip 0/16ABCDE"));
assert!(cmd.is_err());
}
}

View File

@@ -24,7 +24,6 @@ use pageserver_api::key::{
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
use pageserver_api::value::Value;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
@@ -39,13 +38,12 @@ use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::pausable_failpoint;
use utils::{bin_ser::BeSer, lsn::Lsn};
use wal_decoder::serialized_batch::SerializedValueBatch;
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
pub const MAX_AUX_FILE_V2_DELTAS: usize = 16;
pub const MAX_AUX_FILE_V2_DELTAS: usize = 64;
#[derive(Debug)]
pub enum LsnForTimestamp {
@@ -172,11 +170,12 @@ impl Timeline {
tline: self,
pending_lsns: Vec::new(),
pending_metadata_pages: HashMap::new(),
pending_data_batch: None,
pending_data_pages: Vec::new(),
pending_zero_data_pages: Default::default(),
pending_deletions: Vec::new(),
pending_nblocks: 0,
pending_directory_entries: Vec::new(),
pending_metadata_bytes: 0,
pending_bytes: 0,
lsn,
}
}
@@ -1026,14 +1025,21 @@ pub struct DatadirModification<'a> {
/// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for
/// which keys are stored here.
pending_data_batch: Option<SerializedValueBatch>,
pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>,
// Sometimes during ingest, for example when extending a relation, we would like to write a zero page. However,
// if we encounter a write from postgres in the same wal record, we will drop this entry.
//
// Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed
// at the end of each wal record, and all these writes implicitly are at lsn Self::lsn
pending_zero_data_pages: HashSet<CompactKey>,
/// For special "directory" keys that store key-value maps, track the size of the map
/// if it was updated in this modification.
pending_directory_entries: Vec<(DirectoryKind, usize)>,
/// An **approximation** of how many metadata bytes will be written to the EphemeralFile.
pending_metadata_bytes: usize,
/// An **approximation** of how large our EphemeralFile write will be when committed.
pending_bytes: usize,
}
impl<'a> DatadirModification<'a> {
@@ -1048,17 +1054,11 @@ impl<'a> DatadirModification<'a> {
}
pub(crate) fn approx_pending_bytes(&self) -> usize {
self.pending_data_batch
.as_ref()
.map_or(0, |b| b.buffer_size())
+ self.pending_metadata_bytes
self.pending_bytes
}
pub(crate) fn has_dirty_data(&self) -> bool {
!self
.pending_data_batch
.as_ref()
.map_or(true, |b| b.is_empty())
pub(crate) fn has_dirty_data_pages(&self) -> bool {
(!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty())
}
/// Set the current lsn
@@ -1070,6 +1070,9 @@ impl<'a> DatadirModification<'a> {
self.lsn
);
// If we are advancing LSN, then state from previous wal record should have been flushed.
assert!(self.pending_zero_data_pages.is_empty());
if lsn > self.lsn {
self.pending_lsns.push(self.lsn);
self.lsn = lsn;
@@ -1144,107 +1147,6 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
/// Creates a relation if it is not already present.
/// Returns the current size of the relation
pub(crate) async fn create_relation_if_required(
&mut self,
rel: RelTag,
ctx: &RequestContext,
) -> Result<u32, PageReconstructError> {
// Get current size and put rel creation if rel doesn't exist
//
// NOTE: we check the cache first even though get_rel_exists and get_rel_size would
// check the cache too. This is because eagerly checking the cache results in
// less work overall and 10% better performance. It's more work on cache miss
// but cache miss is rare.
if let Some(nblocks) = self.tline.get_cached_rel_size(&rel, self.get_lsn()) {
Ok(nblocks)
} else if !self
.tline
.get_rel_exists(rel, Version::Modified(self), ctx)
.await?
{
// create it with 0 size initially, the logic below will extend it
self.put_rel_creation(rel, 0, ctx)
.await
.context("Relation Error")?;
Ok(0)
} else {
self.tline
.get_rel_size(rel, Version::Modified(self), ctx)
.await
}
}
/// Given a block number for a relation (which represents a newly written block),
/// the previous block count of the relation, and the shard info, find the gaps
/// that were created by the newly written block if any.
fn find_gaps(
rel: RelTag,
blkno: u32,
previous_nblocks: u32,
shard: &ShardIdentity,
) -> Option<KeySpace> {
let mut key = rel_block_to_key(rel, blkno);
let mut gap_accum = None;
for gap_blkno in previous_nblocks..blkno {
key.field6 = gap_blkno;
if shard.get_shard_number(&key) != shard.number {
continue;
}
gap_accum
.get_or_insert_with(KeySpaceAccum::new)
.add_key(key);
}
gap_accum.map(|accum| accum.to_keyspace())
}
pub async fn ingest_batch(
&mut self,
mut batch: SerializedValueBatch,
// TODO(vlad): remove this argument and replace the shard check with is_key_local
shard: &ShardIdentity,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut gaps_at_lsns = Vec::default();
for meta in batch.metadata.iter() {
let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
let new_nblocks = blkno + 1;
let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
if new_nblocks > old_nblocks {
self.put_rel_extend(rel, new_nblocks, ctx).await?;
}
if let Some(gaps) = Self::find_gaps(rel, blkno, old_nblocks, shard) {
gaps_at_lsns.push((gaps, meta.lsn()));
}
}
if !gaps_at_lsns.is_empty() {
batch.zero_gaps(gaps_at_lsns);
}
match self.pending_data_batch.as_mut() {
Some(pending_batch) => {
pending_batch.extend(batch);
}
None if !batch.is_empty() => {
self.pending_data_batch = Some(batch);
}
None => {
// Nothing to initialize the batch with
}
}
Ok(())
}
/// Put a new page version that can be constructed from a WAL record
///
/// NOTE: this will *not* implicitly extend the relation, if the page is beyond the
@@ -1327,13 +1229,8 @@ impl<'a> DatadirModification<'a> {
self.lsn
);
}
let batch = self
.pending_data_batch
.get_or_insert_with(SerializedValueBatch::default);
batch.put(key.to_compact(), Value::Image(ZERO_PAGE.clone()), self.lsn);
self.pending_zero_data_pages.insert(key.to_compact());
self.pending_bytes += ZERO_PAGE.len();
Ok(())
}
@@ -1351,16 +1248,19 @@ impl<'a> DatadirModification<'a> {
self.lsn
);
}
let batch = self
.pending_data_batch
.get_or_insert_with(SerializedValueBatch::default);
batch.put(key.to_compact(), Value::Image(ZERO_PAGE.clone()), self.lsn);
self.pending_zero_data_pages.insert(key.to_compact());
self.pending_bytes += ZERO_PAGE.len();
Ok(())
}
/// Call this at the end of each WAL record.
pub(crate) fn on_record_end(&mut self) {
let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages);
for key in pending_zero_data_pages {
self.put_data(key, Value::Image(ZERO_PAGE.clone()));
}
}
/// Store a relmapper file (pg_filenode.map) in the repository
pub async fn put_relmap_file(
&mut self,
@@ -1850,17 +1750,12 @@ impl<'a> DatadirModification<'a> {
let mut writer = self.tline.writer().await;
// Flush relation and SLRU data blocks, keep metadata.
if let Some(batch) = self.pending_data_batch.take() {
tracing::debug!(
"Flushing batch with max_lsn={}. Last record LSN is {}",
batch.max_lsn,
self.tline.get_last_record_lsn()
);
let pending_data_pages = std::mem::take(&mut self.pending_data_pages);
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put_batch(batch, ctx).await?;
}
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put_batch(pending_data_pages, ctx).await?;
self.pending_bytes = 0;
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1880,6 +1775,9 @@ impl<'a> DatadirModification<'a> {
/// All the modifications in this atomic update are stamped by the specified LSN.
///
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
// Commit should never be called mid-wal-record
assert!(self.pending_zero_data_pages.is_empty());
let mut writer = self.tline.writer().await;
let pending_nblocks = self.pending_nblocks;
@@ -1887,49 +1785,21 @@ impl<'a> DatadirModification<'a> {
// Ordering: the items in this batch do not need to be in any global order, but values for
// a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on
// this to do efficient updates to its index. See [`wal_decoder::serialized_batch`] for
// more details.
// this to do efficient updates to its index.
let mut write_batch = std::mem::take(&mut self.pending_data_pages);
let metadata_batch = {
let pending_meta = self
.pending_metadata_pages
write_batch.extend(
self.pending_metadata_pages
.drain()
.flat_map(|(key, values)| {
values
.into_iter()
.map(move |(lsn, value_size, value)| (key, lsn, value_size, value))
})
.collect::<Vec<_>>();
}),
);
if pending_meta.is_empty() {
None
} else {
Some(SerializedValueBatch::from_values(pending_meta))
}
};
let data_batch = self.pending_data_batch.take();
let maybe_batch = match (data_batch, metadata_batch) {
(Some(mut data), Some(metadata)) => {
data.extend(metadata);
Some(data)
}
(Some(data), None) => Some(data),
(None, Some(metadata)) => Some(metadata),
(None, None) => None,
};
if let Some(batch) = maybe_batch {
tracing::debug!(
"Flushing batch with max_lsn={}. Last record LSN is {}",
batch.max_lsn,
self.tline.get_last_record_lsn()
);
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put_batch(batch, ctx).await?;
if !write_batch.is_empty() {
writer.put_batch(write_batch, ctx).await?;
}
if !self.pending_deletions.is_empty() {
@@ -1939,9 +1809,6 @@ impl<'a> DatadirModification<'a> {
self.pending_lsns.push(self.lsn);
for pending_lsn in self.pending_lsns.drain(..) {
// TODO(vlad): pretty sure the comment below is not valid anymore
// and we can call finish write with the latest LSN
//
// Ideally, we should be able to call writer.finish_write() only once
// with the highest LSN. However, the last_record_lsn variable in the
// timeline keeps track of the latest LSN and the immediate previous LSN
@@ -1957,14 +1824,14 @@ impl<'a> DatadirModification<'a> {
writer.update_directory_entries_count(kind, count as u64);
}
self.pending_metadata_bytes = 0;
self.pending_bytes = 0;
Ok(())
}
pub(crate) fn len(&self) -> usize {
self.pending_metadata_pages.len()
+ self.pending_data_batch.as_ref().map_or(0, |b| b.len())
+ self.pending_data_pages.len()
+ self.pending_deletions.len()
}
@@ -2006,10 +1873,11 @@ impl<'a> DatadirModification<'a> {
// modifications before ingesting DB create operations, which are the only kind that reads
// data pages during ingest.
if cfg!(debug_assertions) {
assert!(!self
.pending_data_batch
.as_ref()
.map_or(false, |b| b.updates_key(&key)));
for (dirty_key, _, _, _) in &self.pending_data_pages {
debug_assert!(&key.to_compact() != dirty_key);
}
debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact()))
}
}
@@ -2027,10 +1895,18 @@ impl<'a> DatadirModification<'a> {
}
fn put_data(&mut self, key: CompactKey, val: Value) {
let batch = self
.pending_data_batch
.get_or_insert_with(SerializedValueBatch::default);
batch.put(key, val, self.lsn);
let val_serialized_size = val.serialized_size().unwrap() as usize;
// If this page was previously zero'd in the same WalRecord, then drop the previous zero page write. This
// is an optimization that avoids persisting both the zero page generated by us (e.g. during a relation extend),
// and the subsequent postgres-originating write
if self.pending_zero_data_pages.remove(&key) {
self.pending_bytes -= ZERO_PAGE.len();
}
self.pending_bytes += val_serialized_size;
self.pending_data_pages
.push((key, self.lsn, val_serialized_size, val))
}
fn put_metadata(&mut self, key: CompactKey, val: Value) {
@@ -2038,10 +1914,10 @@ impl<'a> DatadirModification<'a> {
// Replace the previous value if it exists at the same lsn
if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
if *last_lsn == self.lsn {
// Update the pending_metadata_bytes contribution from this entry, and update the serialized size in place
self.pending_metadata_bytes -= *last_value_ser_size;
// Update the pending_bytes contribution from this entry, and update the serialized size in place
self.pending_bytes -= *last_value_ser_size;
*last_value_ser_size = val.serialized_size().unwrap() as usize;
self.pending_metadata_bytes += *last_value_ser_size;
self.pending_bytes += *last_value_ser_size;
// Use the latest value, this replaces any earlier write to the same (key,lsn), such as much
// have been generated by synthesized zero page writes prior to the first real write to a page.
@@ -2051,12 +1927,8 @@ impl<'a> DatadirModification<'a> {
}
let val_serialized_size = val.serialized_size().unwrap() as usize;
self.pending_metadata_bytes += val_serialized_size;
self.pending_bytes += val_serialized_size;
values.push((self.lsn, val_serialized_size, val));
if key == CHECKPOINT_KEY.to_compact() {
tracing::debug!("Checkpoint key added to pending with size {val_serialized_size}");
}
}
fn delete(&mut self, key_range: Range<Key>) {
@@ -2165,11 +2037,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
#[cfg(test)]
mod tests {
use hex_literal::hex;
use pageserver_api::{models::ShardParameters, shard::ShardStripeSize};
use utils::{
id::TimelineId,
shard::{ShardCount, ShardNumber},
};
use utils::id::TimelineId;
use super::*;
@@ -2223,93 +2091,6 @@ mod tests {
Ok(())
}
#[test]
fn gap_finding() {
let rel = RelTag {
spcnode: 1663,
dbnode: 208101,
relnode: 2620,
forknum: 0,
};
let base_blkno = 1;
let base_key = rel_block_to_key(rel, base_blkno);
let before_base_key = rel_block_to_key(rel, base_blkno - 1);
let shard = ShardIdentity::unsharded();
let mut previous_nblocks = 0;
for i in 0..10 {
let crnt_blkno = base_blkno + i;
let gaps = DatadirModification::find_gaps(rel, crnt_blkno, previous_nblocks, &shard);
previous_nblocks = crnt_blkno + 1;
if i == 0 {
// The first block we write is 1, so we should find the gap.
assert_eq!(gaps.unwrap(), KeySpace::single(before_base_key..base_key));
} else {
assert!(gaps.is_none());
}
}
// This is an update to an already existing block. No gaps here.
let update_blkno = 5;
let gaps = DatadirModification::find_gaps(rel, update_blkno, previous_nblocks, &shard);
assert!(gaps.is_none());
// This is an update past the current end block.
let after_gap_blkno = 20;
let gaps = DatadirModification::find_gaps(rel, after_gap_blkno, previous_nblocks, &shard);
let gap_start_key = rel_block_to_key(rel, previous_nblocks);
let after_gap_key = rel_block_to_key(rel, after_gap_blkno);
assert_eq!(
gaps.unwrap(),
KeySpace::single(gap_start_key..after_gap_key)
);
}
#[test]
fn sharded_gap_finding() {
let rel = RelTag {
spcnode: 1663,
dbnode: 208101,
relnode: 2620,
forknum: 0,
};
let first_blkno = 6;
// This shard will get the even blocks
let shard = ShardIdentity::from_params(
ShardNumber(0),
&ShardParameters {
count: ShardCount(2),
stripe_size: ShardStripeSize(1),
},
);
// Only keys belonging to this shard are considered as gaps.
let mut previous_nblocks = 0;
let gaps =
DatadirModification::find_gaps(rel, first_blkno, previous_nblocks, &shard).unwrap();
assert!(!gaps.ranges.is_empty());
for gap_range in gaps.ranges {
let mut k = gap_range.start;
while k != gap_range.end {
assert_eq!(shard.get_shard_number(&k), shard.number);
k = k.next();
}
}
previous_nblocks = first_blkno;
let update_blkno = 2;
let gaps = DatadirModification::find_gaps(rel, update_blkno, previous_nblocks, &shard);
assert!(gaps.is_none());
}
/*
fn assert_current_logical_size<R: Repository>(timeline: &DatadirTimeline<R>, lsn: Lsn) {
let incremental = timeline.get_current_logical_size();

View File

@@ -2493,22 +2493,14 @@ impl Tenant {
timelines_to_compact_or_offload = timelines
.iter()
.filter_map(|(timeline_id, timeline)| {
let (is_active, (can_offload, _)) =
(timeline.is_active(), timeline.can_offload());
let (is_active, can_offload) = (timeline.is_active(), timeline.can_offload());
let has_no_unoffloaded_children = {
!timelines
.iter()
.any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id))
};
let config_allows_offload = self.conf.timeline_offloading
|| self
.tenant_conf
.load()
.tenant_conf
.timeline_offloading
.unwrap_or_default();
let can_offload =
can_offload && has_no_unoffloaded_children && config_allows_offload;
can_offload && has_no_unoffloaded_children && self.conf.timeline_offloading;
if (is_active, can_offload) == (false, false) {
None
} else {
@@ -2537,11 +2529,6 @@ impl Tenant {
.await
.inspect_err(|e| match e {
timeline::CompactionError::ShuttingDown => (),
timeline::CompactionError::Offload(_) => {
// Failures to offload timelines do not trip the circuit breaker, because
// they do not do lots of writes the way compaction itself does: it is cheap
// to retry, and it would be bad to stop all compaction because of an issue with offloading.
}
timeline::CompactionError::Other(e) => {
self.compaction_circuit_breaker
.lock()
@@ -2557,7 +2544,8 @@ impl Tenant {
if pending_task_left == Some(false) && *can_offload {
offload_timeline(self, timeline)
.instrument(info_span!("offload_timeline", %timeline_id))
.await?;
.await
.map_err(timeline::CompactionError::Other)?;
}
}
@@ -4780,18 +4768,10 @@ async fn run_initdb(
let _permit = INIT_DB_SEMAPHORE.acquire().await;
let mut initdb_command = tokio::process::Command::new(&initdb_bin_path);
initdb_command
let initdb_command = tokio::process::Command::new(&initdb_bin_path)
.args(["--pgdata", initdb_target_dir.as_ref()])
.args(["--username", &conf.superuser])
.args(["--encoding", "utf8"])
.args(["--locale", &conf.locale])
.args(["--lc-collate", &conf.locale])
.args(["--lc-ctype", &conf.locale])
.args(["--lc-messages", &conf.locale])
.args(["--lc-monetary", &conf.locale])
.args(["--lc-numeric", &conf.locale])
.args(["--lc-time", &conf.locale])
.arg("--no-instructions")
.arg("--no-sync")
.env_clear()
@@ -4801,27 +4781,15 @@ async fn run_initdb(
// stdout invocation produces the same output every time, we don't need it
.stdout(std::process::Stdio::null())
// we would be interested in the stderr output, if there was any
.stderr(std::process::Stdio::piped());
// Before version 14, only the libc provide was available.
if pg_version > 14 {
// Version 17 brought with it a builtin locale provider which only provides
// C and C.UTF-8. While being safer for collation purposes since it is
// guaranteed to be consistent throughout a major release, it is also more
// performant.
let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };
initdb_command.args(["--locale-provider", locale_provider]);
}
let initdb_proc = initdb_command.spawn()?;
.stderr(std::process::Stdio::piped())
.spawn()?;
// Ideally we'd select here with the cancellation token, but the problem is that
// we can't safely terminate initdb: it launches processes of its own, and killing
// initdb doesn't kill them. After we return from this function, we want the target
// directory to be able to be cleaned up.
// See https://github.com/neondatabase/neon/issues/6385
let initdb_output = initdb_proc.wait_with_output().await?;
let initdb_output = initdb_command.wait_with_output().await?;
if !initdb_output.status.success() {
return Err(InitdbError::Failed(
initdb_output.status,
@@ -4930,7 +4898,6 @@ pub(crate) mod harness {
),
lsn_lease_length: Some(tenant_conf.lsn_lease_length),
lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
timeline_offloading: Some(tenant_conf.timeline_offloading),
}
}
}

View File

@@ -349,10 +349,6 @@ pub struct TenantConfOpt {
#[serde(with = "humantime_serde")]
#[serde(default)]
pub lsn_lease_length_for_ts: Option<Duration>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub timeline_offloading: Option<bool>,
}
impl TenantConfOpt {
@@ -415,9 +411,6 @@ impl TenantConfOpt {
lsn_lease_length_for_ts: self
.lsn_lease_length_for_ts
.unwrap_or(global_conf.lsn_lease_length_for_ts),
timeline_offloading: self
.lazy_slru_download
.unwrap_or(global_conf.timeline_offloading),
}
}
}
@@ -471,7 +464,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
lsn_lease_length: value.lsn_lease_length.map(humantime),
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
timeline_offloading: value.timeline_offloading,
}
}
}

View File

@@ -1959,7 +1959,7 @@ impl TenantManager {
attempt.before_reset_tenant();
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Flush).await {
match tenant.shutdown(progress, ShutdownMode::Hard).await {
Ok(()) => {
slot_guard.drop_old_value().expect("it was just shutdown");
}

View File

@@ -1445,7 +1445,7 @@ impl RemoteTimelineClient {
let remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
uploaded.metadata().shard,
self.tenant_shard_id.to_index(),
&uploaded.layer_desc().layer_name(),
uploaded.metadata().generation,
);
@@ -1486,7 +1486,7 @@ impl RemoteTimelineClient {
&adopted
.get_timeline_id()
.expect("Source timeline should be alive"),
adopted.metadata().shard,
self.tenant_shard_id.to_index(),
&adopted.layer_desc().layer_name(),
adopted.metadata().generation,
);
@@ -1494,7 +1494,7 @@ impl RemoteTimelineClient {
let target_remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
adopted_as.metadata().shard,
self.tenant_shard_id.to_index(),
&adopted_as.layer_desc().layer_name(),
adopted_as.metadata().generation,
);
@@ -2201,18 +2201,6 @@ impl RemoteTimelineClient {
inner.initialized_mut()?;
Ok(UploadQueueAccessor { inner })
}
pub(crate) fn no_pending_work(&self) -> bool {
let inner = self.upload_queue.lock().unwrap();
match &*inner {
UploadQueue::Uninitialized
| UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => true,
UploadQueue::Stopped(UploadQueueStopped::Deletable(x)) => {
x.upload_queue_for_deletion.no_pending_work()
}
UploadQueue::Initialized(x) => x.no_pending_work(),
}
}
}
pub(crate) struct UploadQueueAccessor<'a> {

View File

@@ -12,7 +12,7 @@ pub mod merge_iterator;
use crate::context::{AccessStatsBehavior, RequestContext};
use bytes::Bytes;
use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE};
use pageserver_api::key::Key;
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
use pageserver_api::record::NeonWalRecord;
use pageserver_api::value::Value;
@@ -196,9 +196,6 @@ impl ValuesReconstructState {
/// Returns true if this was the last value needed for the key and false otherwise.
///
/// If the key is done after the update, mark it as such.
///
/// If the key is in the sparse keyspace (i.e., aux files), we do not track them in
/// `key_done`.
pub(crate) fn update_key(
&mut self,
key: &Key,
@@ -209,18 +206,10 @@ impl ValuesReconstructState {
.keys
.entry(*key)
.or_insert(Ok(VectoredValueReconstructState::default()));
let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key);
if let Ok(state) = state {
let key_done = match state.situation {
ValueReconstructSituation::Complete => {
if is_sparse_key {
// Sparse keyspace might be visited multiple times because
// we don't track unmapped keyspaces.
return ValueReconstructSituation::Complete;
} else {
unreachable!()
}
}
ValueReconstructSituation::Complete => unreachable!(),
ValueReconstructSituation::Continue => match value {
Value::Image(img) => {
state.img = Some((lsn, img));
@@ -245,9 +234,7 @@ impl ValuesReconstructState {
if key_done && state.situation == ValueReconstructSituation::Continue {
state.situation = ValueReconstructSituation::Complete;
if !is_sparse_key {
self.keys_done.add_key(*key);
}
self.keys_done.add_key(*key);
}
state.situation

View File

@@ -12,7 +12,7 @@ use crate::tenant::timeline::GetVectoredError;
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::{l0_flush, page_cache};
use anyhow::{anyhow, Result};
use anyhow::{anyhow, Context, Result};
use camino::Utf8PathBuf;
use pageserver_api::key::CompactKey;
use pageserver_api::key::Key;
@@ -25,7 +25,6 @@ use std::sync::{Arc, OnceLock};
use std::time::Instant;
use tracing::*;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta};
// avoid binding to Write (conflicts with std::io::Write)
// while being able to use std::fmt::Write's methods
use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
@@ -67,8 +66,6 @@ pub struct InMemoryLayer {
/// The above fields never change, except for `end_lsn`, which is only set once.
/// All other changing parts are in `inner`, and protected by a mutex.
inner: RwLock<InMemoryLayerInner>,
estimated_in_mem_size: AtomicU64,
}
impl std::fmt::Debug for InMemoryLayer {
@@ -455,7 +452,6 @@ impl InMemoryLayer {
len,
will_init,
} = index_entry.unpack();
reads.entry(key).or_default().push(ValueRead {
entry_lsn: *entry_lsn,
read: vectored_dio_read::LogicalRead::new(
@@ -517,6 +513,68 @@ impl InMemoryLayer {
}
}
/// Offset of a particular Value within a serialized batch.
struct SerializedBatchOffset {
key: CompactKey,
lsn: Lsn,
// TODO: separate type when we start serde-serializing this value, to avoid coupling
// in-memory representation to serialization format.
index_entry: IndexEntry,
}
pub struct SerializedBatch {
/// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
pub(crate) raw: Vec<u8>,
/// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
offsets: Vec<SerializedBatchOffset>,
/// The highest LSN of any value in the batch
pub(crate) max_lsn: Lsn,
}
impl SerializedBatch {
pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> anyhow::Result<Self> {
// Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
// [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
let buffer_size = batch.iter().map(|i| i.2).sum::<usize>();
let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
let mut max_lsn: Lsn = Lsn(0);
for (key, lsn, val_ser_size, val) in batch {
let relative_off = cursor.position();
val.ser_into(&mut cursor)
.expect("Writing into in-memory buffer is infallible");
offsets.push(SerializedBatchOffset {
key,
lsn,
index_entry: IndexEntry::new(IndexEntryNewArgs {
base_offset: 0,
batch_offset: relative_off,
len: val_ser_size,
will_init: val.will_init(),
})
.context("higher-level code ensures that values are within supported ranges")?,
});
max_lsn = std::cmp::max(max_lsn, lsn);
}
let buffer = cursor.into_inner();
// Assert that we didn't do any extra allocations while building buffer.
debug_assert!(buffer.len() <= buffer_size);
Ok(Self {
raw: buffer,
offsets,
max_lsn,
})
}
}
fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
}
@@ -545,10 +603,6 @@ impl InMemoryLayer {
Ok(inner.file.len())
}
pub fn estimated_in_mem_size(&self) -> u64 {
self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
}
/// Create a new, empty, in-memory layer
pub async fn create(
conf: &'static PageServerConf,
@@ -578,7 +632,6 @@ impl InMemoryLayer {
file,
resource_units: GlobalResourceUnits::new(),
}),
estimated_in_mem_size: AtomicU64::new(0),
})
}
@@ -589,7 +642,7 @@ impl InMemoryLayer {
/// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
pub async fn put_batch(
&self,
serialized_batch: SerializedValueBatch,
serialized_batch: SerializedBatch,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut inner = self.inner.write().await;
@@ -597,13 +650,27 @@ impl InMemoryLayer {
let base_offset = inner.file.len();
let SerializedValueBatch {
let SerializedBatch {
raw,
metadata,
mut offsets,
max_lsn: _,
len: _,
} = serialized_batch;
// Add the base_offset to the batch's index entries which are relative to the batch start.
for offset in &mut offsets {
let IndexEntryUnpacked {
will_init,
len,
pos,
} = offset.index_entry.unpack();
offset.index_entry = IndexEntry::new(IndexEntryNewArgs {
base_offset,
batch_offset: pos,
len: len.into_usize(),
will_init,
})?;
}
// Write the batch to the file
inner.file.write_raw(&raw, ctx).await?;
let new_size = inner.file.len();
@@ -616,28 +683,12 @@ impl InMemoryLayer {
assert_eq!(new_size, expected_new_len);
// Update the index with the new entries
for meta in metadata {
let SerializedValueMeta {
key,
lsn,
batch_offset,
len,
will_init,
} = match meta {
ValueMeta::Serialized(ser) => ser,
ValueMeta::Observed(_) => {
continue;
}
};
// Add the base_offset to the batch's index entries which are relative to the batch start.
let index_entry = IndexEntry::new(IndexEntryNewArgs {
base_offset,
batch_offset,
len,
will_init,
})?;
for SerializedBatchOffset {
key,
lsn,
index_entry,
} in offsets
{
let vec_map = inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
if old.is_some() {
@@ -649,12 +700,6 @@ impl InMemoryLayer {
// because this case is unexpected, and we would like tests to fail if this happens.
warn!("Key {} at {} written twice at same LSN", key, lsn);
}
self.estimated_in_mem_size.fetch_add(
(std::mem::size_of::<CompactKey>()
+ std::mem::size_of::<Lsn>()
+ std::mem::size_of::<IndexEntry>()) as u64,
AtomicOrdering::Relaxed,
);
}
inner.resource_units.maybe_publish_size(new_size);

View File

@@ -279,7 +279,6 @@ fn log_compaction_error(
let decision = match e {
ShuttingDown => None,
Offload(_) => Some(LooksLike::Error),
_ if task_cancelled => Some(LooksLike::Info),
Other(e) => {
let root_cause = e.root_cause();

View File

@@ -20,13 +20,11 @@ use chrono::{DateTime, Utc};
use enumset::EnumSet;
use fail::fail_point;
use handle::ShardTimelineId;
use offload::OffloadError;
use once_cell::sync::Lazy;
use pageserver_api::{
config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
key::{
KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
NON_INHERITED_SPARSE_RANGE,
CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
},
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
models::{
@@ -50,7 +48,6 @@ use utils::{
fs_ext, pausable_failpoint,
sync::gate::{Gate, GateGuard},
};
use wal_decoder::serialized_batch::SerializedValueBatch;
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::{Arc, Mutex, RwLock, Weak};
@@ -133,6 +130,7 @@ use crate::task_mgr::TaskKind;
use crate::tenant::gc_result::GcResult;
use crate::ZERO_PAGE;
use pageserver_api::key::Key;
use pageserver_api::value::Value;
use self::delete::DeleteTimelineFlow;
pub(super) use self::eviction_task::EvictionTaskTenantState;
@@ -142,7 +140,9 @@ use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::{
config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized,
config::TenantConf,
storage_layer::{inmemory_layer, LayerVisibilityHint},
upload_queue::NotInitialized,
MaybeOffloaded,
};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
@@ -156,9 +156,6 @@ use super::{
GcError,
};
#[cfg(test)]
use pageserver_api::value::Value;
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(crate) enum FlushLoopState {
NotStarted,
@@ -853,10 +850,6 @@ pub(crate) enum ShutdownMode {
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
/// the call to [`Timeline::shutdown`].
FreezeAndFlush,
/// Only flush the layers to the remote storage without freezing any open layers. This is the
/// mode used by ancestor detach and any other operations that reloads a tenant but not increasing
/// the generation number.
Flush,
/// Shut down immediately, without waiting for any open layers to flush.
Hard,
}
@@ -1570,16 +1563,12 @@ impl Timeline {
///
/// This is neccessary but not sufficient for offloading of the timeline as it might have
/// child timelines that are not offloaded yet.
pub(crate) fn can_offload(&self) -> (bool, &'static str) {
pub(crate) fn can_offload(&self) -> bool {
if self.remote_client.is_archived() != Some(true) {
return (false, "the timeline is not archived");
}
if !self.remote_client.no_pending_work() {
// if the remote client is still processing some work, we can't offload
return (false, "the upload queue is not drained yet");
return false;
}
(true, "ok")
true
}
/// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending
@@ -1687,6 +1676,11 @@ impl Timeline {
pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
debug_assert_current_span_has_tenant_and_timeline_id();
let try_freeze_and_flush = match mode {
ShutdownMode::FreezeAndFlush => true,
ShutdownMode::Hard => false,
};
// Regardless of whether we're going to try_freeze_and_flush
// or not, stop ingesting any more data. Walreceiver only provides
// cancellation but no "wait until gone", because it uses the Timeline::gate.
@@ -1708,7 +1702,7 @@ impl Timeline {
// ... and inform any waiters for newer LSNs that there won't be any.
self.last_record_lsn.shutdown();
if let ShutdownMode::FreezeAndFlush = mode {
if try_freeze_and_flush {
if let Some((open, frozen)) = self
.layers
.read()
@@ -1750,20 +1744,6 @@ impl Timeline {
warn!("failed to freeze and flush: {e:#}");
}
}
// `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but
// we also do a final check here to ensure that the queue is empty.
if !self.remote_client.no_pending_work() {
warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
}
}
if let ShutdownMode::Flush = mode {
// drain the upload queue
self.remote_client.shutdown().await;
if !self.remote_client.no_pending_work() {
warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
}
}
// Signal any subscribers to our cancellation token to drop out
@@ -3506,37 +3486,18 @@ impl Timeline {
let timer = self.metrics.flush_time_histo.start_timer();
let num_frozen_layers;
let frozen_layer_total_size;
let layer_to_flush = {
let guard = self.layers.read().await;
let Ok(lm) = guard.layer_map() else {
info!("dropping out of flush loop for timeline shutdown");
return;
};
num_frozen_layers = lm.frozen_layers.len();
frozen_layer_total_size = lm
.frozen_layers
.iter()
.map(|l| l.estimated_in_mem_size())
.sum::<u64>();
lm.frozen_layers.front().cloned()
// drop 'layers' lock to allow concurrent reads and writes
};
let Some(layer_to_flush) = layer_to_flush else {
break Ok(());
};
if num_frozen_layers
> std::cmp::max(
self.get_compaction_threshold(),
DEFAULT_COMPACTION_THRESHOLD,
)
&& frozen_layer_total_size >= /* 128 MB */ 128000000
{
tracing::warn!(
"too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes",
);
}
match self.flush_frozen_layer(layer_to_flush, ctx).await {
Ok(this_layer_to_lsn) => {
flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn);
@@ -4127,7 +4088,6 @@ impl Timeline {
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
// Metadata keys image layer creation.
let mut reconstruct_state = ValuesReconstructState::default();
let begin = Instant::now();
let data = self
.get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
.await?;
@@ -4144,11 +4104,14 @@ impl Timeline {
(new_data, total_kb_retrieved / 1024, total_keys_retrieved)
};
let delta_files_accessed = reconstruct_state.get_delta_layers_visited();
let elapsed = begin.elapsed();
let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
info!(
"metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", elapsed.as_secs_f64()
debug!(
trigger_generation,
delta_files_accessed,
total_kb_retrieved,
total_keys_retrieved,
"generate metadata images"
);
if !trigger_generation && mode == ImageLayerCreationMode::Try {
@@ -4512,23 +4475,11 @@ impl Drop for Timeline {
pub(crate) enum CompactionError {
#[error("The timeline or pageserver is shutting down")]
ShuttingDown,
/// Compaction tried to offload a timeline and failed
#[error("Failed to offload timeline: {0}")]
Offload(OffloadError),
/// Compaction cannot be done right now; page reconstruction and so on.
#[error(transparent)]
Other(anyhow::Error),
}
impl From<OffloadError> for CompactionError {
fn from(e: OffloadError) -> Self {
match e {
OffloadError::Cancelled => Self::ShuttingDown,
_ => Self::Offload(e),
}
}
}
impl CompactionError {
pub fn is_cancelled(&self) -> bool {
matches!(self, CompactionError::ShuttingDown)
@@ -5772,22 +5723,23 @@ impl<'a> TimelineWriter<'a> {
/// Put a batch of keys at the specified Lsns.
pub(crate) async fn put_batch(
&mut self,
batch: SerializedValueBatch,
batch: Vec<(CompactKey, Lsn, usize, Value)>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
if batch.is_empty() {
return Ok(());
}
let batch_max_lsn = batch.max_lsn;
let buf_size: u64 = batch.buffer_size() as u64;
let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch)?;
let batch_max_lsn = serialized_batch.max_lsn;
let buf_size: u64 = serialized_batch.raw.len() as u64;
let action = self.get_open_layer_action(batch_max_lsn, buf_size);
let layer = self
.handle_open_layer_action(batch_max_lsn, action, ctx)
.await?;
let res = layer.put_batch(batch, ctx).await;
let res = layer.put_batch(serialized_batch, ctx).await;
if res.is_ok() {
// Update the current size only when the entire write was ok.
@@ -5822,14 +5774,11 @@ impl<'a> TimelineWriter<'a> {
);
}
let val_ser_size = value.serialized_size().unwrap() as usize;
let batch = SerializedValueBatch::from_values(vec![(
key.to_compact(),
lsn,
val_ser_size,
value.clone(),
)]);
self.put_batch(batch, ctx).await
self.put_batch(
vec![(key.to_compact(), lsn, val_ser_size, value.clone())],
ctx,
)
.await
}
pub(crate) async fn delete_batch(

View File

@@ -18,7 +18,6 @@ use crate::{
CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant,
TimelineOrOffloaded,
},
virtual_file::MaybeFatalIo,
};
use super::{Timeline, TimelineResources};
@@ -63,10 +62,10 @@ pub(super) async fn delete_local_timeline_directory(
conf: &PageServerConf,
tenant_shard_id: TenantShardId,
timeline: &Timeline,
) {
) -> anyhow::Result<()> {
// Always ensure the lock order is compaction -> gc.
let compaction_lock = timeline.compaction_lock.lock();
let _compaction_lock = crate::timed(
let compaction_lock = crate::timed(
compaction_lock,
"acquires compaction lock",
std::time::Duration::from_secs(5),
@@ -74,7 +73,7 @@ pub(super) async fn delete_local_timeline_directory(
.await;
let gc_lock = timeline.gc_lock.lock();
let _gc_lock = crate::timed(
let gc_lock = crate::timed(
gc_lock,
"acquires gc lock",
std::time::Duration::from_secs(5),
@@ -86,15 +85,24 @@ pub(super) async fn delete_local_timeline_directory(
let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);
fail::fail_point!("timeline-delete-before-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
});
// NB: This need not be atomic because the deleted flag in the IndexPart
// will be observed during tenant/timeline load. The deletion will be resumed there.
//
// ErrorKind::NotFound can happen e.g. if we race with tenant detach, because,
// Note that here we do not bail out on std::io::ErrorKind::NotFound.
// This can happen if we're called a second time, e.g.,
// because of a previous failure/cancellation at/after
// failpoint timeline-delete-after-rm.
//
// ErrorKind::NotFound can also happen if we race with tenant detach, because,
// no locks are shared.
tokio::fs::remove_dir_all(local_timeline_directory)
.await
.or_else(fs_ext::ignore_not_found)
.fatal_err("removing timeline directory");
.context("remove local timeline directory")?;
// Make sure previous deletions are ordered before mark removal.
// Otherwise there is no guarantee that they reach the disk before mark deletion.
@@ -105,9 +113,17 @@ pub(super) async fn delete_local_timeline_directory(
let timeline_path = conf.timelines_path(&tenant_shard_id);
crashsafe::fsync_async(timeline_path)
.await
.fatal_err("fsync after removing timeline directory");
.context("fsync_pre_mark_remove")?;
info!("finished deleting layer files, releasing locks");
drop(gc_lock);
drop(compaction_lock);
fail::fail_point!("timeline-delete-after-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
});
Ok(())
}
/// Removes remote layers and an index file after them.
@@ -424,20 +440,12 @@ impl DeleteTimelineFlow {
timeline: &TimelineOrOffloaded,
remote_client: Arc<RemoteTimelineClient>,
) -> Result<(), DeleteTimelineError> {
fail::fail_point!("timeline-delete-before-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
});
// Offloaded timelines have no local state
// TODO: once we persist offloaded information, delete the timeline from there, too
if let TimelineOrOffloaded::Timeline(timeline) = timeline {
delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await;
delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?;
}
fail::fail_point!("timeline-delete-after-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
});
delete_remote_layers_and_index(&remote_client).await?;
pausable_failpoint!("in_progress_delete");

View File

@@ -12,7 +12,7 @@ use crate::{
virtual_file::{MaybeFatalIo, VirtualFile},
};
use anyhow::Context;
use pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity};
use pageserver_api::models::detach_ancestor::AncestorDetached;
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
@@ -376,14 +376,8 @@ pub(super) async fn prepare(
tasks.spawn(
async move {
let _permit = limiter.acquire().await;
let owned = remote_copy(
&adopted,
&timeline,
timeline.generation,
timeline.shard_identity,
&timeline.cancel,
)
.await?;
let owned =
remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?;
tracing::info!(layer=%owned, "remote copied");
Ok(owned)
}
@@ -635,7 +629,6 @@ async fn remote_copy(
adopted: &Layer,
adoptee: &Arc<Timeline>,
generation: Generation,
shard_identity: ShardIdentity,
cancel: &CancellationToken,
) -> Result<Layer, Error> {
// depending if Layer::keep_resident we could hardlink
@@ -643,7 +636,6 @@ async fn remote_copy(
let mut metadata = adopted.metadata();
debug_assert!(metadata.generation <= generation);
metadata.generation = generation;
metadata.shard = shard_identity.shard_index();
let owned = crate::tenant::storage_layer::Layer::for_evicted(
adoptee.conf,

View File

@@ -3,40 +3,18 @@ use std::sync::Arc;
use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
use super::Timeline;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};
#[derive(thiserror::Error, Debug)]
pub(crate) enum OffloadError {
#[error("Cancelled")]
Cancelled,
#[error("Timeline is not archived")]
NotArchived,
#[error(transparent)]
RemoteStorage(anyhow::Error),
#[error("Unexpected offload error: {0}")]
Other(anyhow::Error),
}
impl From<TenantManifestError> for OffloadError {
fn from(e: TenantManifestError) -> Self {
match e {
TenantManifestError::Cancelled => Self::Cancelled,
TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e),
}
}
}
use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded};
pub(crate) async fn offload_timeline(
tenant: &Tenant,
timeline: &Arc<Timeline>,
) -> Result<(), OffloadError> {
) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_and_timeline_id();
tracing::info!("offloading archived timeline");
let allow_offloaded_children = true;
let (timeline, guard) =
DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children)
.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children)?;
let TimelineOrOffloaded::Timeline(timeline) = timeline else {
tracing::error!("timeline already offloaded, but given timeline object");
@@ -47,26 +25,28 @@ pub(crate) async fn offload_timeline(
match is_archived {
Some(true) => (),
Some(false) => {
tracing::warn!("tried offloading a non-archived timeline");
return Err(OffloadError::NotArchived);
tracing::warn!(?is_archived, "tried offloading a non-archived timeline");
anyhow::bail!("timeline isn't archived");
}
None => {
// This is legal: calls to this function can race with the timeline shutting down
tracing::info!("tried offloading a timeline whose remote storage is not initialized");
return Err(OffloadError::Cancelled);
tracing::warn!(
?is_archived,
"tried offloading a timeline where manifest is not yet available"
);
anyhow::bail!("timeline manifest hasn't been loaded yet");
}
}
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
timeline.shutdown(super::ShutdownMode::Flush).await;
timeline.shutdown(super::ShutdownMode::Hard).await;
// TODO extend guard mechanism above with method
// to make deletions possible while offloading is in progress
let conf = &tenant.conf;
delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await;
delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await?;
remove_timeline_from_tenant(tenant, &timeline, &guard);
remove_timeline_from_tenant(tenant, &timeline, &guard).await?;
{
let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
@@ -85,18 +65,21 @@ pub(crate) async fn offload_timeline(
// at the next restart attach it again.
// For that to happen, we'd need to make the manifest reflect our *intended* state,
// not our actual state of offloaded timelines.
tenant.store_tenant_manifest().await?;
tenant
.store_tenant_manifest()
.await
.map_err(|e| anyhow::anyhow!(e))?;
Ok(())
}
/// It is important that this gets called when DeletionGuard is being held.
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
fn remove_timeline_from_tenant(
async fn remove_timeline_from_tenant(
tenant: &Tenant,
timeline: &Timeline,
_: &DeletionGuard, // using it as a witness
) {
) -> anyhow::Result<()> {
// Remove the timeline from the map.
let mut timelines = tenant.timelines.lock().unwrap();
let children_exist = timelines
@@ -112,4 +95,8 @@ fn remove_timeline_from_tenant(
timelines
.remove(&timeline.timeline_id)
.expect("timeline that we were deleting was concurrently removed from 'timelines' map");
drop(timelines);
Ok(())
}

View File

@@ -141,9 +141,7 @@ impl Drop for UninitializedTimeline<'_> {
fn drop(&mut self) {
if let Some((_, create_guard)) = self.raw_timeline.take() {
let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered();
// This is unusual, but can happen harmlessly if the pageserver is stopped while
// creating a timeline.
info!("Timeline got dropped without initializing, cleaning its files");
error!("Timeline got dropped without initializing, cleaning its files");
cleanup_timeline_directory(create_guard);
}
}

View File

@@ -331,11 +331,11 @@ pub(super) async fn handle_walreceiver_connection(
Ok(())
}
while let Some((record_end_lsn, recdata)) = waldecoder.poll_decode()? {
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hitting a deadlock.
if !record_end_lsn.is_aligned() {
if !lsn.is_aligned() {
return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
}
@@ -343,7 +343,7 @@ pub(super) async fn handle_walreceiver_connection(
let interpreted = InterpretedWalRecord::from_bytes_filtered(
recdata,
modification.tline.get_shard_identity(),
record_end_lsn,
lsn,
modification.tline.pg_version,
)?;
@@ -366,11 +366,9 @@ pub(super) async fn handle_walreceiver_connection(
let ingested = walingest
.ingest_record(interpreted, &mut modification, &ctx)
.await
.with_context(|| {
format!("could not ingest record at {record_end_lsn}")
})?;
.with_context(|| format!("could not ingest record at {lsn}"))?;
if !ingested {
tracing::debug!("ingest: filtered out record @ LSN {record_end_lsn}");
tracing::debug!("ingest: filtered out record @ LSN {lsn}");
WAL_INGEST.records_filtered.inc();
filtered_records += 1;
}
@@ -380,7 +378,7 @@ pub(super) async fn handle_walreceiver_connection(
// to timeout the tests.
fail_point!("walreceiver-after-ingest");
last_rec_lsn = record_end_lsn;
last_rec_lsn = lsn;
// Commit every ingest_batch_size records. Even if we filtered out
// all records, we still need to call commit to advance the LSN.

View File

@@ -28,13 +28,14 @@ use std::time::Duration;
use std::time::Instant;
use std::time::SystemTime;
use pageserver_api::key::Key;
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::fsm_logical_to_physical;
use postgres_ffi::walrecord::*;
use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
use wal_decoder::models::*;
use anyhow::{bail, Result};
use anyhow::{bail, Context, Result};
use bytes::{Buf, Bytes};
use tracing::*;
use utils::failpoint_support;
@@ -50,6 +51,7 @@ use crate::ZERO_PAGE;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::value::Value;
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::TransactionId;
@@ -154,12 +156,12 @@ impl WalIngest {
WAL_INGEST.records_received.inc();
let prev_len = modification.len();
modification.set_lsn(interpreted.end_lsn)?;
modification.set_lsn(interpreted.lsn)?;
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) {
// Records of this type should always be preceded by a commit(), as they
// rely on reading data pages back from the Timeline.
assert!(!modification.has_dirty_data());
assert!(!modification.has_dirty_data_pages());
}
assert!(!self.checkpoint_modified);
@@ -273,9 +275,28 @@ impl WalIngest {
}
}
modification
.ingest_batch(interpreted.batch, &self.shard, ctx)
.await?;
// Iterate through all the key value pairs provided in the interpreted block
// and update the modification currently in-flight to include them.
for (compact_key, maybe_value) in interpreted.blocks.into_iter() {
let (rel, blk) = Key::from_compact(compact_key).to_rel_block()?;
match maybe_value {
Some(Value::Image(img)) => {
self.put_rel_page_image(modification, rel, blk, img, ctx)
.await?;
}
Some(Value::WalRecord(rec)) => {
self.put_rel_wal_record(modification, rel, blk, rec, ctx)
.await?;
}
None => {
// Shard 0 tracks relation sizes. We will observe
// its blkno in case it implicitly extends a relation.
assert!(self.shard.is_shard_zero());
self.observe_decoded_block(modification, rel, blk, ctx)
.await?;
}
}
}
// If checkpoint data was updated, store the new version in the repository
if self.checkpoint_modified {
@@ -289,6 +310,8 @@ impl WalIngest {
// until commit() is called to flush the data into the repository and update
// the latest LSN.
modification.on_record_end();
Ok(modification.len() > prev_len)
}
@@ -311,6 +334,17 @@ impl WalIngest {
Ok((epoch as u64) << 32 | xid as u64)
}
/// Do not store this block, but observe it for the purposes of updating our relation size state.
async fn observe_decoded_block(
&mut self,
modification: &mut DatadirModification<'_>,
rel: RelTag,
blkno: BlockNumber,
ctx: &RequestContext,
) -> Result<(), PageReconstructError> {
self.handle_rel_extend(modification, rel, blkno, ctx).await
}
async fn ingest_clear_vm_bits(
&mut self,
clear_vm_bits: ClearVmBits,
@@ -1214,7 +1248,6 @@ impl WalIngest {
Ok(())
}
#[cfg(test)]
async fn put_rel_page_image(
&mut self,
modification: &mut DatadirModification<'_>,
@@ -1264,7 +1297,36 @@ impl WalIngest {
let new_nblocks = blknum + 1;
// Check if the relation exists. We implicitly create relations on first
// record.
let old_nblocks = modification.create_relation_if_required(rel, ctx).await?;
// TODO: would be nice if to be more explicit about it
// Get current size and put rel creation if rel doesn't exist
//
// NOTE: we check the cache first even though get_rel_exists and get_rel_size would
// check the cache too. This is because eagerly checking the cache results in
// less work overall and 10% better performance. It's more work on cache miss
// but cache miss is rare.
let old_nblocks = if let Some(nblocks) = modification
.tline
.get_cached_rel_size(&rel, modification.get_lsn())
{
nblocks
} else if !modification
.tline
.get_rel_exists(rel, Version::Modified(modification), ctx)
.await?
{
// create it with 0 size initially, the logic below will extend it
modification
.put_rel_creation(rel, 0, ctx)
.await
.context("Relation Error")?;
0
} else {
modification
.tline
.get_rel_size(rel, Version::Modified(modification), ctx)
.await?
};
if new_nblocks > old_nblocks {
//info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
@@ -1491,21 +1553,25 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
.await?;
m.on_record_end();
m.commit(&ctx).await?;
let mut m = tline.begin_modification(Lsn(0x30));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx)
.await?;
m.on_record_end();
m.commit(&ctx).await?;
let mut m = tline.begin_modification(Lsn(0x40));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx)
.await?;
m.on_record_end();
m.commit(&ctx).await?;
let mut m = tline.begin_modification(Lsn(0x50));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx)
.await?;
m.on_record_end();
m.commit(&ctx).await?;
assert_current_logical_size(&tline, Lsn(0x50));
@@ -1647,6 +1713,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx)
.await?;
m.on_record_end();
m.commit(&ctx).await?;
assert_eq!(
tline
@@ -1672,6 +1739,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx)
.await?;
m.on_record_end();
m.commit(&ctx).await?;
assert_eq!(
tline

View File

@@ -67,10 +67,7 @@ pub(crate) fn apply_in_neon(
let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
map[map_byte as usize] &= !(flags << map_offset);
// The page should never be empty, but we're checking it anyway as a precaution, so that if it is empty for some reason anyway, we don't make matters worse by setting the LSN on it.
if !postgres_ffi::page_is_new(page) {
postgres_ffi::page_set_lsn(page, lsn);
}
postgres_ffi::page_set_lsn(page, lsn);
}
// Repeat for 'old_heap_blkno', if any
@@ -84,10 +81,7 @@ pub(crate) fn apply_in_neon(
let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
map[map_byte as usize] &= !(flags << map_offset);
// The page should never be empty, but we're checking it anyway as a precaution, so that if it is empty for some reason anyway, we don't make matters worse by setting the LSN on it.
if !postgres_ffi::page_is_new(page) {
postgres_ffi::page_set_lsn(page, lsn);
}
postgres_ffi::page_set_lsn(page, lsn);
}
}
// Non-relational WAL records are handled here, with custom code that has the

View File

@@ -611,17 +611,6 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
recptr = startptr;
nbytes = count;
/* Try to read directly from WAL buffers first. */
#if PG_MAJORVERSION_NUM >= 17
{
Size rbytes;
rbytes = WALReadFromBuffers(p, recptr, nbytes, tli);
recptr += rbytes;
nbytes -= rbytes;
p += rbytes;
}
#endif
while (nbytes > 0)
{
uint32 startoff;

View File

@@ -1361,35 +1361,29 @@ SendAppendRequests(Safekeeper *sk)
if (sk->active_state == SS_ACTIVE_READ_WAL)
{
char *errmsg;
int req_len;
req = &sk->appendRequest;
req_len = req->endLsn - req->beginLsn;
/* We send zero sized AppenRequests as heartbeats; don't wal_read for these. */
if (req_len > 0)
switch (wp->api.wal_read(sk,
&sk->outbuf.data[sk->outbuf.len],
req->beginLsn,
req->endLsn - req->beginLsn,
&errmsg))
{
switch (wp->api.wal_read(sk,
&sk->outbuf.data[sk->outbuf.len],
req->beginLsn,
req_len,
&errmsg))
{
case NEON_WALREAD_SUCCESS:
break;
case NEON_WALREAD_WOULDBLOCK:
return true;
case NEON_WALREAD_ERROR:
wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
sk->host, sk->port, errmsg);
ShutdownConnection(sk);
return false;
default:
Assert(false);
}
case NEON_WALREAD_SUCCESS:
break;
case NEON_WALREAD_WOULDBLOCK:
return true;
case NEON_WALREAD_ERROR:
wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
sk->host, sk->port, errmsg);
ShutdownConnection(sk);
return false;
default:
Assert(false);
}
sk->outbuf.len += req_len;
sk->outbuf.len += req->endLsn - req->beginLsn;
writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len);

View File

@@ -1489,11 +1489,33 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count,
{
NeonWALReadResult res;
res = NeonWALRead(sk->xlogreader,
buf,
startptr,
count,
walprop_pg_get_timeline_id());
#if PG_MAJORVERSION_NUM >= 17
if (!sk->wp->config->syncSafekeepers)
{
Size rbytes;
rbytes = WALReadFromBuffers(buf, startptr, count,
walprop_pg_get_timeline_id());
startptr += rbytes;
count -= rbytes;
}
#endif
if (count == 0)
{
res = NEON_WALREAD_SUCCESS;
}
else
{
Assert(count > 0);
/* Now read the remaining WAL from the WAL file */
res = NeonWALRead(sk->xlogreader,
buf,
startptr,
count,
walprop_pg_get_timeline_id());
}
if (res == NEON_WALREAD_SUCCESS)
{

155
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
[[package]]
name = "aiohappyeyeballs"
@@ -2106,78 +2106,83 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
[[package]]
name = "psycopg2-binary"
version = "2.9.10"
version = "2.9.9"
description = "psycopg2 - Python-PostgreSQL Database Adapter"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.7"
files = [
{file = "psycopg2-binary-2.9.10.tar.gz", hash = "sha256:4b3df0e6990aa98acda57d983942eff13d824135fe2250e6522edaa782a06de2"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:0ea8e3d0ae83564f2fc554955d327fa081d065c8ca5cc6d2abb643e2c9c1200f"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:3e9c76f0ac6f92ecfc79516a8034a544926430f7b080ec5a0537bca389ee0906"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ad26b467a405c798aaa1458ba09d7e2b6e5f96b1ce0ac15d82fd9f95dc38a92"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:270934a475a0e4b6925b5f804e3809dd5f90f8613621d062848dd82f9cd62007"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:48b338f08d93e7be4ab2b5f1dbe69dc5e9ef07170fe1f86514422076d9c010d0"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f4152f8f76d2023aac16285576a9ecd2b11a9895373a1f10fd9db54b3ff06b4"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:32581b3020c72d7a421009ee1c6bf4a131ef5f0a968fab2e2de0c9d2bb4577f1"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:2ce3e21dc3437b1d960521eca599d57408a695a0d3c26797ea0f72e834c7ffe5"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e984839e75e0b60cfe75e351db53d6db750b00de45644c5d1f7ee5d1f34a1ce5"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c4745a90b78e51d9ba06e2088a2fe0c693ae19cc8cb051ccda44e8df8a6eb53"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-win32.whl", hash = "sha256:e5720a5d25e3b99cd0dc5c8a440570469ff82659bb09431c1439b92caf184d3b"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-win_amd64.whl", hash = "sha256:3c18f74eb4386bf35e92ab2354a12c17e5eb4d9798e4c0ad3a00783eae7cd9f1"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:04392983d0bb89a8717772a193cfaac58871321e3ec69514e1c4e0d4957b5aff"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:1a6784f0ce3fec4edc64e985865c17778514325074adf5ad8f80636cd029ef7c"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5f86c56eeb91dc3135b3fd8a95dc7ae14c538a2f3ad77a19645cf55bab1799c"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b3d2491d4d78b6b14f76881905c7a8a8abcf974aad4a8a0b065273a0ed7a2cb"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2286791ececda3a723d1910441c793be44625d86d1a4e79942751197f4d30341"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:512d29bb12608891e349af6a0cccedce51677725a921c07dba6342beaf576f9a"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5a507320c58903967ef7384355a4da7ff3f28132d679aeb23572753cbf2ec10b"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6d4fa1079cab9018f4d0bd2db307beaa612b0d13ba73b5c6304b9fe2fb441ff7"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:851485a42dbb0bdc1edcdabdb8557c09c9655dfa2ca0460ff210522e073e319e"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:35958ec9e46432d9076286dda67942ed6d968b9c3a6a2fd62b48939d1d78bf68"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-win32.whl", hash = "sha256:ecced182e935529727401b24d76634a357c71c9275b356efafd8a2a91ec07392"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:ee0e8c683a7ff25d23b55b11161c2663d4b099770f6085ff0a20d4505778d6b4"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:880845dfe1f85d9d5f7c412efea7a08946a46894537e4e5d091732eb1d34d9a0"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9440fa522a79356aaa482aa4ba500b65f28e5d0e63b801abf6aa152a29bd842a"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3923c1d9870c49a2d44f795df0c889a22380d36ef92440ff618ec315757e539"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b2c956c028ea5de47ff3a8d6b3cc3330ab45cf0b7c3da35a2d6ff8420896526"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f758ed67cab30b9a8d2833609513ce4d3bd027641673d4ebc9c067e4d208eec1"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cd9b4f2cfab88ed4a9106192de509464b75a906462fb846b936eabe45c2063e"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dc08420625b5a20b53551c50deae6e231e6371194fa0651dbe0fb206452ae1f"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:d7cd730dfa7c36dbe8724426bf5612798734bff2d3c3857f36f2733f5bfc7c00"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:155e69561d54d02b3c3209545fb08938e27889ff5a10c19de8d23eb5a41be8a5"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c3cc28a6fd5a4a26224007712e79b81dbaee2ffb90ff406256158ec4d7b52b47"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-win32.whl", hash = "sha256:ec8a77f521a17506a24a5f626cb2aee7850f9b69a0afe704586f63a464f3cd64"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:18c5ee682b9c6dd3696dad6e54cc7ff3a1a9020df6a5c0f861ef8bfd338c3ca0"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:26540d4a9a4e2b096f1ff9cce51253d0504dca5a85872c7f7be23be5a53eb18d"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:e217ce4d37667df0bc1c397fdcd8de5e81018ef305aed9415c3b093faaeb10fb"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:245159e7ab20a71d989da00f280ca57da7641fa2cdcf71749c193cea540a74f7"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c4ded1a24b20021ebe677b7b08ad10bf09aac197d6943bfe6fec70ac4e4690d"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3abb691ff9e57d4a93355f60d4f4c1dd2d68326c968e7db17ea96df3c023ef73"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8608c078134f0b3cbd9f89b34bd60a943b23fd33cc5f065e8d5f840061bd0673"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:230eeae2d71594103cd5b93fd29d1ace6420d0b86f4778739cb1a5a32f607d1f"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:056470c3dc57904bbf63d6f534988bafc4e970ffd50f6271fc4ee7daad9498a5"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73aa0e31fa4bb82578f3a6c74a73c273367727de397a7a0f07bd83cbea696baa"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:8de718c0e1c4b982a54b41779667242bc630b2197948405b7bd8ce16bcecac92"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:5c370b1e4975df846b0277b4deba86419ca77dbc25047f535b0bb03d1a544d44"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:ffe8ed017e4ed70f68b7b371d84b7d4a790368db9203dfc2d222febd3a9c8863"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:8aecc5e80c63f7459a1a2ab2c64df952051df196294d9f739933a9f6687e86b3"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:7a813c8bdbaaaab1f078014b9b0b13f5de757e2b5d9be6403639b298a04d218b"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d00924255d7fc916ef66e4bf22f354a940c67179ad3fd7067d7a0a9c84d2fbfc"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7559bce4b505762d737172556a4e6ea8a9998ecac1e39b5233465093e8cee697"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e8b58f0a96e7a1e341fc894f62c1177a7c83febebb5ff9123b579418fdc8a481"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b269105e59ac96aba877c1707c600ae55711d9dcd3fc4b5012e4af68e30c648"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:79625966e176dc97ddabc142351e0409e28acf4660b88d1cf6adb876d20c490d"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:8aabf1c1a04584c168984ac678a668094d831f152859d06e055288fa515e4d30"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:19721ac03892001ee8fdd11507e6a2e01f4e37014def96379411ca99d78aeb2c"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7f5d859928e635fa3ce3477704acee0f667b3a3d3e4bb109f2b18d4005f38287"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-win32.whl", hash = "sha256:3216ccf953b3f267691c90c6fe742e45d890d8272326b4a8b20850a03d05b7b8"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:30e34c4e97964805f715206c7b789d54a78b70f3ff19fbe590104b71c45600e5"},
{file = "psycopg2-binary-2.9.9.tar.gz", hash = "sha256:7f01846810177d829c7692f1f5ada8096762d9172af1b1a28d4ab5b77c923c1c"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c2470da5418b76232f02a2fcd2229537bb2d5a7096674ce61859c3229f2eb202"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6af2a6d4b7ee9615cbb162b0738f6e1fd1f5c3eda7e5da17861eacf4c717ea7"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75723c3c0fbbf34350b46a3199eb50638ab22a0228f93fb472ef4d9becc2382b"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83791a65b51ad6ee6cf0845634859d69a038ea9b03d7b26e703f94c7e93dbcf9"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ef4854e82c09e84cc63084a9e4ccd6d9b154f1dbdd283efb92ecd0b5e2b8c84"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed1184ab8f113e8d660ce49a56390ca181f2981066acc27cf637d5c1e10ce46e"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d2997c458c690ec2bc6b0b7ecbafd02b029b7b4283078d3b32a852a7ce3ddd98"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b58b4710c7f4161b5e9dcbe73bb7c62d65670a87df7bcce9e1faaad43e715245"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0c009475ee389757e6e34611d75f6e4f05f0cf5ebb76c6037508318e1a1e0d7e"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8dbf6d1bc73f1d04ec1734bae3b4fb0ee3cb2a493d35ede9badbeb901fb40f6f"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-win32.whl", hash = "sha256:3f78fd71c4f43a13d342be74ebbc0666fe1f555b8837eb113cb7416856c79682"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-win_amd64.whl", hash = "sha256:876801744b0dee379e4e3c38b76fc89f88834bb15bf92ee07d94acd06ec890a0"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ee825e70b1a209475622f7f7b776785bd68f34af6e7a46e2e42f27b659b5bc26"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1ea665f8ce695bcc37a90ee52de7a7980be5161375d42a0b6c6abedbf0d81f0f"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:143072318f793f53819048fdfe30c321890af0c3ec7cb1dfc9cc87aa88241de2"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c332c8d69fb64979ebf76613c66b985414927a40f8defa16cf1bc028b7b0a7b0"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7fc5a5acafb7d6ccca13bfa8c90f8c51f13d8fb87d95656d3950f0158d3ce53"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977646e05232579d2e7b9c59e21dbe5261f403a88417f6a6512e70d3f8a046be"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b6356793b84728d9d50ead16ab43c187673831e9d4019013f1402c41b1db9b27"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bc7bb56d04601d443f24094e9e31ae6deec9ccb23581f75343feebaf30423359"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:77853062a2c45be16fd6b8d6de2a99278ee1d985a7bd8b103e97e41c034006d2"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:78151aa3ec21dccd5cdef6c74c3e73386dcdfaf19bced944169697d7ac7482fc"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e6f98446430fdf41bd36d4faa6cb409f5140c1c2cf58ce0bbdaf16af7d3f119"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c77e3d1862452565875eb31bdb45ac62502feabbd53429fdc39a1cc341d681ba"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8359bf4791968c5a78c56103702000105501adb557f3cf772b2c207284273984"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275ff571376626195ab95a746e6a04c7df8ea34638b99fc11160de91f2fef503"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f9b5571d33660d5009a8b3c25dc1db560206e2d2f89d3df1cb32d72c0d117d52"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:420f9bbf47a02616e8554e825208cb947969451978dceb77f95ad09c37791dae"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:4154ad09dac630a0f13f37b583eae260c6aa885d67dfbccb5b02c33f31a6d420"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a148c5d507bb9b4f2030a2025c545fccb0e1ef317393eaba42e7eabd28eb6041"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:68fc1f1ba168724771e38bee37d940d2865cb0f562380a1fb1ffb428b75cb692"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-win_amd64.whl", hash = "sha256:281309265596e388ef483250db3640e5f414168c5a67e9c665cafce9492eda2f"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:60989127da422b74a04345096c10d416c2b41bd7bf2a380eb541059e4e999980"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:246b123cc54bb5361588acc54218c8c9fb73068bf227a4a531d8ed56fa3ca7d6"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34eccd14566f8fe14b2b95bb13b11572f7c7d5c36da61caf414d23b91fcc5d94"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18d0ef97766055fec15b5de2c06dd8e7654705ce3e5e5eed3b6651a1d2a9a152"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3f82c171b4ccd83bbaf35aa05e44e690113bd4f3b7b6cc54d2219b132f3ae55"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ead20f7913a9c1e894aebe47cccf9dc834e1618b7aa96155d2091a626e59c972"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ca49a8119c6cbd77375ae303b0cfd8c11f011abbbd64601167ecca18a87e7cdd"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:323ba25b92454adb36fa425dc5cf6f8f19f78948cbad2e7bc6cdf7b0d7982e59"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:1236ed0952fbd919c100bc839eaa4a39ebc397ed1c08a97fc45fee2a595aa1b3"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:729177eaf0aefca0994ce4cffe96ad3c75e377c7b6f4efa59ebf003b6d398716"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-win32.whl", hash = "sha256:804d99b24ad523a1fe18cc707bf741670332f7c7412e9d49cb5eab67e886b9b5"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-win_amd64.whl", hash = "sha256:a6cdcc3ede532f4a4b96000b6362099591ab4a3e913d70bcbac2b56c872446f7"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:72dffbd8b4194858d0941062a9766f8297e8868e1dd07a7b36212aaa90f49472"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:30dcc86377618a4c8f3b72418df92e77be4254d8f89f14b8e8f57d6d43603c0f"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31a34c508c003a4347d389a9e6fcc2307cc2150eb516462a7a17512130de109e"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15208be1c50b99203fe88d15695f22a5bed95ab3f84354c494bcb1d08557df67"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1873aade94b74715be2246321c8650cabf5a0d098a95bab81145ffffa4c13876"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a58c98a7e9c021f357348867f537017057c2ed7f77337fd914d0bedb35dace7"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4686818798f9194d03c9129a4d9a702d9e113a89cb03bffe08c6cf799e053291"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ebdc36bea43063116f0486869652cb2ed7032dbc59fbcb4445c4862b5c1ecf7f"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:ca08decd2697fdea0aea364b370b1249d47336aec935f87b8bbfd7da5b2ee9c1"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ac05fb791acf5e1a3e39402641827780fe44d27e72567a000412c648a85ba860"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-win32.whl", hash = "sha256:9dba73be7305b399924709b91682299794887cbbd88e38226ed9f6712eabee90"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-win_amd64.whl", hash = "sha256:f7ae5d65ccfbebdfa761585228eb4d0df3a8b15cfb53bd953e713e09fbb12957"},
]
[[package]]
@@ -3008,13 +3013,13 @@ files = [
[[package]]
name = "types-psycopg2"
version = "2.9.21.20241019"
version = "2.9.21.10"
description = "Typing stubs for psycopg2"
optional = false
python-versions = ">=3.8"
python-versions = "*"
files = [
{file = "types-psycopg2-2.9.21.20241019.tar.gz", hash = "sha256:bca89b988d2ebd19bcd08b177d22a877ea8b841decb10ed130afcf39404612fa"},
{file = "types_psycopg2-2.9.21.20241019-py3-none-any.whl", hash = "sha256:44d091e67732d16a941baae48cd7b53bf91911bc36888652447cf1ef0c1fb3f6"},
{file = "types-psycopg2-2.9.21.10.tar.gz", hash = "sha256:c2600892312ae1c34e12f145749795d93dc4eac3ef7dbf8a9c1bfd45385e80d7"},
{file = "types_psycopg2-2.9.21.10-py3-none-any.whl", hash = "sha256:918224a0731a3650832e46633e720703b5beef7693a064e777d9748654fcf5e5"},
]
[[package]]
@@ -3484,4 +3489,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "c656496f9fbb7c29b2df3143c1d72c95b5e121cb6340134c0b8d070f54a08508"
content-hash = "13bfc7479aacfe051abb92252b8ddc2e0c429f4607b2d9d8c4b353d2f75c1927"

View File

@@ -23,7 +23,7 @@ bstr.workspace = true
bytes = { workspace = true, features = ["serde"] }
camino.workspace = true
chrono.workspace = true
clap = { workspace = true, features = ["derive", "env"] }
clap.workspace = true
compute_api.workspace = true
consumption_metrics.workspace = true
dashmap.workspace = true
@@ -60,7 +60,7 @@ prometheus.workspace = true
rand.workspace = true
regex.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
reqwest = { workspace = true, features = ["rustls-tls-native-roots"] }
reqwest.workspace = true
reqwest-middleware = { workspace = true, features = ["json"] }
reqwest-retry.workspace = true
reqwest-tracing.workspace = true
@@ -74,8 +74,6 @@ sha2 = { workspace = true, features = ["asm", "oid"] }
smol_str.workspace = true
smallvec.workspace = true
socket2.workspace = true
strum.workspace = true
strum_macros.workspace = true
subtle.workspace = true
thiserror.workspace = true
tikv-jemallocator.workspace = true
@@ -98,7 +96,6 @@ rustls-native-certs.workspace = true
x509-parser.workspace = true
postgres-protocol.workspace = true
redis.workspace = true
zerocopy.workspace = true
# jwt stuff
jose-jwa = "0.1.2"

View File

@@ -51,7 +51,7 @@ pub(super) async fn authenticate(
sasl::Outcome::Success(key) => key,
sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::password_failed(&*creds.user));
return Err(auth::AuthError::auth_failed(&*creds.user));
}
};

View File

@@ -9,14 +9,15 @@ use super::ComputeCredentialKeys;
use crate::cache::Cached;
use crate::config::AuthenticationConfig;
use crate::context::RequestMonitoring;
use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
use crate::control_plane::provider::NodeInfo;
use crate::control_plane::{self, CachedNodeInfo};
use crate::error::{ReportableError, UserFacingError};
use crate::proxy::connect_compute::ComputeConnectBackend;
use crate::stream::PqStream;
use crate::{auth, compute, waiters};
#[derive(Debug, Error)]
pub(crate) enum ConsoleRedirectError {
pub(crate) enum WebAuthError {
#[error(transparent)]
WaiterRegister(#[from] waiters::RegisterError),
@@ -32,13 +33,13 @@ pub struct ConsoleRedirectBackend {
console_uri: reqwest::Url,
}
impl UserFacingError for ConsoleRedirectError {
impl UserFacingError for WebAuthError {
fn to_string_client(&self) -> String {
"Internal error".to_string()
}
}
impl ReportableError for ConsoleRedirectError {
impl ReportableError for WebAuthError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
Self::WaiterRegister(_) => crate::error::ErrorKind::Service,
@@ -103,7 +104,7 @@ async fn authenticate(
link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<NodeInfo> {
ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect);
ctx.set_auth_method(crate::context::AuthMethod::Web);
// registering waiter can fail if we get unlucky with rng.
// just try again.
@@ -116,7 +117,7 @@ async fn authenticate(
}
};
let span = info_span!("console_redirect", psql_session_id = &psql_session_id);
let span = info_span!("web", psql_session_id = &psql_session_id);
let greeting = hello_message(link_uri, &psql_session_id);
// Give user a URL to spawn a new database.
@@ -127,16 +128,14 @@ async fn authenticate(
.write_message(&Be::NoticeResponse(&greeting))
.await?;
// Wait for console response via control plane (see `mgmt`).
// Wait for web console response (see `mgmt`).
info!(parent: &span, "waiting for console's reply...");
let db_info = tokio::time::timeout(auth_config.console_redirect_confirmation_timeout, waiter)
let db_info = tokio::time::timeout(auth_config.webauth_confirmation_timeout, waiter)
.await
.map_err(|_elapsed| {
auth::AuthError::confirmation_timeout(
auth_config.console_redirect_confirmation_timeout.into(),
)
auth::AuthError::confirmation_timeout(auth_config.webauth_confirmation_timeout.into())
})?
.map_err(ConsoleRedirectError::from)?;
.map_err(WebAuthError::from)?;
if auth_config.ip_allowlist_check_enabled {
if let Some(allowed_ips) = &db_info.allowed_ips {

View File

@@ -46,7 +46,7 @@ pub(crate) async fn authenticate_cleartext(
sasl::Outcome::Success(key) => key,
sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::password_failed(&*info.user));
return Err(auth::AuthError::auth_failed(&*info.user));
}
};

View File

@@ -7,11 +7,8 @@ use arc_swap::ArcSwapOption;
use dashmap::DashMap;
use jose_jwk::crypto::KeyInfo;
use reqwest::{redirect, Client};
use reqwest_retry::policies::ExponentialBackoff;
use reqwest_retry::RetryTransientMiddleware;
use serde::de::Visitor;
use serde::{Deserialize, Deserializer};
use serde_json::value::RawValue;
use signature::Verifier;
use thiserror::Error;
use tokio::time::Instant;
@@ -19,7 +16,7 @@ use tokio::time::Instant;
use crate::auth::backend::ComputeCredentialKeys;
use crate::context::RequestMonitoring;
use crate::control_plane::errors::GetEndpointJwksError;
use crate::http::read_body_with_limit;
use crate::http::parse_json_body_with_limit;
use crate::intern::RoleNameInt;
use crate::types::{EndpointId, RoleName};
@@ -31,10 +28,6 @@ const MAX_RENEW: Duration = Duration::from_secs(3600);
const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
const JWKS_USER_AGENT: &str = "neon-proxy";
const JWKS_CONNECT_TIMEOUT: Duration = Duration::from_secs(2);
const JWKS_FETCH_TIMEOUT: Duration = Duration::from_secs(5);
const JWKS_FETCH_RETRIES: u32 = 3;
/// How to get the JWT auth rules
pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
fn fetch_auth_rules(
@@ -62,7 +55,7 @@ pub(crate) struct AuthRule {
}
pub struct JwkCache {
client: reqwest_middleware::ClientWithMiddleware,
client: reqwest::Client,
map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
}
@@ -124,14 +117,6 @@ impl Default for JwkCacheEntryLock {
}
}
#[derive(Deserialize)]
struct JwkSet<'a> {
/// we parse into raw-value because not all keys in a JWKS are ones
/// we can parse directly, so we parse them lazily.
#[serde(borrow)]
keys: Vec<&'a RawValue>,
}
impl JwkCacheEntryLock {
async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
JwkRenewalPermit::acquire_permit(self).await
@@ -145,7 +130,7 @@ impl JwkCacheEntryLock {
&self,
_permit: JwkRenewalPermit<'_>,
ctx: &RequestMonitoring,
client: &reqwest_middleware::ClientWithMiddleware,
client: &reqwest::Client,
endpoint: EndpointId,
auth_rules: &F,
) -> Result<Arc<JwkCacheEntry>, JwtError> {
@@ -169,73 +154,22 @@ impl JwkCacheEntryLock {
let req = client.get(rule.jwks_url.clone());
// TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
// TODO(conrad): We need to filter out URLs that point to local resources. Public internet only.
match req.send().await.and_then(|r| {
r.error_for_status()
.map_err(reqwest_middleware::Error::Reqwest)
}) {
match req.send().await.and_then(|r| r.error_for_status()) {
// todo: should we re-insert JWKs if we want to keep this JWKs URL?
// I expect these failures would be quite sparse.
Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"),
Ok(r) => {
let resp: http::Response<reqwest::Body> = r.into();
let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE)
.await
match parse_json_body_with_limit::<jose_jwk::JwkSet>(
resp.into_body(),
MAX_JWK_BODY_SIZE,
)
.await
{
Ok(bytes) => bytes,
Err(e) => {
tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
continue;
}
};
match serde_json::from_slice::<JwkSet>(&bytes) {
Err(e) => {
tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
}
Ok(jwks) => {
// size_of::<&RawValue>() == 16
// size_of::<jose_jwk::Jwk>() == 288
// better to not pre-allocate this as it might be pretty large - especially if it has many
// keys we don't want or need.
// trivial 'attack': `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}`
// this would consume 8MiB just like that!
let mut keys = vec![];
let mut failed = 0;
for key in jwks.keys {
match serde_json::from_str::<jose_jwk::Jwk>(key.get()) {
Ok(key) => {
// if `use` (called `cls` in rust) is specified to be something other than signing,
// we can skip storing it.
if key
.prm
.cls
.as_ref()
.is_some_and(|c| *c != jose_jwk::Class::Signing)
{
continue;
}
keys.push(key);
}
Err(e) => {
tracing::debug!(url=?rule.jwks_url, failed=?e, "could not decode JWK");
failed += 1;
}
}
}
keys.shrink_to_fit();
if failed > 0 {
tracing::warn!(url=?rule.jwks_url, failed, "could not decode JWKs");
}
if keys.is_empty() {
tracing::warn!(url=?rule.jwks_url, "no valid JWKs found inside the response body");
continue;
}
let jwks = jose_jwk::JwkSet { keys };
key_sets.insert(
rule.id,
KeySet {
@@ -245,7 +179,7 @@ impl JwkCacheEntryLock {
},
);
}
};
}
}
}
}
@@ -262,7 +196,7 @@ impl JwkCacheEntryLock {
async fn get_or_update_jwk_cache<F: FetchAuthRules>(
self: &Arc<Self>,
ctx: &RequestMonitoring,
client: &reqwest_middleware::ClientWithMiddleware,
client: &reqwest::Client,
endpoint: EndpointId,
fetch: &F,
) -> Result<Arc<JwkCacheEntry>, JwtError> {
@@ -316,7 +250,7 @@ impl JwkCacheEntryLock {
self: &Arc<Self>,
ctx: &RequestMonitoring,
jwt: &str,
client: &reqwest_middleware::ClientWithMiddleware,
client: &reqwest::Client,
endpoint: EndpointId,
role_name: &RoleName,
fetch: &F,
@@ -435,19 +369,8 @@ impl Default for JwkCache {
let client = Client::builder()
.user_agent(JWKS_USER_AGENT)
.redirect(redirect::Policy::none())
.tls_built_in_native_certs(true)
.connect_timeout(JWKS_CONNECT_TIMEOUT)
.timeout(JWKS_FETCH_TIMEOUT)
.build()
.expect("client config should be valid");
// Retry up to 3 times with increasing intervals between attempts.
let retry_policy = ExponentialBackoff::builder().build_with_max_retries(JWKS_FETCH_RETRIES);
let client = reqwest_middleware::ClientBuilder::new(client)
.with(RetryTransientMiddleware::new_with_policy(retry_policy))
.build();
.expect("using &str and standard redirect::Policy");
JwkCache {
client,
map: DashMap::default(),
@@ -1286,63 +1209,4 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
}
}
}
#[tokio::test]
async fn check_jwk_keycloak_regression() {
let (rs, valid_jwk) = new_rsa_jwk(RS1, "rs1".into());
let valid_jwk = serde_json::to_value(valid_jwk).unwrap();
// This is valid, but we cannot parse it as we have no support for encryption JWKs, only signature based ones.
// This is taken directly from keycloak.
let invalid_jwk = serde_json::json! {
{
"kid": "U-Jc9xRli84eNqRpYQoIPF-GNuRWV3ZvAIhziRW2sbQ",
"kty": "RSA",
"alg": "RSA-OAEP",
"use": "enc",
"n": "yypYWsEKmM_wWdcPnSGLSm5ytw1WG7P7EVkKSulcDRlrM6HWj3PR68YS8LySYM2D9Z-79oAdZGKhIfzutqL8rK1vS14zDuPpAM-RWY3JuQfm1O_-1DZM8-07PmVRegP5KPxsKblLf_My8ByH6sUOIa1p2rbe2q_b0dSTXYu1t0dW-cGL5VShc400YymvTwpc-5uYNsaVxZajnB7JP1OunOiuCJ48AuVp3PqsLzgoXqlXEB1ZZdch3xT3bxaTtNruGvG4xmLZY68O_T3yrwTCNH2h_jFdGPyXdyZToCMSMK2qSbytlfwfN55pT9Vv42Lz1YmoB7XRjI9aExKPc5AxFw",
"e": "AQAB",
"x5c": [
"MIICmzCCAYMCBgGS41E6azANBgkqhkiG9w0BAQsFADARMQ8wDQYDVQQDDAZtYXN0ZXIwHhcNMjQxMDMxMTYwMTQ0WhcNMzQxMDMxMTYwMzI0WjARMQ8wDQYDVQQDDAZtYXN0ZXIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDLKlhawQqYz/BZ1w+dIYtKbnK3DVYbs/sRWQpK6VwNGWszodaPc9HrxhLwvJJgzYP1n7v2gB1kYqEh/O62ovysrW9LXjMO4+kAz5FZjcm5B+bU7/7UNkzz7Ts+ZVF6A/ko/GwpuUt/8zLwHIfqxQ4hrWnatt7ar9vR1JNdi7W3R1b5wYvlVKFzjTRjKa9PClz7m5g2xpXFlqOcHsk/U66c6K4InjwC5Wnc+qwvOCheqVcQHVll1yHfFPdvFpO02u4a8bjGYtljrw79PfKvBMI0faH+MV0Y/Jd3JlOgIxIwrapJvK2V/B83nmlP1W/jYvPViagHtdGMj1oTEo9zkDEXAgMBAAEwDQYJKoZIhvcNAQELBQADggEBAECYX59+Q9v6c9sb6Q0/C6IgLWG2nVCgVE1YWwIzz+68WrhlmNCRuPjY94roB+tc2tdHbj+Nh3LMzJk7L1KCQoW1+LPK6A6E8W9ad0YPcuw8csV2pUA3+H56exQMH0fUAPQAU7tXWvnQ7otcpV1XA8afn/NTMTsnxi9mSkor8MLMYQ3aeRyh1+LAchHBthWiltqsSUqXrbJF59u5p0ghquuKcWR3TXsA7klGYBgGU5KAJifr9XT87rN0bOkGvbeWAgKvnQnjZwxdnLqTfp/pRY/PiJJHhgIBYPIA7STGnMPjmJ995i34zhnbnd8WHXJA3LxrIMqLW/l8eIdvtM1w8KI="
],
"x5t": "QhfzMMnuAfkReTgZ1HtrfyOeeZs",
"x5t#S256": "cmHDUdKgLiRCEN28D5FBy9IJLFmR7QWfm77SLhGTCTU"
}
};
let jwks = serde_json::json! {{ "keys": [invalid_jwk, valid_jwk ] }};
let jwks_addr = jwks_server(move |path| match path {
"/" => Some(serde_json::to_vec(&jwks).unwrap()),
_ => None,
})
.await;
let role_name = RoleName::from("anonymous");
let role = RoleNameInt::from(&role_name);
let rules = vec![AuthRule {
id: "foo".to_owned(),
jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
audience: None,
role_names: vec![role],
}];
let fetch = Fetch(rules);
let jwk_cache = JwkCache::default();
let endpoint = EndpointId::from("ep");
let token = new_rsa_jwt("rs1".into(), rs);
jwk_cache
.check_jwt(
&RequestMonitoring::test(),
endpoint.clone(),
&role_name,
&fetch,
&token,
)
.await
.unwrap();
}
}

View File

@@ -9,7 +9,7 @@ use std::sync::Arc;
use std::time::Duration;
pub use console_redirect::ConsoleRedirectBackend;
pub(crate) use console_redirect::ConsoleRedirectError;
pub(crate) use console_redirect::WebAuthError;
use ipnet::{Ipv4Net, Ipv6Net};
use local::LocalBackend;
use tokio::io::{AsyncRead, AsyncWrite};
@@ -21,11 +21,11 @@ use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserIn
use crate::cache::Cached;
use crate::config::AuthenticationConfig;
use crate::context::RequestMonitoring;
use crate::control_plane::client::ControlPlaneClient;
use crate::control_plane::errors::GetAuthInfoError;
use crate::control_plane::{
self, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi,
use crate::control_plane::provider::{
CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneBackend,
};
use crate::control_plane::{self, Api, AuthSecret};
use crate::intern::EndpointIdInt;
use crate::metrics::Metrics;
use crate::proxy::connect_compute::ComputeConnectBackend;
@@ -62,26 +62,42 @@ impl<T> std::ops::Deref for MaybeOwned<'_, T> {
/// backends which require them for the authentication process.
pub enum Backend<'a, T> {
/// Cloud API (V2).
ControlPlane(MaybeOwned<'a, ControlPlaneClient>, T),
ControlPlane(MaybeOwned<'a, ControlPlaneBackend>, T),
/// Local proxy uses configured auth credentials and does not wake compute
Local(MaybeOwned<'a, LocalBackend>),
}
#[cfg(test)]
pub(crate) trait TestBackend: Send + Sync + 'static {
fn wake_compute(&self) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError>;
fn get_allowed_ips_and_secret(
&self,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), control_plane::errors::GetAuthInfoError>;
fn dyn_clone(&self) -> Box<dyn TestBackend>;
}
#[cfg(test)]
impl Clone for Box<dyn TestBackend> {
fn clone(&self) -> Self {
TestBackend::dyn_clone(&**self)
}
}
impl std::fmt::Display for Backend<'_, ()> {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::ControlPlane(api, ()) => match &**api {
ControlPlaneClient::Neon(endpoint) => fmt
.debug_tuple("ControlPlane::Neon")
ControlPlaneBackend::Management(endpoint) => fmt
.debug_tuple("ControlPlane::Management")
.field(&endpoint.url())
.finish(),
#[cfg(any(test, feature = "testing"))]
ControlPlaneClient::PostgresMock(endpoint) => fmt
ControlPlaneBackend::PostgresMock(endpoint) => fmt
.debug_tuple("ControlPlane::PostgresMock")
.field(&endpoint.url())
.finish(),
#[cfg(test)]
ControlPlaneClient::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(),
ControlPlaneBackend::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(),
},
Self::Local(_) => fmt.debug_tuple("Local").finish(),
}
@@ -266,7 +282,7 @@ impl AuthenticationConfig {
/// All authentication flows will emit an AuthenticationOk message if successful.
async fn auth_quirks(
ctx: &RequestMonitoring,
api: &impl control_plane::ControlPlaneApi,
api: &impl control_plane::Api,
user_info: ComputeUserInfoMaybeEndpoint,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
@@ -333,7 +349,7 @@ async fn auth_quirks(
{
Ok(keys) => Ok(keys),
Err(e) => {
if e.is_password_failed() {
if e.is_auth_failed() {
// The password could have been changed, so we invalidate the cache.
cached_entry.invalidate();
}
@@ -360,7 +376,7 @@ async fn authenticate_with_secret(
crate::sasl::Outcome::Success(key) => key,
crate::sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::password_failed(&*info.user));
return Err(auth::AuthError::auth_failed(&*info.user));
}
};
@@ -483,12 +499,12 @@ mod tests {
use std::time::Duration;
use bytes::BytesMut;
use control_plane::AuthSecret;
use fallible_iterator::FallibleIterator;
use once_cell::sync::Lazy;
use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
use postgres_protocol::message::backend::Message as PgMessage;
use postgres_protocol::message::frontend;
use provider::AuthSecret;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt};
use super::jwt::JwkCache;
@@ -497,7 +513,8 @@ mod tests {
use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern};
use crate::config::AuthenticationConfig;
use crate::context::RequestMonitoring;
use crate::control_plane::{self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret};
use crate::control_plane::provider::{self, CachedAllowedIps, CachedRoleSecret};
use crate::control_plane::{self, CachedNodeInfo};
use crate::proxy::NeonOptions;
use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo};
use crate::scram::threadpool::ThreadPool;
@@ -509,7 +526,7 @@ mod tests {
secret: AuthSecret,
}
impl control_plane::ControlPlaneApi for Auth {
impl control_plane::Api for Auth {
async fn get_role_secret(
&self,
_ctx: &RequestMonitoring,
@@ -560,7 +577,7 @@ mod tests {
ip_allowlist_check_enabled: true,
is_auth_broker: false,
accept_jwts: false,
console_redirect_confirmation_timeout: std::time::Duration::from_secs(5),
webauth_confirmation_timeout: std::time::Duration::from_secs(5),
});
async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {

View File

@@ -21,7 +21,6 @@ pub(crate) use flow::*;
use thiserror::Error;
use tokio::time::error::Elapsed;
use crate::auth::backend::jwt::JwtError;
use crate::control_plane;
use crate::error::{ReportableError, UserFacingError};
@@ -32,7 +31,7 @@ pub(crate) type Result<T> = std::result::Result<T, AuthError>;
#[derive(Debug, Error)]
pub(crate) enum AuthError {
#[error(transparent)]
ConsoleRedirect(#[from] backend::ConsoleRedirectError),
Web(#[from] backend::WebAuthError),
#[error(transparent)]
GetAuthInfo(#[from] control_plane::errors::GetAuthInfoError),
@@ -56,7 +55,7 @@ pub(crate) enum AuthError {
MissingEndpointName,
#[error("password authentication failed for user '{0}'")]
PasswordFailed(Box<str>),
AuthFailed(Box<str>),
/// Errors produced by e.g. [`crate::stream::PqStream`].
#[error(transparent)]
@@ -77,9 +76,6 @@ pub(crate) enum AuthError {
#[error("Disconnected due to inactivity after {0}.")]
ConfirmationTimeout(humantime::Duration),
#[error(transparent)]
Jwt(#[from] JwtError),
}
impl AuthError {
@@ -87,8 +83,8 @@ impl AuthError {
AuthError::BadAuthMethod(name.into())
}
pub(crate) fn password_failed(user: impl Into<Box<str>>) -> Self {
AuthError::PasswordFailed(user.into())
pub(crate) fn auth_failed(user: impl Into<Box<str>>) -> Self {
AuthError::AuthFailed(user.into())
}
pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self {
@@ -99,8 +95,8 @@ impl AuthError {
AuthError::TooManyConnections
}
pub(crate) fn is_password_failed(&self) -> bool {
matches!(self, AuthError::PasswordFailed(_))
pub(crate) fn is_auth_failed(&self) -> bool {
matches!(self, AuthError::AuthFailed(_))
}
pub(crate) fn user_timeout(elapsed: Elapsed) -> Self {
@@ -115,10 +111,10 @@ impl AuthError {
impl UserFacingError for AuthError {
fn to_string_client(&self) -> String {
match self {
Self::ConsoleRedirect(e) => e.to_string_client(),
Self::Web(e) => e.to_string_client(),
Self::GetAuthInfo(e) => e.to_string_client(),
Self::Sasl(e) => e.to_string_client(),
Self::PasswordFailed(_) => self.to_string(),
Self::AuthFailed(_) => self.to_string(),
Self::BadAuthMethod(_) => self.to_string(),
Self::MalformedPassword(_) => self.to_string(),
Self::MissingEndpointName => self.to_string(),
@@ -127,7 +123,6 @@ impl UserFacingError for AuthError {
Self::TooManyConnections => self.to_string(),
Self::UserTimeout(_) => self.to_string(),
Self::ConfirmationTimeout(_) => self.to_string(),
Self::Jwt(_) => self.to_string(),
}
}
}
@@ -135,10 +130,10 @@ impl UserFacingError for AuthError {
impl ReportableError for AuthError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
Self::ConsoleRedirect(e) => e.get_error_kind(),
Self::Web(e) => e.get_error_kind(),
Self::GetAuthInfo(e) => e.get_error_kind(),
Self::Sasl(e) => e.get_error_kind(),
Self::PasswordFailed(_) => crate::error::ErrorKind::User,
Self::AuthFailed(_) => crate::error::ErrorKind::User,
Self::BadAuthMethod(_) => crate::error::ErrorKind::User,
Self::MalformedPassword(_) => crate::error::ErrorKind::User,
Self::MissingEndpointName => crate::error::ErrorKind::User,
@@ -147,7 +142,6 @@ impl ReportableError for AuthError {
Self::TooManyConnections => crate::error::ErrorKind::RateLimit,
Self::UserTimeout(_) => crate::error::ErrorKind::User,
Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User,
Self::Jwt(_) => crate::error::ErrorKind::User,
}
}
}

View File

@@ -281,7 +281,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
ip_allowlist_check_enabled: true,
is_auth_broker: false,
accept_jwts: true,
console_redirect_confirmation_timeout: Duration::ZERO,
webauth_confirmation_timeout: Duration::ZERO,
},
proxy_protocol_v2: config::ProxyProtocolV2::Rejected,
handshake_timeout: Duration::from_secs(10),

View File

@@ -13,10 +13,9 @@ use itertools::Itertools;
use proxy::config::TlsServerEndPoint;
use proxy::context::RequestMonitoring;
use proxy::metrics::{Metrics, ThreadPoolMetrics};
use proxy::protocol2::ConnectionInfo;
use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource};
use proxy::stream::{PqStream, Stream};
use rustls::crypto::ring;
use rustls::crypto::aws_lc_rs;
use rustls::pki_types::PrivateKeyDer;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio::net::TcpListener;
@@ -106,13 +105,14 @@ async fn main() -> anyhow::Result<()> {
let first_cert = cert_chain.first().context("missing certificate")?;
let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
let tls_config =
rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider()))
.with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])
.context("ring should support TLS1.2 and TLS1.3")?
.with_no_client_auth()
.with_single_cert(cert_chain, key)?
.into();
let tls_config = rustls::ServerConfig::builder_with_provider(Arc::new(
aws_lc_rs::default_provider(),
))
.with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])
.context("aws_lc_rs should support TLS1.2 and TLS1.3")?
.with_no_client_auth()
.with_single_cert(cert_chain, key)?
.into();
(tls_config, tls_server_end_point)
}
@@ -179,10 +179,7 @@ async fn task_main(
info!(%peer_addr, "serving");
let ctx = RequestMonitoring::new(
session_id,
ConnectionInfo {
addr: peer_addr,
extra: None,
},
peer_addr.ip(),
proxy::metrics::Protocol::SniRouter,
"sni",
);

View File

@@ -51,11 +51,11 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
#[derive(Clone, Debug, ValueEnum)]
enum AuthBackendType {
#[value(name("console"), alias("cplane"))]
ControlPlane,
#[value(name("link"), alias("control-redirect"))]
ConsoleRedirect,
Console,
// clap only shows the name, not the alias, in usage text.
// TODO: swap name/alias and deprecate "link"
#[value(name("link"), alias("web"))]
Web,
#[cfg(feature = "testing")]
Postgres,
@@ -71,7 +71,7 @@ struct ProxyCliArgs {
/// listen for incoming client connections on ip:port
#[clap(short, long, default_value = "127.0.0.1:4432")]
proxy: String,
#[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)]
#[clap(value_enum, long, default_value_t = AuthBackendType::Web)]
auth_backend: AuthBackendType,
/// listen for management callback connection on ip:port
#[clap(short, long, default_value = "127.0.0.1:7000")]
@@ -82,7 +82,7 @@ struct ProxyCliArgs {
/// listen for incoming wss connections on ip:port
#[clap(long)]
wss: Option<String>,
/// redirect unauthenticated users to the given uri in case of console redirect auth
/// redirect unauthenticated users to the given uri in case of web auth
#[clap(short, long, default_value = "http://localhost:3000/psql_session/")]
uri: String,
/// cloud API endpoint for authenticating users
@@ -92,14 +92,6 @@ struct ProxyCliArgs {
default_value = "http://localhost:3000/authenticate_proxy_request/"
)]
auth_endpoint: String,
/// JWT used to connect to control plane.
#[clap(
long,
value_name = "JWT",
default_value = "",
env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN"
)]
control_plane_token: Arc<str>,
/// if this is not local proxy, this toggles whether we accept jwt or passwords for http
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
is_auth_broker: bool,
@@ -231,7 +223,6 @@ struct ProxyCliArgs {
proxy_protocol_v2: ProxyProtocolV2,
/// Time the proxy waits for the webauth session to be confirmed by the control plane.
// TODO: rename to `console_redirect_confirmation_timeout`.
#[clap(long, default_value = "2m", value_parser = humantime::parse_duration)]
webauth_confirmation_timeout: std::time::Duration,
}
@@ -522,7 +513,7 @@ async fn main() -> anyhow::Result<()> {
}
if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend {
if let proxy::control_plane::client::ControlPlaneClient::Neon(api) = &**api {
if let proxy::control_plane::provider::ControlPlaneBackend::Management(api) = &**api {
match (redis_notifications_client, regional_redis_client.clone()) {
(None, None) => {}
(client1, client2) => {
@@ -668,7 +659,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
ip_allowlist_check_enabled: !args.is_private_access_proxy,
is_auth_broker: args.is_auth_broker,
accept_jwts: args.is_auth_broker,
console_redirect_confirmation_timeout: args.webauth_confirmation_timeout,
webauth_confirmation_timeout: args.webauth_confirmation_timeout,
};
let config = ProxyConfig {
@@ -699,7 +690,7 @@ fn build_auth_backend(
args: &ProxyCliArgs,
) -> anyhow::Result<Either<&'static auth::Backend<'static, ()>, &'static ConsoleRedirectBackend>> {
match &args.auth_backend {
AuthBackendType::ControlPlane => {
AuthBackendType::Console => {
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
let project_info_cache_config: ProjectInfoCacheOptions =
args.project_info_cache.parse()?;
@@ -741,14 +732,13 @@ fn build_auth_backend(
RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
let wake_compute_endpoint_rate_limiter =
Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
let api = control_plane::client::neon::NeonControlPlaneClient::new(
let api = control_plane::provider::neon::Api::new(
endpoint,
args.control_plane_token.clone(),
caches,
locks,
wake_compute_endpoint_rate_limiter,
);
let api = control_plane::client::ControlPlaneClient::Neon(api);
let api = control_plane::provider::ControlPlaneBackend::Management(api);
let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
let config = Box::leak(Box::new(auth_backend));
@@ -759,11 +749,8 @@ fn build_auth_backend(
#[cfg(feature = "testing")]
AuthBackendType::Postgres => {
let url = args.auth_endpoint.parse()?;
let api = control_plane::client::mock::MockControlPlane::new(
url,
!args.is_private_access_proxy,
);
let api = control_plane::client::ControlPlaneClient::PostgresMock(api);
let api = control_plane::provider::mock::Api::new(url, !args.is_private_access_proxy);
let api = control_plane::provider::ControlPlaneBackend::PostgresMock(api);
let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
@@ -772,7 +759,7 @@ fn build_auth_backend(
Ok(Either::Left(config))
}
AuthBackendType::ConsoleRedirect => {
AuthBackendType::Web => {
let url = args.uri.parse()?;
let backend = ConsoleRedirectBackend::new(url);

View File

@@ -1,12 +1,13 @@
use std::convert::Infallible;
use std::future::pending;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex};
use std::sync::Arc;
use std::time::Duration;
use dashmap::DashSet;
use redis::streams::{StreamReadOptions, StreamReadReply};
use redis::{AsyncCommands, FromRedisValue, Value};
use serde::Deserialize;
use tokio::sync::Mutex;
use tokio_util::sync::CancellationToken;
use tracing::info;
@@ -18,38 +19,23 @@ use crate::rate_limiter::GlobalRateLimiter;
use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
use crate::types::EndpointId;
// TODO: this could be an enum, but events in Redis need to be fixed first.
// ProjectCreated was sent with type:branch_created. So we ignore type.
#[derive(Deserialize, Debug, Clone, PartialEq)]
struct ControlPlaneEvent {
#[derive(Deserialize, Debug, Clone)]
pub(crate) struct ControlPlaneEventKey {
endpoint_created: Option<EndpointCreated>,
branch_created: Option<BranchCreated>,
project_created: Option<ProjectCreated>,
#[serde(rename = "type")]
_type: Option<String>,
}
#[derive(Deserialize, Debug, Clone, PartialEq)]
#[derive(Deserialize, Debug, Clone)]
struct EndpointCreated {
endpoint_id: EndpointIdInt,
endpoint_id: String,
}
#[derive(Deserialize, Debug, Clone, PartialEq)]
#[derive(Deserialize, Debug, Clone)]
struct BranchCreated {
branch_id: BranchIdInt,
branch_id: String,
}
#[derive(Deserialize, Debug, Clone, PartialEq)]
#[derive(Deserialize, Debug, Clone)]
struct ProjectCreated {
project_id: ProjectIdInt,
}
impl TryFrom<&Value> for ControlPlaneEvent {
type Error = anyhow::Error;
fn try_from(value: &Value) -> Result<Self, Self::Error> {
let json = String::from_redis_value(value)?;
Ok(serde_json::from_str(&json)?)
}
project_id: String,
}
pub struct EndpointsCache {
@@ -74,80 +60,60 @@ impl EndpointsCache {
ready: AtomicBool::new(false),
}
}
pub(crate) fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
pub(crate) async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
if !self.ready.load(Ordering::Acquire) {
// the endpoint cache is not yet fully initialised.
return true;
}
if !self.should_reject(endpoint) {
ctx.set_rejected(false);
let rejected = self.should_reject(endpoint);
ctx.set_rejected(rejected);
info!(?rejected, "check endpoint is valid, disabled cache");
// If cache is disabled, just collect the metrics and return or
// If the limiter allows, we don't need to check the cache.
if self.config.disable_cache || self.limiter.lock().await.check() {
return true;
}
// report that we might want to reject this endpoint
ctx.set_rejected(true);
// If cache is disabled, just collect the metrics and return.
if self.config.disable_cache {
return true;
}
// If the limiter allows, we can pretend like it's valid
// (incase it is, due to redis channel lag).
if self.limiter.lock().unwrap().check() {
return true;
}
// endpoint not found, and there's too much load.
false
!rejected
}
fn should_reject(&self, endpoint: &EndpointId) -> bool {
if endpoint.is_endpoint() {
let Some(endpoint) = EndpointIdInt::get(endpoint) else {
// if we haven't interned this endpoint, it's not in the cache.
return true;
};
!self.endpoints.contains(&endpoint)
!self.endpoints.contains(&EndpointIdInt::from(endpoint))
} else if endpoint.is_branch() {
let Some(branch) = BranchIdInt::get(endpoint) else {
// if we haven't interned this branch, it's not in the cache.
return true;
};
!self.branches.contains(&branch)
!self
.branches
.contains(&BranchIdInt::from(&endpoint.as_branch()))
} else {
let Some(project) = ProjectIdInt::get(endpoint) else {
// if we haven't interned this project, it's not in the cache.
return true;
};
!self.projects.contains(&project)
!self
.projects
.contains(&ProjectIdInt::from(&endpoint.as_project()))
}
}
fn insert_event(&self, event: ControlPlaneEvent) {
if let Some(endpoint_created) = event.endpoint_created {
self.endpoints.insert(endpoint_created.endpoint_id);
fn insert_event(&self, key: ControlPlaneEventKey) {
// Do not do normalization here, we expect the events to be normalized.
if let Some(endpoint_created) = key.endpoint_created {
self.endpoints
.insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into()));
Metrics::get()
.proxy
.redis_events_count
.inc(RedisEventsCount::EndpointCreated);
} else if let Some(branch_created) = event.branch_created {
self.branches.insert(branch_created.branch_id);
}
if let Some(branch_created) = key.branch_created {
self.branches
.insert(BranchIdInt::from(&branch_created.branch_id.into()));
Metrics::get()
.proxy
.redis_events_count
.inc(RedisEventsCount::BranchCreated);
} else if let Some(project_created) = event.project_created {
self.projects.insert(project_created.project_id);
}
if let Some(project_created) = key.project_created {
self.projects
.insert(ProjectIdInt::from(&project_created.project_id.into()));
Metrics::get()
.proxy
.redis_events_count
.inc(RedisEventsCount::ProjectCreated);
}
}
pub async fn do_read(
&self,
mut con: ConnectionWithCredentialsProvider,
@@ -165,13 +131,12 @@ impl EndpointsCache {
}
if cancellation_token.is_cancelled() {
info!("cancellation token is cancelled, exiting");
// Maintenance tasks run forever. Sleep forever when canceled.
pending::<()>().await;
tokio::time::sleep(Duration::from_secs(60 * 60 * 24 * 7)).await;
// 1 week.
}
tokio::time::sleep(self.config.retry_interval).await;
}
}
async fn read_from_stream(
&self,
con: &mut ConnectionWithCredentialsProvider,
@@ -197,7 +162,10 @@ impl EndpointsCache {
)
.await
}
fn parse_key_value(value: &Value) -> anyhow::Result<ControlPlaneEventKey> {
let s: String = FromRedisValue::from_redis_value(value)?;
Ok(serde_json::from_str(&s)?)
}
async fn batch_read(
&self,
conn: &mut ConnectionWithCredentialsProvider,
@@ -228,25 +196,27 @@ impl EndpointsCache {
anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name);
}
let key = res.keys.pop().expect("Checked length above");
let len = key.ids.len();
for stream_id in key.ids {
let res = res.keys.pop().expect("Checked length above");
let len = res.ids.len();
for x in res.ids {
total += 1;
for value in stream_id.map.values() {
match value.try_into() {
Ok(event) => self.insert_event(event),
Err(err) => {
for (_, v) in x.map {
let key = match Self::parse_key_value(&v) {
Ok(x) => x,
Err(e) => {
Metrics::get().proxy.redis_errors_total.inc(RedisErrors {
channel: &self.config.stream_name,
});
tracing::error!("error parsing value {value:?}: {err:?}");
tracing::error!("error parsing value {v:?}: {e:?}");
continue;
}
};
self.insert_event(key);
}
if total.is_power_of_two() {
tracing::debug!("endpoints read {}", total);
}
*last_id = stream_id.id;
*last_id = x.id;
}
if return_when_finish && len <= self.config.default_batch_size {
break;
@@ -259,24 +229,11 @@ impl EndpointsCache {
#[cfg(test)]
mod tests {
use super::*;
use super::ControlPlaneEventKey;
#[test]
fn test_parse_control_plane_event() {
let s = r#"{"branch_created":null,"endpoint_created":{"endpoint_id":"ep-rapid-thunder-w0qqw2q9"},"project_created":null,"type":"endpoint_created"}"#;
let endpoint_id: EndpointId = "ep-rapid-thunder-w0qqw2q9".into();
assert_eq!(
serde_json::from_str::<ControlPlaneEvent>(s).unwrap(),
ControlPlaneEvent {
endpoint_created: Some(EndpointCreated {
endpoint_id: endpoint_id.into(),
}),
branch_created: None,
project_created: None,
_type: Some("endpoint_created".into()),
}
);
fn test() {
let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}";
serde_json::from_str::<ControlPlaneEventKey>(s).unwrap();
}
}

View File

@@ -8,7 +8,7 @@ use itertools::Itertools;
use once_cell::sync::OnceCell;
use pq_proto::StartupMessageParams;
use rustls::client::danger::ServerCertVerifier;
use rustls::crypto::ring;
use rustls::crypto::aws_lc_rs;
use rustls::pki_types::InvalidDnsNameError;
use thiserror::Error;
use tokio::net::TcpStream;
@@ -19,9 +19,9 @@ use tracing::{error, info, warn};
use crate::auth::parse_endpoint_param;
use crate::cancellation::CancelClosure;
use crate::context::RequestMonitoring;
use crate::control_plane::client::ApiLockError;
use crate::control_plane::errors::WakeComputeError;
use crate::control_plane::messages::MetricsAuxInfo;
use crate::control_plane::provider::ApiLockError;
use crate::error::{ReportableError, UserFacingError};
use crate::metrics::{Metrics, NumDbConnectionsGuard};
use crate::proxy::neon_option;
@@ -135,13 +135,13 @@ impl ConnCfg {
/// Apply startup message params to the connection config.
pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) {
// Only set `user` if it's not present in the config.
// Console redirect auth flow takes username from the console's response.
// Web auth flow takes username from the console's response.
if let (None, Some(user)) = (self.get_user(), params.get("user")) {
self.user(user);
}
// Only set `dbname` if it's not present in the config.
// Console redirect auth flow takes dbname from the console's response.
// Web auth flow takes dbname from the console's response.
if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
self.dbname(dbname);
}
@@ -266,12 +266,12 @@ impl ConnCfg {
}
}
type RustlsStream = <MakeRustlsConnect as MakeTlsConnect<tokio::net::TcpStream>>::Stream;
pub(crate) struct PostgresConnection {
/// Socket connected to a compute node.
pub(crate) stream:
tokio_postgres::maybe_tls_stream::MaybeTlsStream<tokio::net::TcpStream, RustlsStream>,
pub(crate) stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<
tokio::net::TcpStream,
tokio_postgres_rustls::RustlsStream<tokio::net::TcpStream>,
>,
/// PostgreSQL connection parameters.
pub(crate) params: std::collections::HashMap<String, String>,
/// Query cancellation token.
@@ -298,9 +298,9 @@ impl ConnCfg {
let client_config = if allow_self_signed_compute {
// Allow all certificates for creating the connection
let verifier = Arc::new(AcceptEverythingVerifier);
rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider()))
rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
.with_safe_default_protocol_versions()
.expect("ring should support the default protocol versions")
.expect("aws_lc_rs should support the default protocol versions")
.dangerous()
.with_custom_certificate_verifier(verifier)
} else {
@@ -308,9 +308,9 @@ impl ConnCfg {
.get_or_try_init(load_certs)
.map_err(ConnectionError::TlsCertificateError)?
.clone();
rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider()))
rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
.with_safe_default_protocol_versions()
.expect("ring should support the default protocol versions")
.expect("aws_lc_rs should support the default protocol versions")
.with_root_certificates(root_store)
};
let client_config = client_config.with_no_client_auth();

View File

@@ -7,7 +7,7 @@ use anyhow::{bail, ensure, Context, Ok};
use clap::ValueEnum;
use itertools::Itertools;
use remote_storage::RemoteStorageConfig;
use rustls::crypto::ring::{self, sign};
use rustls::crypto::aws_lc_rs::{self, sign};
use rustls::pki_types::{CertificateDer, PrivateKeyDer};
use sha2::{Digest, Sha256};
use tracing::{error, info};
@@ -78,7 +78,7 @@ pub struct AuthenticationConfig {
pub jwks_cache: JwkCache,
pub is_auth_broker: bool,
pub accept_jwts: bool,
pub console_redirect_confirmation_timeout: tokio::time::Duration,
pub webauth_confirmation_timeout: tokio::time::Duration,
}
impl TlsConfig {
@@ -127,9 +127,9 @@ pub fn configure_tls(
// allow TLS 1.2 to be compatible with older client libraries
let mut config =
rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider()))
rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
.with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])
.context("ring should support TLS1.2 and TLS1.3")?
.context("aws_lc_rs should support TLS1.2 and TLS1.3")?
.with_no_client_auth()
.with_cert_resolver(cert_resolver.clone());
@@ -271,7 +271,7 @@ impl CertResolver {
// auth-broker does not use SNI and instead uses the Neon-Connection-String header.
// Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String.
//
// Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string
// Console Web proxy does not use any wildcard domains and does not need any certificate selection or conn string
// validation, so let's we can continue with any common-name
let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") {
s.to_string()
@@ -366,7 +366,7 @@ pub struct EndpointCacheConfig {
}
impl EndpointCacheConfig {
/// Default options for [`crate::control_plane::NodeInfoCache`].
/// Default options for [`crate::control_plane::provider::NodeInfoCache`].
/// Notice that by default the limiter is empty, which means that cache is disabled.
pub const CACHE_DEFAULT_OPTIONS: &'static str =
"initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s";
@@ -441,7 +441,7 @@ pub struct CacheOptions {
}
impl CacheOptions {
/// Default options for [`crate::control_plane::NodeInfoCache`].
/// Default options for [`crate::control_plane::provider::NodeInfoCache`].
pub const CACHE_DEFAULT_OPTIONS: &'static str = "size=4000,ttl=4m";
/// Parse cache options passed via cmdline.
@@ -497,7 +497,7 @@ pub struct ProjectInfoCacheOptions {
}
impl ProjectInfoCacheOptions {
/// Default options for [`crate::control_plane::NodeInfoCache`].
/// Default options for [`crate::control_plane::provider::NodeInfoCache`].
pub const CACHE_DEFAULT_OPTIONS: &'static str =
"size=10000,ttl=4m,max_roles=10,gc_interval=60m";
@@ -616,9 +616,9 @@ pub struct ConcurrencyLockOptions {
}
impl ConcurrencyLockOptions {
/// Default options for [`crate::control_plane::client::ApiLocks`].
/// Default options for [`crate::control_plane::provider::ApiLocks`].
pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0";
/// Default options for [`crate::control_plane::client::ApiLocks`].
/// Default options for [`crate::control_plane::provider::ApiLocks`].
pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str =
"shards=64,permits=100,epoch=10m,timeout=10ms";

View File

@@ -3,7 +3,7 @@ use std::sync::Arc;
use futures::TryFutureExt;
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, Instrument};
use tracing::{error, info, Instrument};
use crate::auth::backend::ConsoleRedirectBackend;
use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal};
@@ -11,7 +11,7 @@ use crate::config::{ProxyConfig, ProxyProtocolV2};
use crate::context::RequestMonitoring;
use crate::error::ReportableError;
use crate::metrics::{Metrics, NumClientConnectionsGuard};
use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo};
use crate::protocol2::read_proxy_protocol;
use crate::proxy::connect_compute::{connect_to_compute, TcpMechanism};
use crate::proxy::handshake::{handshake, HandshakeData};
use crate::proxy::passthrough::ProxyPassthrough;
@@ -49,7 +49,7 @@ pub async fn task_main(
let session_id = uuid::Uuid::new_v4();
let cancellation_handler = Arc::clone(&cancellation_handler);
debug!(protocol = "tcp", %session_id, "accepted new TCP connection");
tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection");
connections.spawn(async move {
let (socket, peer_addr) = match read_proxy_protocol(socket).await {
@@ -57,21 +57,16 @@ pub async fn task_main(
error!("per-client task finished with an error: {e:#}");
return;
}
// our load balancers will not send any more data. let's just exit immediately
Ok((_socket, ConnectHeader::Local)) => {
debug!("healthcheck received");
return;
}
Ok((_socket, ConnectHeader::Missing)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
error!("missing required proxy protocol header");
return;
}
Ok((_socket, ConnectHeader::Proxy(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
error!("proxy protocol header not supported");
return;
}
Ok((socket, ConnectHeader::Proxy(info))) => (socket, info),
Ok((socket, ConnectHeader::Missing)) => (socket, ConnectionInfo{ addr: peer_addr, extra: None }),
Ok((socket, Some(addr))) => (socket, addr.ip()),
Ok((socket, None)) => (socket, peer_addr.ip()),
};
match socket.inner.set_nodelay(true) {

View File

@@ -19,7 +19,6 @@ use crate::intern::{BranchIdInt, ProjectIdInt};
use crate::metrics::{
ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting,
};
use crate::protocol2::ConnectionInfo;
use crate::types::{DbName, EndpointId, RoleName};
pub mod parquet;
@@ -41,7 +40,7 @@ pub struct RequestMonitoring(
);
struct RequestMonitoringInner {
pub(crate) conn_info: ConnectionInfo,
pub(crate) peer_addr: IpAddr,
pub(crate) session_id: Uuid,
pub(crate) protocol: Protocol,
first_packet: chrono::DateTime<Utc>,
@@ -75,7 +74,7 @@ struct RequestMonitoringInner {
#[derive(Clone, Debug)]
pub(crate) enum AuthMethod {
// aka passwordless, fka link
ConsoleRedirect,
Web,
ScramSha256,
ScramSha256Plus,
Cleartext,
@@ -85,7 +84,7 @@ impl Clone for RequestMonitoring {
fn clone(&self) -> Self {
let inner = self.0.try_lock().expect("should not deadlock");
let new = RequestMonitoringInner {
conn_info: inner.conn_info.clone(),
peer_addr: inner.peer_addr,
session_id: inner.session_id,
protocol: inner.protocol,
first_packet: inner.first_packet,
@@ -118,7 +117,7 @@ impl Clone for RequestMonitoring {
impl RequestMonitoring {
pub fn new(
session_id: Uuid,
conn_info: ConnectionInfo,
peer_addr: IpAddr,
protocol: Protocol,
region: &'static str,
) -> Self {
@@ -126,13 +125,13 @@ impl RequestMonitoring {
"connect_request",
%protocol,
?session_id,
%conn_info,
%peer_addr,
ep = tracing::field::Empty,
role = tracing::field::Empty,
);
let inner = RequestMonitoringInner {
conn_info,
peer_addr,
session_id,
protocol,
first_packet: Utc::now(),
@@ -163,11 +162,7 @@ impl RequestMonitoring {
#[cfg(test)]
pub(crate) fn test() -> Self {
use std::net::SocketAddr;
let ip = IpAddr::from([127, 0, 0, 1]);
let addr = SocketAddr::new(ip, 5432);
let conn_info = ConnectionInfo { addr, extra: None };
RequestMonitoring::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test")
RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test")
}
pub(crate) fn console_application_name(&self) -> String {
@@ -291,12 +286,7 @@ impl RequestMonitoring {
}
pub(crate) fn peer_addr(&self) -> IpAddr {
self.0
.try_lock()
.expect("should not deadlock")
.conn_info
.addr
.ip()
self.0.try_lock().expect("should not deadlock").peer_addr
}
pub(crate) fn cold_start_info(&self) -> ColdStartInfo {
@@ -372,7 +362,7 @@ impl RequestMonitoringInner {
}
fn has_private_peer_addr(&self) -> bool {
match self.conn_info.addr.ip() {
match self.peer_addr {
IpAddr::V4(ip) => ip.is_private(),
IpAddr::V6(_) => false,
}

View File

@@ -121,7 +121,7 @@ impl From<&RequestMonitoringInner> for RequestData {
fn from(value: &RequestMonitoringInner) -> Self {
Self {
session_id: value.session_id,
peer_addr: value.conn_info.addr.ip().to_string(),
peer_addr: value.peer_addr.to_string(),
timestamp: value.first_packet.naive_utc(),
username: value.user.as_deref().map(String::from),
application_name: value.application.as_deref().map(String::from),
@@ -134,7 +134,7 @@ impl From<&RequestMonitoringInner> for RequestData {
.as_ref()
.and_then(|options| serde_json::to_string(&Options { options }).ok()),
auth_method: value.auth_method.as_ref().map(|x| match x {
super::AuthMethod::ConsoleRedirect => "console_redirect",
super::AuthMethod::Web => "web",
super::AuthMethod::ScramSha256 => "scram_sha_256",
super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus",
super::AuthMethod::Cleartext => "cleartext",

View File

@@ -1,281 +0,0 @@
#[cfg(any(test, feature = "testing"))]
pub mod mock;
pub mod neon;
use std::hash::Hash;
use std::sync::Arc;
use std::time::Duration;
use dashmap::DashMap;
use tokio::time::Instant;
use tracing::info;
use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError};
use crate::auth::backend::ComputeUserInfo;
use crate::cache::endpoints::EndpointsCache;
use crate::cache::project_info::ProjectInfoCacheImpl;
use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions};
use crate::context::RequestMonitoring;
use crate::control_plane::{
errors, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache,
};
use crate::error::ReportableError;
use crate::metrics::ApiLockMetrics;
use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token};
use crate::types::EndpointId;
#[non_exhaustive]
#[derive(Clone)]
pub enum ControlPlaneClient {
/// Current Management API (V2).
Neon(neon::NeonControlPlaneClient),
/// Local mock control plane.
#[cfg(any(test, feature = "testing"))]
PostgresMock(mock::MockControlPlane),
/// Internal testing
#[cfg(test)]
#[allow(private_interfaces)]
Test(Box<dyn TestControlPlaneClient>),
}
impl ControlPlaneApi for ControlPlaneClient {
async fn get_role_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
match self {
Self::Neon(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(test)]
Self::Test(_) => {
unreachable!("this function should never be called in the test backend")
}
}
}
async fn get_allowed_ips_and_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
match self {
Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
#[cfg(test)]
Self::Test(api) => api.get_allowed_ips_and_secret(),
}
}
async fn get_endpoint_jwks(
&self,
ctx: &RequestMonitoring,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError> {
match self {
Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await,
#[cfg(test)]
Self::Test(_api) => Ok(vec![]),
}
}
async fn wake_compute(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError> {
match self {
Self::Neon(api) => api.wake_compute(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await,
#[cfg(test)]
Self::Test(api) => api.wake_compute(),
}
}
}
#[cfg(test)]
pub(crate) trait TestControlPlaneClient: Send + Sync + 'static {
fn wake_compute(&self) -> Result<CachedNodeInfo, errors::WakeComputeError>;
fn get_allowed_ips_and_secret(
&self,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
fn dyn_clone(&self) -> Box<dyn TestControlPlaneClient>;
}
#[cfg(test)]
impl Clone for Box<dyn TestControlPlaneClient> {
fn clone(&self) -> Self {
TestControlPlaneClient::dyn_clone(&**self)
}
}
/// Various caches for [`control_plane`](super).
pub struct ApiCaches {
/// Cache for the `wake_compute` API method.
pub(crate) node_info: NodeInfoCache,
/// Cache which stores project_id -> endpoint_ids mapping.
pub project_info: Arc<ProjectInfoCacheImpl>,
/// List of all valid endpoints.
pub endpoints_cache: Arc<EndpointsCache>,
}
impl ApiCaches {
pub fn new(
wake_compute_cache_config: CacheOptions,
project_info_cache_config: ProjectInfoCacheOptions,
endpoint_cache_config: EndpointCacheConfig,
) -> Self {
Self {
node_info: NodeInfoCache::new(
"node_info_cache",
wake_compute_cache_config.size,
wake_compute_cache_config.ttl,
true,
),
project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)),
endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)),
}
}
}
/// Various caches for [`control_plane`](super).
pub struct ApiLocks<K> {
name: &'static str,
node_locks: DashMap<K, Arc<DynamicLimiter>>,
config: RateLimiterConfig,
timeout: Duration,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum ApiLockError {
#[error("timeout acquiring resource permit")]
TimeoutError(#[from] tokio::time::error::Elapsed),
}
impl ReportableError for ApiLockError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit,
}
}
}
impl<K: Hash + Eq + Clone> ApiLocks<K> {
pub fn new(
name: &'static str,
config: RateLimiterConfig,
shards: usize,
timeout: Duration,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
) -> prometheus::Result<Self> {
Ok(Self {
name,
node_locks: DashMap::with_shard_amount(shards),
config,
timeout,
epoch,
metrics,
})
}
pub(crate) async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {
if self.config.initial_limit == 0 {
return Ok(WakeComputePermit {
permit: Token::disabled(),
});
}
let now = Instant::now();
let semaphore = {
// get fast path
if let Some(semaphore) = self.node_locks.get(key) {
semaphore.clone()
} else {
self.node_locks
.entry(key.clone())
.or_insert_with(|| {
self.metrics.semaphores_registered.inc();
DynamicLimiter::new(self.config)
})
.clone()
}
};
let permit = semaphore.acquire_timeout(self.timeout).await;
self.metrics
.semaphore_acquire_seconds
.observe(now.elapsed().as_secs_f64());
info!("acquired permit {:?}", now.elapsed().as_secs_f64());
Ok(WakeComputePermit { permit: permit? })
}
pub async fn garbage_collect_worker(&self) {
if self.config.initial_limit == 0 {
return;
}
let mut interval =
tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32);
loop {
for (i, shard) in self.node_locks.shards().iter().enumerate() {
interval.tick().await;
// temporary lock a single shard and then clear any semaphores that aren't currently checked out
// race conditions: if strong_count == 1, there's no way that it can increase while the shard is locked
// therefore releasing it is safe from race conditions
info!(
name = self.name,
shard = i,
"performing epoch reclamation on api lock"
);
let mut lock = shard.write();
let timer = self.metrics.reclamation_lag_seconds.start_timer();
let count = lock
.extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
.count();
drop(lock);
self.metrics.semaphores_unregistered.inc_by(count as u64);
timer.observe();
}
}
}
}
pub(crate) struct WakeComputePermit {
permit: Token,
}
impl WakeComputePermit {
pub(crate) fn should_check_cache(&self) -> bool {
!self.permit.is_disabled()
}
pub(crate) fn release(self, outcome: Outcome) {
self.permit.release(outcome);
}
pub(crate) fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
match res {
Ok(_) => self.release(Outcome::Success),
Err(_) => self.release(Outcome::Overload),
}
res
}
}
impl FetchAuthRules for ControlPlaneClient {
async fn fetch_auth_rules(
&self,
ctx: &RequestMonitoring,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
self.get_endpoint_jwks(ctx, endpoint)
.await
.map_err(FetchAuthRulesError::GetEndpointJwks)
}
}

View File

@@ -1,216 +0,0 @@
use thiserror::Error;
use crate::control_plane::client::ApiLockError;
use crate::control_plane::messages::{self, ControlPlaneErrorMessage, Reason};
use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError};
use crate::proxy::retry::CouldRetry;
/// A go-to error message which doesn't leak any detail.
pub(crate) const REQUEST_FAILED: &str = "Console request failed";
/// Common console API error.
#[derive(Debug, Error)]
pub(crate) enum ControlPlaneError {
/// Error returned by the console itself.
#[error("{REQUEST_FAILED} with {0}")]
Message(Box<ControlPlaneErrorMessage>),
/// Various IO errors like broken pipe or malformed payload.
#[error("{REQUEST_FAILED}: {0}")]
Transport(#[from] std::io::Error),
}
impl ControlPlaneError {
/// Returns HTTP status code if it's the reason for failure.
pub(crate) fn get_reason(&self) -> messages::Reason {
match self {
ControlPlaneError::Message(e) => e.get_reason(),
ControlPlaneError::Transport(_) => messages::Reason::Unknown,
}
}
}
impl UserFacingError for ControlPlaneError {
fn to_string_client(&self) -> String {
match self {
// To minimize risks, only select errors are forwarded to users.
ControlPlaneError::Message(c) => c.get_user_facing_message(),
ControlPlaneError::Transport(_) => REQUEST_FAILED.to_owned(),
}
}
}
impl ReportableError for ControlPlaneError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ControlPlaneError::Message(e) => match e.get_reason() {
Reason::RoleProtected => ErrorKind::User,
Reason::ResourceNotFound => ErrorKind::User,
Reason::ProjectNotFound => ErrorKind::User,
Reason::EndpointNotFound => ErrorKind::User,
Reason::BranchNotFound => ErrorKind::User,
Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit,
Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::Quota,
Reason::ActiveTimeQuotaExceeded => ErrorKind::Quota,
Reason::ComputeTimeQuotaExceeded => ErrorKind::Quota,
Reason::WrittenDataQuotaExceeded => ErrorKind::Quota,
Reason::DataTransferQuotaExceeded => ErrorKind::Quota,
Reason::LogicalSizeQuotaExceeded => ErrorKind::Quota,
Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
Reason::RunningOperations => ErrorKind::ControlPlane,
Reason::ActiveEndpointsLimitExceeded => ErrorKind::ControlPlane,
Reason::Unknown => ErrorKind::ControlPlane,
},
ControlPlaneError::Transport(_) => crate::error::ErrorKind::ControlPlane,
}
}
}
impl CouldRetry for ControlPlaneError {
fn could_retry(&self) -> bool {
match self {
// retry some transport errors
Self::Transport(io) => io.could_retry(),
Self::Message(e) => e.could_retry(),
}
}
}
impl From<reqwest::Error> for ControlPlaneError {
fn from(e: reqwest::Error) -> Self {
io_error(e).into()
}
}
impl From<reqwest_middleware::Error> for ControlPlaneError {
fn from(e: reqwest_middleware::Error) -> Self {
io_error(e).into()
}
}
#[derive(Debug, Error)]
pub(crate) enum GetAuthInfoError {
// We shouldn't include the actual secret here.
#[error("Console responded with a malformed auth secret")]
BadSecret,
#[error(transparent)]
ApiError(ControlPlaneError),
}
// This allows more useful interactions than `#[from]`.
impl<E: Into<ControlPlaneError>> From<E> for GetAuthInfoError {
fn from(e: E) -> Self {
Self::ApiError(e.into())
}
}
impl UserFacingError for GetAuthInfoError {
fn to_string_client(&self) -> String {
match self {
// We absolutely should not leak any secrets!
Self::BadSecret => REQUEST_FAILED.to_owned(),
// However, API might return a meaningful error.
Self::ApiError(e) => e.to_string_client(),
}
}
}
impl ReportableError for GetAuthInfoError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
Self::BadSecret => crate::error::ErrorKind::ControlPlane,
Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
}
}
}
#[derive(Debug, Error)]
pub(crate) enum WakeComputeError {
#[error("Console responded with a malformed compute address: {0}")]
BadComputeAddress(Box<str>),
#[error(transparent)]
ControlPlane(ControlPlaneError),
#[error("Too many connections attempts")]
TooManyConnections,
#[error("error acquiring resource permit: {0}")]
TooManyConnectionAttempts(#[from] ApiLockError),
}
// This allows more useful interactions than `#[from]`.
impl<E: Into<ControlPlaneError>> From<E> for WakeComputeError {
fn from(e: E) -> Self {
Self::ControlPlane(e.into())
}
}
impl UserFacingError for WakeComputeError {
fn to_string_client(&self) -> String {
match self {
// We shouldn't show user the address even if it's broken.
// Besides, user is unlikely to care about this detail.
Self::BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
// However, control plane might return a meaningful error.
Self::ControlPlane(e) => e.to_string_client(),
Self::TooManyConnections => self.to_string(),
Self::TooManyConnectionAttempts(_) => {
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
}
}
}
}
impl ReportableError for WakeComputeError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
Self::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
Self::ControlPlane(e) => e.get_error_kind(),
Self::TooManyConnections => crate::error::ErrorKind::RateLimit,
Self::TooManyConnectionAttempts(e) => e.get_error_kind(),
}
}
}
impl CouldRetry for WakeComputeError {
fn could_retry(&self) -> bool {
match self {
Self::BadComputeAddress(_) => false,
Self::ControlPlane(e) => e.could_retry(),
Self::TooManyConnections => false,
Self::TooManyConnectionAttempts(_) => false,
}
}
}
#[derive(Debug, Error)]
pub enum GetEndpointJwksError {
#[error("endpoint not found")]
EndpointNotFound,
#[error("failed to build control plane request: {0}")]
RequestBuild(#[source] reqwest::Error),
#[error("failed to send control plane request: {0}")]
RequestExecute(#[source] reqwest_middleware::Error),
#[error(transparent)]
ControlPlane(#[from] ControlPlaneError),
#[cfg(any(test, feature = "testing"))]
#[error(transparent)]
TokioPostgres(#[from] tokio_postgres::Error),
#[cfg(any(test, feature = "testing"))]
#[error(transparent)]
ParseUrl(#[from] url::ParseError),
#[cfg(any(test, feature = "testing"))]
#[error(transparent)]
TaskJoin(#[from] tokio::task::JoinError),
}

View File

@@ -10,14 +10,14 @@ use crate::proxy::retry::CouldRetry;
/// Generic error response with human-readable description.
/// Note that we can't always present it to user as is.
#[derive(Debug, Deserialize, Clone)]
pub(crate) struct ControlPlaneErrorMessage {
pub(crate) struct ControlPlaneError {
pub(crate) error: Box<str>,
#[serde(skip)]
pub(crate) http_status_code: http::StatusCode,
pub(crate) status: Option<Status>,
}
impl ControlPlaneErrorMessage {
impl ControlPlaneError {
pub(crate) fn get_reason(&self) -> Reason {
self.status
.as_ref()
@@ -26,7 +26,7 @@ impl ControlPlaneErrorMessage {
}
pub(crate) fn get_user_facing_message(&self) -> String {
use super::errors::REQUEST_FAILED;
use super::provider::errors::REQUEST_FAILED;
self.status
.as_ref()
.and_then(|s| s.details.user_facing_message.as_ref())
@@ -51,7 +51,7 @@ impl ControlPlaneErrorMessage {
}
}
impl Display for ControlPlaneErrorMessage {
impl Display for ControlPlaneError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let msg: &str = self
.status
@@ -62,7 +62,7 @@ impl Display for ControlPlaneErrorMessage {
}
}
impl CouldRetry for ControlPlaneErrorMessage {
impl CouldRetry for ControlPlaneError {
fn could_retry(&self) -> bool {
// If the error message does not have a status,
// the error is unknown and probably should not retry automatically
@@ -245,7 +245,7 @@ pub(crate) struct WakeCompute {
pub(crate) aux: MetricsAuxInfo,
}
/// Async response which concludes the console redirect auth flow.
/// Async response which concludes the web auth flow.
/// Also known as `kickResponse` in the console.
#[derive(Debug, Deserialize)]
pub(crate) struct KickSession<'a> {

View File

@@ -24,8 +24,8 @@ pub(crate) fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), wai
CPLANE_WAITERS.notify(psql_session_id, msg)
}
/// Management API listener task.
/// It spawns management response handlers needed for the console redirect auth flow.
/// Console management API listener task.
/// It spawns console response handlers needed for the web auth.
pub async fn task_main(listener: TcpListener) -> anyhow::Result<Infallible> {
scopeguard::defer! {
info!("mgmt has shut down");
@@ -43,13 +43,13 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result<Infallible> {
tokio::task::spawn(
async move {
info!("serving a new management API connection");
info!("serving a new console management API connection");
// these might be long running connections, have a separate logging for cancelling
// on shutdown and other ways of stopping.
let cancelled = scopeguard::guard(tracing::Span::current(), |span| {
let _e = span.entered();
info!("management API task cancelled");
info!("console management API task cancelled");
});
if let Err(e) = handle_connection(socket).await {

View File

@@ -5,137 +5,18 @@
pub mod messages;
/// Wrappers for console APIs and their mocks.
pub mod client;
pub(crate) mod errors;
use std::sync::Arc;
use std::time::Duration;
use crate::auth::backend::jwt::AuthRule;
use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo};
use crate::auth::IpPattern;
use crate::cache::project_info::ProjectInfoCacheImpl;
use crate::cache::{Cached, TimedLru};
use crate::context::RequestMonitoring;
use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo};
use crate::intern::ProjectIdInt;
use crate::types::{EndpointCacheKey, EndpointId};
use crate::{compute, scram};
pub mod provider;
pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};
/// Various cache-related types.
pub mod caches {
pub use super::client::ApiCaches;
pub use super::provider::ApiCaches;
}
/// Various cache-related types.
pub mod locks {
pub use super::client::ApiLocks;
pub use super::provider::ApiLocks;
}
/// Console's management API.
pub mod mgmt;
/// Auth secret which is managed by the cloud.
#[derive(Clone, Eq, PartialEq, Debug)]
pub(crate) enum AuthSecret {
#[cfg(any(test, feature = "testing"))]
/// Md5 hash of user's password.
Md5([u8; 16]),
/// [SCRAM](crate::scram) authentication info.
Scram(scram::ServerSecret),
}
#[derive(Default)]
pub(crate) struct AuthInfo {
pub(crate) secret: Option<AuthSecret>,
/// List of IP addresses allowed for the autorization.
pub(crate) allowed_ips: Vec<IpPattern>,
/// Project ID. This is used for cache invalidation.
pub(crate) project_id: Option<ProjectIdInt>,
}
/// Info for establishing a connection to a compute node.
/// This is what we get after auth succeeded, but not before!
#[derive(Clone)]
pub(crate) struct NodeInfo {
/// Compute node connection params.
/// It's sad that we have to clone this, but this will improve
/// once we migrate to a bespoke connection logic.
pub(crate) config: compute::ConnCfg,
/// Labels for proxy's metrics.
pub(crate) aux: MetricsAuxInfo,
/// Whether we should accept self-signed certificates (for testing)
pub(crate) allow_self_signed_compute: bool,
}
impl NodeInfo {
pub(crate) async fn connect(
&self,
ctx: &RequestMonitoring,
timeout: Duration,
) -> Result<compute::PostgresConnection, compute::ConnectionError> {
self.config
.connect(
ctx,
self.allow_self_signed_compute,
self.aux.clone(),
timeout,
)
.await
}
pub(crate) fn reuse_settings(&mut self, other: Self) {
self.allow_self_signed_compute = other.allow_self_signed_compute;
self.config.reuse_password(other.config);
}
pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) {
match keys {
#[cfg(any(test, feature = "testing"))]
ComputeCredentialKeys::Password(password) => self.config.password(password),
ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
ComputeCredentialKeys::JwtPayload(_) | ComputeCredentialKeys::None => &mut self.config,
};
}
}
pub(crate) type NodeInfoCache =
TimedLru<EndpointCacheKey, Result<NodeInfo, Box<ControlPlaneErrorMessage>>>;
pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>;
pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
/// This will allocate per each call, but the http requests alone
/// already require a few allocations, so it should be fine.
pub(crate) trait ControlPlaneApi {
/// Get the client's auth secret for authentication.
/// Returns option because user not found situation is special.
/// We still have to mock the scram to avoid leaking information that user doesn't exist.
async fn get_role_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
async fn get_allowed_ips_and_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
async fn get_endpoint_jwks(
&self,
ctx: &RequestMonitoring,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError>;
/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
}

View File

@@ -9,17 +9,16 @@ use tokio_postgres::config::SslMode;
use tokio_postgres::Client;
use tracing::{error, info, info_span, warn, Instrument};
use super::errors::{ApiError, GetAuthInfoError, WakeComputeError};
use super::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo};
use crate::auth::backend::jwt::AuthRule;
use crate::auth::backend::ComputeUserInfo;
use crate::auth::IpPattern;
use crate::cache::Cached;
use crate::context::RequestMonitoring;
use crate::control_plane::client::{CachedAllowedIps, CachedRoleSecret};
use crate::control_plane::errors::{
ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
};
use crate::control_plane::errors::GetEndpointJwksError;
use crate::control_plane::messages::MetricsAuxInfo;
use crate::control_plane::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo};
use crate::control_plane::provider::{CachedAllowedIps, CachedRoleSecret};
use crate::error::io_error;
use crate::intern::RoleNameInt;
use crate::types::{BranchId, EndpointId, ProjectId, RoleName};
@@ -32,25 +31,25 @@ enum MockApiError {
PasswordNotSet(tokio_postgres::Error),
}
impl From<MockApiError> for ControlPlaneError {
impl From<MockApiError> for ApiError {
fn from(e: MockApiError) -> Self {
io_error(e).into()
}
}
impl From<tokio_postgres::Error> for ControlPlaneError {
impl From<tokio_postgres::Error> for ApiError {
fn from(e: tokio_postgres::Error) -> Self {
io_error(e).into()
}
}
#[derive(Clone)]
pub struct MockControlPlane {
pub struct Api {
endpoint: ApiUrl,
ip_allowlist_check_enabled: bool,
}
impl MockControlPlane {
impl Api {
pub fn new(endpoint: ApiUrl, ip_allowlist_check_enabled: bool) -> Self {
Self {
endpoint,
@@ -202,7 +201,7 @@ async fn get_execute_postgres_query(
Ok(Some(entry))
}
impl super::ControlPlaneApi for MockControlPlane {
impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn get_role_secret(
&self,

View File

@@ -0,0 +1,588 @@
#[cfg(any(test, feature = "testing"))]
pub mod mock;
pub mod neon;
use std::hash::Hash;
use std::sync::Arc;
use std::time::Duration;
use dashmap::DashMap;
use tokio::time::Instant;
use tracing::info;
use super::messages::{ControlPlaneError, MetricsAuxInfo};
use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError};
use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo};
use crate::auth::IpPattern;
use crate::cache::endpoints::EndpointsCache;
use crate::cache::project_info::ProjectInfoCacheImpl;
use crate::cache::{Cached, TimedLru};
use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions};
use crate::context::RequestMonitoring;
use crate::error::ReportableError;
use crate::intern::ProjectIdInt;
use crate::metrics::ApiLockMetrics;
use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token};
use crate::types::{EndpointCacheKey, EndpointId};
use crate::{compute, scram};
pub(crate) mod errors {
use thiserror::Error;
use super::ApiLockError;
use crate::control_plane::messages::{self, ControlPlaneError, Reason};
use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError};
use crate::proxy::retry::CouldRetry;
/// A go-to error message which doesn't leak any detail.
pub(crate) const REQUEST_FAILED: &str = "Console request failed";
/// Common console API error.
#[derive(Debug, Error)]
pub(crate) enum ApiError {
/// Error returned by the console itself.
#[error("{REQUEST_FAILED} with {0}")]
ControlPlane(Box<ControlPlaneError>),
/// Various IO errors like broken pipe or malformed payload.
#[error("{REQUEST_FAILED}: {0}")]
Transport(#[from] std::io::Error),
}
impl ApiError {
/// Returns HTTP status code if it's the reason for failure.
pub(crate) fn get_reason(&self) -> messages::Reason {
match self {
ApiError::ControlPlane(e) => e.get_reason(),
ApiError::Transport(_) => messages::Reason::Unknown,
}
}
}
impl UserFacingError for ApiError {
fn to_string_client(&self) -> String {
match self {
// To minimize risks, only select errors are forwarded to users.
ApiError::ControlPlane(c) => c.get_user_facing_message(),
ApiError::Transport(_) => REQUEST_FAILED.to_owned(),
}
}
}
impl ReportableError for ApiError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiError::ControlPlane(e) => match e.get_reason() {
Reason::RoleProtected => ErrorKind::User,
Reason::ResourceNotFound => ErrorKind::User,
Reason::ProjectNotFound => ErrorKind::User,
Reason::EndpointNotFound => ErrorKind::User,
Reason::BranchNotFound => ErrorKind::User,
Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit,
Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::Quota,
Reason::ActiveTimeQuotaExceeded => ErrorKind::Quota,
Reason::ComputeTimeQuotaExceeded => ErrorKind::Quota,
Reason::WrittenDataQuotaExceeded => ErrorKind::Quota,
Reason::DataTransferQuotaExceeded => ErrorKind::Quota,
Reason::LogicalSizeQuotaExceeded => ErrorKind::Quota,
Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
Reason::RunningOperations => ErrorKind::ControlPlane,
Reason::ActiveEndpointsLimitExceeded => ErrorKind::ControlPlane,
Reason::Unknown => ErrorKind::ControlPlane,
},
ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
}
}
}
impl CouldRetry for ApiError {
fn could_retry(&self) -> bool {
match self {
// retry some transport errors
Self::Transport(io) => io.could_retry(),
Self::ControlPlane(e) => e.could_retry(),
}
}
}
impl From<reqwest::Error> for ApiError {
fn from(e: reqwest::Error) -> Self {
io_error(e).into()
}
}
impl From<reqwest_middleware::Error> for ApiError {
fn from(e: reqwest_middleware::Error) -> Self {
io_error(e).into()
}
}
#[derive(Debug, Error)]
pub(crate) enum GetAuthInfoError {
// We shouldn't include the actual secret here.
#[error("Console responded with a malformed auth secret")]
BadSecret,
#[error(transparent)]
ApiError(ApiError),
}
// This allows more useful interactions than `#[from]`.
impl<E: Into<ApiError>> From<E> for GetAuthInfoError {
fn from(e: E) -> Self {
Self::ApiError(e.into())
}
}
impl UserFacingError for GetAuthInfoError {
fn to_string_client(&self) -> String {
match self {
// We absolutely should not leak any secrets!
Self::BadSecret => REQUEST_FAILED.to_owned(),
// However, API might return a meaningful error.
Self::ApiError(e) => e.to_string_client(),
}
}
}
impl ReportableError for GetAuthInfoError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
Self::BadSecret => crate::error::ErrorKind::ControlPlane,
Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
}
}
}
#[derive(Debug, Error)]
pub(crate) enum WakeComputeError {
#[error("Console responded with a malformed compute address: {0}")]
BadComputeAddress(Box<str>),
#[error(transparent)]
ApiError(ApiError),
#[error("Too many connections attempts")]
TooManyConnections,
#[error("error acquiring resource permit: {0}")]
TooManyConnectionAttempts(#[from] ApiLockError),
}
// This allows more useful interactions than `#[from]`.
impl<E: Into<ApiError>> From<E> for WakeComputeError {
fn from(e: E) -> Self {
Self::ApiError(e.into())
}
}
impl UserFacingError for WakeComputeError {
fn to_string_client(&self) -> String {
match self {
// We shouldn't show user the address even if it's broken.
// Besides, user is unlikely to care about this detail.
Self::BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
// However, API might return a meaningful error.
Self::ApiError(e) => e.to_string_client(),
Self::TooManyConnections => self.to_string(),
Self::TooManyConnectionAttempts(_) => {
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
}
}
}
}
impl ReportableError for WakeComputeError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
Self::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
Self::ApiError(e) => e.get_error_kind(),
Self::TooManyConnections => crate::error::ErrorKind::RateLimit,
Self::TooManyConnectionAttempts(e) => e.get_error_kind(),
}
}
}
impl CouldRetry for WakeComputeError {
fn could_retry(&self) -> bool {
match self {
Self::BadComputeAddress(_) => false,
Self::ApiError(e) => e.could_retry(),
Self::TooManyConnections => false,
Self::TooManyConnectionAttempts(_) => false,
}
}
}
#[derive(Debug, Error)]
pub enum GetEndpointJwksError {
#[error("endpoint not found")]
EndpointNotFound,
#[error("failed to build control plane request: {0}")]
RequestBuild(#[source] reqwest::Error),
#[error("failed to send control plane request: {0}")]
RequestExecute(#[source] reqwest_middleware::Error),
#[error(transparent)]
ControlPlane(#[from] ApiError),
#[cfg(any(test, feature = "testing"))]
#[error(transparent)]
TokioPostgres(#[from] tokio_postgres::Error),
#[cfg(any(test, feature = "testing"))]
#[error(transparent)]
ParseUrl(#[from] url::ParseError),
#[cfg(any(test, feature = "testing"))]
#[error(transparent)]
TaskJoin(#[from] tokio::task::JoinError),
}
}
/// Auth secret which is managed by the cloud.
#[derive(Clone, Eq, PartialEq, Debug)]
pub(crate) enum AuthSecret {
#[cfg(any(test, feature = "testing"))]
/// Md5 hash of user's password.
Md5([u8; 16]),
/// [SCRAM](crate::scram) authentication info.
Scram(scram::ServerSecret),
}
#[derive(Default)]
pub(crate) struct AuthInfo {
pub(crate) secret: Option<AuthSecret>,
/// List of IP addresses allowed for the autorization.
pub(crate) allowed_ips: Vec<IpPattern>,
/// Project ID. This is used for cache invalidation.
pub(crate) project_id: Option<ProjectIdInt>,
}
/// Info for establishing a connection to a compute node.
/// This is what we get after auth succeeded, but not before!
#[derive(Clone)]
pub(crate) struct NodeInfo {
/// Compute node connection params.
/// It's sad that we have to clone this, but this will improve
/// once we migrate to a bespoke connection logic.
pub(crate) config: compute::ConnCfg,
/// Labels for proxy's metrics.
pub(crate) aux: MetricsAuxInfo,
/// Whether we should accept self-signed certificates (for testing)
pub(crate) allow_self_signed_compute: bool,
}
impl NodeInfo {
pub(crate) async fn connect(
&self,
ctx: &RequestMonitoring,
timeout: Duration,
) -> Result<compute::PostgresConnection, compute::ConnectionError> {
self.config
.connect(
ctx,
self.allow_self_signed_compute,
self.aux.clone(),
timeout,
)
.await
}
pub(crate) fn reuse_settings(&mut self, other: Self) {
self.allow_self_signed_compute = other.allow_self_signed_compute;
self.config.reuse_password(other.config);
}
pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) {
match keys {
#[cfg(any(test, feature = "testing"))]
ComputeCredentialKeys::Password(password) => self.config.password(password),
ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
ComputeCredentialKeys::JwtPayload(_) | ComputeCredentialKeys::None => &mut self.config,
};
}
}
pub(crate) type NodeInfoCache =
TimedLru<EndpointCacheKey, Result<NodeInfo, Box<ControlPlaneError>>>;
pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>;
pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
/// This will allocate per each call, but the http requests alone
/// already require a few allocations, so it should be fine.
pub(crate) trait Api {
/// Get the client's auth secret for authentication.
/// Returns option because user not found situation is special.
/// We still have to mock the scram to avoid leaking information that user doesn't exist.
async fn get_role_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
async fn get_allowed_ips_and_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
async fn get_endpoint_jwks(
&self,
ctx: &RequestMonitoring,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError>;
/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
}
#[non_exhaustive]
#[derive(Clone)]
pub enum ControlPlaneBackend {
/// Current Management API (V2).
Management(neon::Api),
/// Local mock control plane.
#[cfg(any(test, feature = "testing"))]
PostgresMock(mock::Api),
/// Internal testing
#[cfg(test)]
#[allow(private_interfaces)]
Test(Box<dyn crate::auth::backend::TestBackend>),
}
impl Api for ControlPlaneBackend {
async fn get_role_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
match self {
Self::Management(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(test)]
Self::Test(_) => {
unreachable!("this function should never be called in the test backend")
}
}
}
async fn get_allowed_ips_and_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
match self {
Self::Management(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
#[cfg(test)]
Self::Test(api) => api.get_allowed_ips_and_secret(),
}
}
async fn get_endpoint_jwks(
&self,
ctx: &RequestMonitoring,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError> {
match self {
Self::Management(api) => api.get_endpoint_jwks(ctx, endpoint).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await,
#[cfg(test)]
Self::Test(_api) => Ok(vec![]),
}
}
async fn wake_compute(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError> {
match self {
Self::Management(api) => api.wake_compute(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await,
#[cfg(test)]
Self::Test(api) => api.wake_compute(),
}
}
}
/// Various caches for [`control_plane`](super).
pub struct ApiCaches {
/// Cache for the `wake_compute` API method.
pub(crate) node_info: NodeInfoCache,
/// Cache which stores project_id -> endpoint_ids mapping.
pub project_info: Arc<ProjectInfoCacheImpl>,
/// List of all valid endpoints.
pub endpoints_cache: Arc<EndpointsCache>,
}
impl ApiCaches {
pub fn new(
wake_compute_cache_config: CacheOptions,
project_info_cache_config: ProjectInfoCacheOptions,
endpoint_cache_config: EndpointCacheConfig,
) -> Self {
Self {
node_info: NodeInfoCache::new(
"node_info_cache",
wake_compute_cache_config.size,
wake_compute_cache_config.ttl,
true,
),
project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)),
endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)),
}
}
}
/// Various caches for [`control_plane`](super).
pub struct ApiLocks<K> {
name: &'static str,
node_locks: DashMap<K, Arc<DynamicLimiter>>,
config: RateLimiterConfig,
timeout: Duration,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum ApiLockError {
#[error("timeout acquiring resource permit")]
TimeoutError(#[from] tokio::time::error::Elapsed),
}
impl ReportableError for ApiLockError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit,
}
}
}
impl<K: Hash + Eq + Clone> ApiLocks<K> {
pub fn new(
name: &'static str,
config: RateLimiterConfig,
shards: usize,
timeout: Duration,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
) -> prometheus::Result<Self> {
Ok(Self {
name,
node_locks: DashMap::with_shard_amount(shards),
config,
timeout,
epoch,
metrics,
})
}
pub(crate) async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {
if self.config.initial_limit == 0 {
return Ok(WakeComputePermit {
permit: Token::disabled(),
});
}
let now = Instant::now();
let semaphore = {
// get fast path
if let Some(semaphore) = self.node_locks.get(key) {
semaphore.clone()
} else {
self.node_locks
.entry(key.clone())
.or_insert_with(|| {
self.metrics.semaphores_registered.inc();
DynamicLimiter::new(self.config)
})
.clone()
}
};
let permit = semaphore.acquire_timeout(self.timeout).await;
self.metrics
.semaphore_acquire_seconds
.observe(now.elapsed().as_secs_f64());
info!("acquired permit {:?}", now.elapsed().as_secs_f64());
Ok(WakeComputePermit { permit: permit? })
}
pub async fn garbage_collect_worker(&self) {
if self.config.initial_limit == 0 {
return;
}
let mut interval =
tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32);
loop {
for (i, shard) in self.node_locks.shards().iter().enumerate() {
interval.tick().await;
// temporary lock a single shard and then clear any semaphores that aren't currently checked out
// race conditions: if strong_count == 1, there's no way that it can increase while the shard is locked
// therefore releasing it is safe from race conditions
info!(
name = self.name,
shard = i,
"performing epoch reclamation on api lock"
);
let mut lock = shard.write();
let timer = self.metrics.reclamation_lag_seconds.start_timer();
let count = lock
.extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
.count();
drop(lock);
self.metrics.semaphores_unregistered.inc_by(count as u64);
timer.observe();
}
}
}
}
pub(crate) struct WakeComputePermit {
permit: Token,
}
impl WakeComputePermit {
pub(crate) fn should_check_cache(&self) -> bool {
!self.permit.is_disabled()
}
pub(crate) fn release(self, outcome: Outcome) {
self.permit.release(outcome);
}
pub(crate) fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
match res {
Ok(_) => self.release(Outcome::Success),
Err(_) => self.release(Outcome::Overload),
}
res
}
}
impl FetchAuthRules for ControlPlaneBackend {
async fn fetch_auth_rules(
&self,
ctx: &RequestMonitoring,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
self.get_endpoint_jwks(ctx, endpoint)
.await
.map_err(FetchAuthRulesError::GetEndpointJwks)
}
}

View File

@@ -10,20 +10,18 @@ use tokio::time::Instant;
use tokio_postgres::config::SslMode;
use tracing::{debug, info, info_span, warn, Instrument};
use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeCompute};
use super::super::messages::{ControlPlaneError, GetRoleSecret, WakeCompute};
use super::errors::{ApiError, GetAuthInfoError, WakeComputeError};
use super::{
ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret,
NodeInfo,
};
use crate::auth::backend::jwt::AuthRule;
use crate::auth::backend::ComputeUserInfo;
use crate::cache::Cached;
use crate::context::RequestMonitoring;
use crate::control_plane::caches::ApiCaches;
use crate::control_plane::errors::{
ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
};
use crate::control_plane::locks::ApiLocks;
use crate::control_plane::errors::GetEndpointJwksError;
use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
use crate::control_plane::{
AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo,
};
use crate::metrics::{CacheOutcome, Metrics};
use crate::rate_limiter::WakeComputeRateLimiter;
use crate::types::{EndpointCacheKey, EndpointId};
@@ -32,7 +30,7 @@ use crate::{compute, http, scram};
const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
#[derive(Clone)]
pub struct NeonControlPlaneClient {
pub struct Api {
endpoint: http::Endpoint,
pub caches: &'static ApiCaches,
pub(crate) locks: &'static ApiLocks<EndpointCacheKey>,
@@ -41,15 +39,17 @@ pub struct NeonControlPlaneClient {
jwt: Arc<str>,
}
impl NeonControlPlaneClient {
impl Api {
/// Construct an API object containing the auth parameters.
pub fn new(
endpoint: http::Endpoint,
jwt: Arc<str>,
caches: &'static ApiCaches,
locks: &'static ApiLocks<EndpointCacheKey>,
wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
) -> Self {
let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN")
.unwrap_or_default()
.into();
Self {
endpoint,
caches,
@@ -72,6 +72,7 @@ impl NeonControlPlaneClient {
.caches
.endpoints_cache
.is_valid(ctx, &user_info.endpoint.normalize())
.await
{
info!("endpoint is not valid, skipping the request");
return Ok(AuthInfo::default());
@@ -144,6 +145,7 @@ impl NeonControlPlaneClient {
.caches
.endpoints_cache
.is_valid(ctx, &endpoint.normalize())
.await
{
return Err(GetEndpointJwksError::EndpointNotFound);
}
@@ -254,7 +256,7 @@ impl NeonControlPlaneClient {
}
}
impl super::ControlPlaneApi for NeonControlPlaneClient {
impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn get_role_secret(
&self,
@@ -354,7 +356,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
let (cached, info) = cached.take_value();
let info = info.map_err(|c| {
info!(key = &*key, "found cached wake_compute error");
WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c)))
WakeComputeError::ApiError(ApiError::ControlPlane(Box::new(*c)))
})?;
debug!(key = &*key, "found cached compute node info");
@@ -401,11 +403,9 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
Ok(cached.map(|()| node))
}
Err(err) => match err {
WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => {
WakeComputeError::ApiError(ApiError::ControlPlane(err)) => {
let Some(status) = &err.status else {
return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
err,
)));
return Err(WakeComputeError::ApiError(ApiError::ControlPlane(err)));
};
let reason = status
@@ -415,9 +415,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
// if we can retry this error, do not cache it.
if reason.can_retry() {
return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
err,
)));
return Err(WakeComputeError::ApiError(ApiError::ControlPlane(err)));
}
// at this point, we should only have quota errors.
@@ -432,9 +430,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
Duration::from_secs(30),
);
Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
err,
)))
Err(WakeComputeError::ApiError(ApiError::ControlPlane(err)))
}
err => return Err(err),
},
@@ -445,7 +441,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
/// Parse http response body, taking status code into account.
async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
response: http::Response,
) -> Result<T, ControlPlaneError> {
) -> Result<T, ApiError> {
let status = response.status();
if status.is_success() {
// We shouldn't log raw body because it may contain secrets.
@@ -460,7 +456,7 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
// as the fact that the request itself has failed.
let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| {
warn!("failed to parse error body: {e}");
ControlPlaneErrorMessage {
ControlPlaneError {
error: "reason unclear (malformed error message)".into(),
http_status_code: status,
status: None,
@@ -469,7 +465,7 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
body.http_status_code = status;
warn!("console responded with an error ({status}): {body:?}");
Err(ControlPlaneError::Message(Box::new(body)))
Err(ApiError::ControlPlane(Box::new(body)))
}
fn parse_host_port(input: &str) -> Option<(&str, u16)> {

View File

@@ -6,6 +6,7 @@ pub mod health_server;
use std::time::Duration;
use anyhow::bail;
use bytes::Bytes;
use http::Method;
use http_body_util::BodyExt;
@@ -15,7 +16,7 @@ use reqwest_middleware::RequestBuilder;
pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error};
pub(crate) use reqwest_retry::policies::ExponentialBackoff;
pub(crate) use reqwest_retry::RetryTransientMiddleware;
use thiserror::Error;
use serde::de::DeserializeOwned;
use crate::metrics::{ConsoleRequest, Metrics};
use crate::url::ApiUrl;
@@ -121,19 +122,10 @@ impl Endpoint {
}
}
#[derive(Error, Debug)]
pub(crate) enum ReadBodyError {
#[error("Content length exceeds limit of {limit} bytes")]
BodyTooLarge { limit: usize },
#[error(transparent)]
Read(#[from] reqwest::Error),
}
pub(crate) async fn read_body_with_limit(
pub(crate) async fn parse_json_body_with_limit<D: DeserializeOwned>(
mut b: impl Body<Data = Bytes, Error = reqwest::Error> + Unpin,
limit: usize,
) -> Result<Vec<u8>, ReadBodyError> {
) -> anyhow::Result<D> {
// We could use `b.limited().collect().await.to_bytes()` here
// but this ends up being slightly more efficient as far as I can tell.
@@ -141,20 +133,20 @@ pub(crate) async fn read_body_with_limit(
// in reqwest, this value is influenced by the Content-Length header.
let lower_bound = match usize::try_from(b.size_hint().lower()) {
Ok(bound) if bound <= limit => bound,
_ => return Err(ReadBodyError::BodyTooLarge { limit }),
_ => bail!("Content length exceeds limit of {limit} bytes"),
};
let mut bytes = Vec::with_capacity(lower_bound);
while let Some(frame) = b.frame().await.transpose()? {
if let Ok(data) = frame.into_data() {
if bytes.len() + data.len() > limit {
return Err(ReadBodyError::BodyTooLarge { limit });
bail!("Content length exceeds limit of {limit} bytes")
}
bytes.extend_from_slice(&data);
}
}
Ok(bytes)
Ok(serde_json::from_slice::<D>(&bytes)?)
}
#[cfg(test)]

View File

@@ -1,6 +1,12 @@
// rustc lints/lint groups
// https://doc.rust-lang.org/rustc/lints/groups.html
#![deny(deprecated, future_incompatible, let_underscore, nonstandard_style)]
#![deny(
deprecated,
future_incompatible,
let_underscore,
nonstandard_style,
rust_2024_compatibility
)]
#![warn(clippy::all, clippy::pedantic, clippy::cargo)]
// List of denied lints from the clippy::restriction group.
// https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction

View File

@@ -18,7 +18,6 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
let env_filter = EnvFilter::builder()
.with_default_directive(LevelFilter::INFO.into())
.from_env_lossy()
.add_directive("aws_config=info".parse().unwrap())
.add_directive("azure_core::policies::transport=off".parse().unwrap());
let fmt_layer = tracing_subscriber::fmt::layer()

View File

@@ -1,17 +1,13 @@
//! Proxy Protocol V2 implementation
//! Compatible with <https://www.haproxy.org/download/3.1/doc/proxy-protocol.txt>
use core::fmt;
use std::io;
use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr};
use std::net::SocketAddr;
use std::pin::Pin;
use std::task::{Context, Poll};
use bytes::{Buf, Bytes, BytesMut};
use bytes::BytesMut;
use pin_project_lite::pin_project;
use strum_macros::FromRepr;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
use zerocopy::{FromBytes, FromZeroes};
pin_project! {
/// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough
@@ -58,78 +54,102 @@ impl<T: AsyncWrite> AsyncWrite for ChainRW<T> {
}
/// Proxy Protocol Version 2 Header
const SIGNATURE: [u8; 12] = [
const HEADER: [u8; 12] = [
0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A,
];
const LOCAL_V2: u8 = 0x20;
const PROXY_V2: u8 = 0x21;
const TCP_OVER_IPV4: u8 = 0x11;
const UDP_OVER_IPV4: u8 = 0x12;
const TCP_OVER_IPV6: u8 = 0x21;
const UDP_OVER_IPV6: u8 = 0x22;
#[derive(PartialEq, Eq, Clone, Debug)]
pub struct ConnectionInfo {
pub addr: SocketAddr,
pub extra: Option<ConnectionInfoExtra>,
}
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum ConnectHeader {
Missing,
Local,
Proxy(ConnectionInfo),
}
impl fmt::Display for ConnectionInfo {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match &self.extra {
None => self.addr.ip().fmt(f),
Some(ConnectionInfoExtra::Aws { vpce_id }) => {
write!(f, "vpce_id[{vpce_id:?}]:addr[{}]", self.addr.ip())
}
Some(ConnectionInfoExtra::Azure { link_id }) => {
write!(f, "link_id[{link_id}]:addr[{}]", self.addr.ip())
}
}
}
}
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum ConnectionInfoExtra {
Aws { vpce_id: Bytes },
Azure { link_id: u32 },
}
pub(crate) async fn read_proxy_protocol<T: AsyncRead + Unpin>(
mut read: T,
) -> std::io::Result<(ChainRW<T>, ConnectHeader)> {
) -> std::io::Result<(ChainRW<T>, Option<SocketAddr>)> {
let mut buf = BytesMut::with_capacity(128);
let header = loop {
while buf.len() < 16 {
let bytes_read = read.read_buf(&mut buf).await?;
// exit for bad header signature
let len = usize::min(buf.len(), SIGNATURE.len());
if buf[..len] != SIGNATURE[..len] {
return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing));
// exit for bad header
let len = usize::min(buf.len(), HEADER.len());
if buf[..len] != HEADER[..len] {
return Ok((ChainRW { inner: read, buf }, None));
}
// if no more bytes available then exit
if bytes_read == 0 {
return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing));
return Ok((ChainRW { inner: read, buf }, None));
};
}
// check if we have enough bytes to continue
if let Some(header) = buf.try_get::<ProxyProtocolV2Header>() {
break header;
let header = buf.split_to(16);
// The next byte (the 13th one) is the protocol version and command.
// The highest four bits contains the version. As of this specification, it must
// always be sent as \x2 and the receiver must only accept this value.
let vc = header[12];
let version = vc >> 4;
let command = vc & 0b1111;
if version != 2 {
return Err(io::Error::new(
io::ErrorKind::Other,
"invalid proxy protocol version. expected version 2",
));
}
match command {
// the connection was established on purpose by the proxy
// without being relayed. The connection endpoints are the sender and the
// receiver. Such connections exist when the proxy sends health-checks to the
// server. The receiver must accept this connection as valid and must use the
// real connection endpoints and discard the protocol block including the
// family which is ignored.
0 => {}
// the connection was established on behalf of another node,
// and reflects the original connection endpoints. The receiver must then use
// the information provided in the protocol block to get original the address.
1 => {}
// other values are unassigned and must not be emitted by senders. Receivers
// must drop connections presenting unexpected values here.
_ => {
return Err(io::Error::new(
io::ErrorKind::Other,
"invalid proxy protocol command. expected local (0) or proxy (1)",
))
}
};
let remaining_length = usize::from(header.len.get());
// The 14th byte contains the transport protocol and address family. The highest 4
// bits contain the address family, the lowest 4 bits contain the protocol.
let ft = header[13];
let address_length = match ft {
// - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET
// protocol family. Address length is 2*4 + 2*2 = 12 bytes.
// - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET
// protocol family. Address length is 2*4 + 2*2 = 12 bytes.
0x11 | 0x12 => 12,
// - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6
// protocol family. Address length is 2*16 + 2*2 = 36 bytes.
// - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6
// protocol family. Address length is 2*16 + 2*2 = 36 bytes.
0x21 | 0x22 => 36,
// unspecified or unix stream. ignore the addresses
_ => 0,
};
while buf.len() < remaining_length {
// The 15th and 16th bytes is the address length in bytes in network endian order.
// It is used so that the receiver knows how many address bytes to skip even when
// it does not implement the presented protocol. Thus the length of the protocol
// header in bytes is always exactly 16 + this value. When a sender presents a
// LOCAL connection, it should not present any address so it sets this field to
// zero. Receivers MUST always consider this field to skip the appropriate number
// of bytes and must not assume zero is presented for LOCAL connections. When a
// receiver accepts an incoming connection showing an UNSPEC address family or
// protocol, it may or may not decide to log the address information if present.
let remaining_length = u16::from_be_bytes(header[14..16].try_into().unwrap());
if remaining_length < address_length {
return Err(io::Error::new(
io::ErrorKind::Other,
"invalid proxy protocol length. not enough to fit requested IP addresses",
));
}
drop(header);
while buf.len() < remaining_length as usize {
if read.read_buf(&mut buf).await? == 0 {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
@@ -137,145 +157,29 @@ pub(crate) async fn read_proxy_protocol<T: AsyncRead + Unpin>(
));
}
}
let payload = buf.split_to(remaining_length);
let res = process_proxy_payload(header, payload)?;
Ok((ChainRW { inner: read, buf }, res))
}
fn process_proxy_payload(
header: ProxyProtocolV2Header,
mut payload: BytesMut,
) -> std::io::Result<ConnectHeader> {
match header.version_and_command {
// the connection was established on purpose by the proxy
// without being relayed. The connection endpoints are the sender and the
// receiver. Such connections exist when the proxy sends health-checks to the
// server. The receiver must accept this connection as valid and must use the
// real connection endpoints and discard the protocol block including the
// family which is ignored.
LOCAL_V2 => return Ok(ConnectHeader::Local),
// the connection was established on behalf of another node,
// and reflects the original connection endpoints. The receiver must then use
// the information provided in the protocol block to get original the address.
PROXY_V2 => {}
// other values are unassigned and must not be emitted by senders. Receivers
// must drop connections presenting unexpected values here.
#[rustfmt::skip] // https://github.com/rust-lang/rustfmt/issues/6384
_ => return Err(io::Error::new(
io::ErrorKind::Other,
format!(
"invalid proxy protocol command 0x{:02X}. expected local (0x20) or proxy (0x21)",
header.version_and_command
),
)),
// Starting from the 17th byte, addresses are presented in network byte order.
// The address order is always the same :
// - source layer 3 address in network byte order
// - destination layer 3 address in network byte order
// - source layer 4 address if any, in network byte order (port)
// - destination layer 4 address if any, in network byte order (port)
let addresses = buf.split_to(remaining_length as usize);
let socket = match address_length {
12 => {
let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap();
let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap());
Some(SocketAddr::from((src_addr, src_port)))
}
36 => {
let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap();
let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap());
Some(SocketAddr::from((src_addr, src_port)))
}
_ => None,
};
let size_err =
"invalid proxy protocol length. payload not large enough to fit requested IP addresses";
let addr = match header.protocol_and_family {
TCP_OVER_IPV4 | UDP_OVER_IPV4 => {
let addr = payload
.try_get::<ProxyProtocolV2HeaderV4>()
.ok_or_else(|| io::Error::new(io::ErrorKind::Other, size_err))?;
SocketAddr::from((addr.src_addr.get(), addr.src_port.get()))
}
TCP_OVER_IPV6 | UDP_OVER_IPV6 => {
let addr = payload
.try_get::<ProxyProtocolV2HeaderV6>()
.ok_or_else(|| io::Error::new(io::ErrorKind::Other, size_err))?;
SocketAddr::from((addr.src_addr.get(), addr.src_port.get()))
}
// unspecified or unix stream. ignore the addresses
_ => {
return Err(io::Error::new(
io::ErrorKind::Other,
"invalid proxy protocol address family/transport protocol.",
))
}
};
let mut extra = None;
while let Some(mut tlv) = read_tlv(&mut payload) {
match Pp2Kind::from_repr(tlv.kind) {
Some(Pp2Kind::Aws) => {
if tlv.value.is_empty() {
tracing::warn!("invalid aws tlv: no subtype");
}
let subtype = tlv.value.get_u8();
match Pp2AwsType::from_repr(subtype) {
Some(Pp2AwsType::VpceId) => {
extra = Some(ConnectionInfoExtra::Aws { vpce_id: tlv.value });
}
None => {
tracing::warn!("unknown aws tlv: subtype={subtype}");
}
}
}
Some(Pp2Kind::Azure) => {
if tlv.value.is_empty() {
tracing::warn!("invalid azure tlv: no subtype");
}
let subtype = tlv.value.get_u8();
match Pp2AzureType::from_repr(subtype) {
Some(Pp2AzureType::PrivateEndpointLinkId) => {
if tlv.value.len() != 4 {
tracing::warn!("invalid azure link_id: {:?}", tlv.value);
}
extra = Some(ConnectionInfoExtra::Azure {
link_id: tlv.value.get_u32_le(),
});
}
None => {
tracing::warn!("unknown azure tlv: subtype={subtype}");
}
}
}
Some(kind) => {
tracing::debug!("unused tlv[{kind:?}]: {:?}", tlv.value);
}
None => {
tracing::debug!("unknown tlv: {tlv:?}");
}
}
}
Ok(ConnectHeader::Proxy(ConnectionInfo { addr, extra }))
}
#[derive(FromRepr, Debug, Copy, Clone)]
#[repr(u8)]
enum Pp2Kind {
// The following are defined by https://www.haproxy.org/download/3.1/doc/proxy-protocol.txt
// we don't use these but it would be interesting to know what's available
Alpn = 0x01,
Authority = 0x02,
Crc32C = 0x03,
Noop = 0x04,
UniqueId = 0x05,
Ssl = 0x20,
NetNs = 0x30,
/// <https://docs.aws.amazon.com/elasticloadbalancing/latest/network/edit-target-group-attributes.html#proxy-protocol>
Aws = 0xEA,
/// <https://learn.microsoft.com/en-us/azure/private-link/private-link-service-overview#getting-connection-information-using-tcp-proxy-v2>
Azure = 0xEE,
}
#[derive(FromRepr, Debug, Copy, Clone)]
#[repr(u8)]
enum Pp2AwsType {
VpceId = 0x01,
}
#[derive(FromRepr, Debug, Copy, Clone)]
#[repr(u8)]
enum Pp2AzureType {
PrivateEndpointLinkId = 0x01,
Ok((ChainRW { inner: read, buf }, socket))
}
impl<T: AsyncRead> AsyncRead for ChainRW<T> {
@@ -312,100 +216,15 @@ impl<T: AsyncRead> ChainRW<T> {
}
}
#[derive(Debug)]
struct Tlv {
kind: u8,
value: Bytes,
}
fn read_tlv(b: &mut BytesMut) -> Option<Tlv> {
let tlv_header = b.try_get::<TlvHeader>()?;
let len = usize::from(tlv_header.len.get());
if b.len() < len {
return None;
}
Some(Tlv {
kind: tlv_header.kind,
value: b.split_to(len).freeze(),
})
}
trait BufExt: Sized {
fn try_get<T: FromBytes>(&mut self) -> Option<T>;
}
impl BufExt for BytesMut {
fn try_get<T: FromBytes>(&mut self) -> Option<T> {
let res = T::read_from_prefix(self)?;
self.advance(size_of::<T>());
Some(res)
}
}
#[derive(FromBytes, FromZeroes, Copy, Clone)]
#[repr(C)]
struct ProxyProtocolV2Header {
signature: [u8; 12],
version_and_command: u8,
protocol_and_family: u8,
len: zerocopy::byteorder::network_endian::U16,
}
#[derive(FromBytes, FromZeroes, Copy, Clone)]
#[repr(C)]
struct ProxyProtocolV2HeaderV4 {
src_addr: NetworkEndianIpv4,
dst_addr: NetworkEndianIpv4,
src_port: zerocopy::byteorder::network_endian::U16,
dst_port: zerocopy::byteorder::network_endian::U16,
}
#[derive(FromBytes, FromZeroes, Copy, Clone)]
#[repr(C)]
struct ProxyProtocolV2HeaderV6 {
src_addr: NetworkEndianIpv6,
dst_addr: NetworkEndianIpv6,
src_port: zerocopy::byteorder::network_endian::U16,
dst_port: zerocopy::byteorder::network_endian::U16,
}
#[derive(FromBytes, FromZeroes, Copy, Clone)]
#[repr(C)]
struct TlvHeader {
kind: u8,
len: zerocopy::byteorder::network_endian::U16,
}
#[derive(FromBytes, FromZeroes, Copy, Clone)]
#[repr(transparent)]
struct NetworkEndianIpv4(zerocopy::byteorder::network_endian::U32);
impl NetworkEndianIpv4 {
#[inline]
fn get(self) -> Ipv4Addr {
Ipv4Addr::from_bits(self.0.get())
}
}
#[derive(FromBytes, FromZeroes, Copy, Clone)]
#[repr(transparent)]
struct NetworkEndianIpv6(zerocopy::byteorder::network_endian::U128);
impl NetworkEndianIpv6 {
#[inline]
fn get(self) -> Ipv6Addr {
Ipv6Addr::from_bits(self.0.get())
}
}
#[cfg(test)]
mod tests {
use tokio::io::AsyncReadExt;
use crate::protocol2::{
read_proxy_protocol, ConnectHeader, LOCAL_V2, PROXY_V2, TCP_OVER_IPV4, UDP_OVER_IPV6,
};
use crate::protocol2::read_proxy_protocol;
#[tokio::test]
async fn test_ipv4() {
let header = super::SIGNATURE
let header = super::HEADER
// Proxy command, IPV4 | TCP
.chain([(2 << 4) | 1, (1 << 4) | 1].as_slice())
// 12 + 3 bytes
@@ -423,7 +242,7 @@ mod tests {
let extra_data = [0x55; 256];
let (mut read, info) = read_proxy_protocol(header.chain(extra_data.as_slice()))
let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice()))
.await
.unwrap();
@@ -431,18 +250,14 @@ mod tests {
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, extra_data);
let ConnectHeader::Proxy(info) = info else {
panic!()
};
assert_eq!(info.addr, ([127, 0, 0, 1], 65535).into());
assert_eq!(addr, Some(([127, 0, 0, 1], 65535).into()));
}
#[tokio::test]
async fn test_ipv6() {
let header = super::SIGNATURE
let header = super::HEADER
// Proxy command, IPV6 | UDP
.chain([PROXY_V2, UDP_OVER_IPV6].as_slice())
.chain([(2 << 4) | 1, (2 << 4) | 2].as_slice())
// 36 + 3 bytes
.chain([0, 39].as_slice())
// src ip
@@ -458,7 +273,7 @@ mod tests {
let extra_data = [0x55; 256];
let (mut read, info) = read_proxy_protocol(header.chain(extra_data.as_slice()))
let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice()))
.await
.unwrap();
@@ -466,13 +281,9 @@ mod tests {
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, extra_data);
let ConnectHeader::Proxy(info) = info else {
panic!()
};
assert_eq!(
info.addr,
([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into()
addr,
Some(([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into())
);
}
@@ -480,35 +291,34 @@ mod tests {
async fn test_invalid() {
let data = [0x55; 256];
let (mut read, info) = read_proxy_protocol(data.as_slice()).await.unwrap();
let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap();
let mut bytes = vec![];
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, data);
assert_eq!(info, ConnectHeader::Missing);
assert_eq!(addr, None);
}
#[tokio::test]
async fn test_short() {
let data = [0x55; 10];
let (mut read, info) = read_proxy_protocol(data.as_slice()).await.unwrap();
let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap();
let mut bytes = vec![];
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, data);
assert_eq!(info, ConnectHeader::Missing);
assert_eq!(addr, None);
}
#[tokio::test]
async fn test_large_tlv() {
let tlv = vec![0x55; 32768];
let tlv_len = (tlv.len() as u16).to_be_bytes();
let len = (12 + 3 + tlv.len() as u16).to_be_bytes();
let len = (12 + tlv.len() as u16).to_be_bytes();
let header = super::SIGNATURE
let header = super::HEADER
// Proxy command, Inet << 4 | Stream
.chain([PROXY_V2, TCP_OVER_IPV4].as_slice())
.chain([(2 << 4) | 1, (1 << 4) | 1].as_slice())
// 12 + 3 bytes
.chain(len.as_slice())
// src ip
@@ -520,13 +330,11 @@ mod tests {
// dst port
.chain([1, 1].as_slice())
// TLV
.chain([255].as_slice())
.chain(tlv_len.as_slice())
.chain(tlv.as_slice());
let extra_data = [0xaa; 256];
let (mut read, info) = read_proxy_protocol(header.chain(extra_data.as_slice()))
let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice()))
.await
.unwrap();
@@ -534,31 +342,6 @@ mod tests {
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, extra_data);
let ConnectHeader::Proxy(info) = info else {
panic!()
};
assert_eq!(info.addr, ([55, 56, 57, 58], 65535).into());
}
#[tokio::test]
async fn test_local() {
let len = 0u16.to_be_bytes();
let header = super::SIGNATURE
.chain([LOCAL_V2, 0x00].as_slice())
.chain(len.as_slice());
let extra_data = [0xaa; 256];
let (mut read, info) = read_proxy_protocol(header.chain(extra_data.as_slice()))
.await
.unwrap();
let mut bytes = vec![];
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, extra_data);
let ConnectHeader::Local = info else { panic!() };
assert_eq!(addr, Some(([55, 56, 57, 58], 65535).into()));
}
}

View File

@@ -19,7 +19,7 @@ use smol_str::{format_smolstr, SmolStr};
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, warn, Instrument};
use tracing::{error, info, warn, Instrument};
use self::connect_compute::{connect_to_compute, TcpMechanism};
use self::passthrough::ProxyPassthrough;
@@ -28,7 +28,7 @@ use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig};
use crate::context::RequestMonitoring;
use crate::error::ReportableError;
use crate::metrics::{Metrics, NumClientConnectionsGuard};
use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo};
use crate::protocol2::read_proxy_protocol;
use crate::proxy::handshake::{handshake, HandshakeData};
use crate::rate_limiter::EndpointRateLimiter;
use crate::stream::{PqStream, Stream};
@@ -83,30 +83,25 @@ pub async fn task_main(
let session_id = uuid::Uuid::new_v4();
let cancellation_handler = Arc::clone(&cancellation_handler);
debug!(protocol = "tcp", %session_id, "accepted new TCP connection");
tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection");
let endpoint_rate_limiter2 = endpoint_rate_limiter.clone();
connections.spawn(async move {
let (socket, conn_info) = match read_proxy_protocol(socket).await {
let (socket, peer_addr) = match read_proxy_protocol(socket).await {
Err(e) => {
warn!("per-client task finished with an error: {e:#}");
return;
}
// our load balancers will not send any more data. let's just exit immediately
Ok((_socket, ConnectHeader::Local)) => {
debug!("healthcheck received");
return;
}
Ok((_socket, ConnectHeader::Missing)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
warn!("missing required proxy protocol header");
return;
}
Ok((_socket, ConnectHeader::Proxy(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
warn!("proxy protocol header not supported");
return;
}
Ok((socket, ConnectHeader::Proxy(info))) => (socket, info),
Ok((socket, ConnectHeader::Missing)) => (socket, ConnectionInfo { addr: peer_addr, extra: None }),
Ok((socket, Some(addr))) => (socket, addr.ip()),
Ok((socket, None)) => (socket, peer_addr.ip()),
};
match socket.inner.set_nodelay(true) {
@@ -119,7 +114,7 @@ pub async fn task_main(
let ctx = RequestMonitoring::new(
session_id,
conn_info,
peer_addr,
crate::metrics::Protocol::Tcp,
&config.region,
);

View File

@@ -9,25 +9,24 @@ use async_trait::async_trait;
use http::StatusCode;
use retry::{retry_after, ShouldRetryWakeCompute};
use rstest::rstest;
use rustls::crypto::ring;
use rustls::crypto::aws_lc_rs;
use rustls::pki_types;
use tokio::io::DuplexStream;
use tokio_postgres::config::SslMode;
use tokio_postgres::tls::{MakeTlsConnect, NoTls};
use tokio_postgres_rustls::MakeRustlsConnect;
use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream};
use super::connect_compute::ConnectMechanism;
use super::retry::CouldRetry;
use super::*;
use crate::auth::backend::{
ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned,
ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend,
};
use crate::config::{CertResolver, RetryConfig};
use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient};
use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status};
use crate::control_plane::{
self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, NodeInfoCache,
use crate::control_plane::messages::{ControlPlaneError, Details, MetricsAuxInfo, Status};
use crate::control_plane::provider::{
CachedAllowedIps, CachedRoleSecret, ControlPlaneBackend, NodeInfoCache,
};
use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
use crate::error::ErrorKind;
use crate::types::{BranchId, EndpointId, ProjectId};
use crate::{sasl, scram};
@@ -70,12 +69,19 @@ struct ClientConfig<'a> {
hostname: &'a str,
}
type TlsConnect<S> = <MakeRustlsConnect as MakeTlsConnect<S>>::TlsConnect;
impl ClientConfig<'_> {
fn make_tls_connect(self) -> anyhow::Result<TlsConnect<DuplexStream>> {
fn make_tls_connect<S: AsyncRead + AsyncWrite + Unpin + Send + 'static>(
self,
) -> anyhow::Result<
impl tokio_postgres::tls::TlsConnect<
S,
Error = impl std::fmt::Debug + use<S>,
Future = impl Send + use<S>,
Stream = RustlsStream<S>,
> + use<S>,
> {
let mut mk = MakeRustlsConnect::new(self.config);
let tls = MakeTlsConnect::<DuplexStream>::make_tls_connect(&mut mk, self.hostname)?;
let tls = MakeTlsConnect::<S>::make_tls_connect(&mut mk, self.hostname)?;
Ok(tls)
}
}
@@ -89,9 +95,9 @@ fn generate_tls_config<'a>(
let tls_config = {
let config =
rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider()))
rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
.with_safe_default_protocol_versions()
.context("ring should support the default protocol versions")?
.context("aws_lc_rs should support the default protocol versions")?
.with_no_client_auth()
.with_single_cert(vec![cert.clone()], key.clone_key())?
.into();
@@ -110,9 +116,9 @@ fn generate_tls_config<'a>(
let client_config = {
let config =
rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider()))
rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
.with_safe_default_protocol_versions()
.context("ring should support the default protocol versions")?
.context("aws_lc_rs should support the default protocol versions")?
.with_root_certificates({
let mut store = rustls::RootCertStore::empty();
store.add(ca)?;
@@ -490,7 +496,7 @@ impl ConnectMechanism for TestConnectMechanism {
fn update_connect_config(&self, _conf: &mut compute::ConnCfg) {}
}
impl TestControlPlaneClient for TestConnectMechanism {
impl TestBackend for TestConnectMechanism {
fn wake_compute(&self) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
let mut counter = self.counter.lock().unwrap();
let action = self.sequence[*counter];
@@ -498,19 +504,18 @@ impl TestControlPlaneClient for TestConnectMechanism {
match action {
ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)),
ConnectAction::WakeFail => {
let err = control_plane::errors::ControlPlaneError::Message(Box::new(
ControlPlaneErrorMessage {
let err =
control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError {
http_status_code: StatusCode::BAD_REQUEST,
error: "TEST".into(),
status: None,
},
));
}));
assert!(!err.could_retry());
Err(control_plane::errors::WakeComputeError::ControlPlane(err))
Err(control_plane::errors::WakeComputeError::ApiError(err))
}
ConnectAction::WakeRetry => {
let err = control_plane::errors::ControlPlaneError::Message(Box::new(
ControlPlaneErrorMessage {
let err =
control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError {
http_status_code: StatusCode::BAD_REQUEST,
error: "TEST".into(),
status: Some(Status {
@@ -524,10 +529,9 @@ impl TestControlPlaneClient for TestConnectMechanism {
user_facing_message: None,
},
}),
},
));
}));
assert!(err.could_retry());
Err(control_plane::errors::WakeComputeError::ControlPlane(err))
Err(control_plane::errors::WakeComputeError::ApiError(err))
}
x => panic!("expecting action {x:?}, wake_compute is called instead"),
}
@@ -540,7 +544,7 @@ impl TestControlPlaneClient for TestConnectMechanism {
unimplemented!("not used in tests")
}
fn dyn_clone(&self) -> Box<dyn TestControlPlaneClient> {
fn dyn_clone(&self) -> Box<dyn TestBackend> {
Box::new(self.clone())
}
}
@@ -564,7 +568,7 @@ fn helper_create_connect_info(
mechanism: &TestConnectMechanism,
) -> auth::Backend<'static, ComputeCredentials> {
let user_info = auth::Backend::ControlPlane(
MaybeOwned::Owned(ControlPlaneClient::Test(Box::new(mechanism.clone()))),
MaybeOwned::Owned(ControlPlaneBackend::Test(Box::new(mechanism.clone()))),
ComputeCredentials {
info: ComputeUserInfo {
endpoint: "endpoint".into(),

View File

@@ -4,7 +4,7 @@ use super::connect_compute::ComputeConnectBackend;
use crate::config::RetryConfig;
use crate::context::RequestMonitoring;
use crate::control_plane::errors::WakeComputeError;
use crate::control_plane::CachedNodeInfo;
use crate::control_plane::provider::CachedNodeInfo;
use crate::error::ReportableError;
use crate::metrics::{
ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType,

View File

@@ -14,7 +14,7 @@ use tracing::{debug, info};
use super::conn_pool::poll_client;
use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool};
use super::http_conn_pool::{self, poll_http2_client, Send};
use super::local_conn_pool::{self, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION};
use super::local_conn_pool::{self, LocalClient, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION};
use crate::auth::backend::local::StaticAuthRules;
use crate::auth::backend::{ComputeCredentials, ComputeUserInfo};
use crate::auth::{self, check_peer_addr_is_in_list, AuthError};
@@ -24,9 +24,9 @@ use crate::compute_ctl::{
};
use crate::config::ProxyConfig;
use crate::context::RequestMonitoring;
use crate::control_plane::client::ApiLockError;
use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError};
use crate::control_plane::locks::ApiLocks;
use crate::control_plane::provider::ApiLockError;
use crate::control_plane::CachedNodeInfo;
use crate::error::{ErrorKind, ReportableError, UserFacingError};
use crate::intern::EndpointIdInt;
@@ -81,7 +81,7 @@ impl PoolingBackend {
None => {
// If we don't have an authentication secret, for the http flow we can just return an error.
info!("authentication info not found");
return Err(AuthError::password_failed(&*user_info.user));
return Err(AuthError::auth_failed(&*user_info.user));
}
};
let ep = EndpointIdInt::from(&user_info.endpoint);
@@ -99,7 +99,7 @@ impl PoolingBackend {
}
crate::sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
Err(AuthError::password_failed(&*user_info.user))
Err(AuthError::auth_failed(&*user_info.user))
}
};
res.map(|key| ComputeCredentials {
@@ -126,7 +126,8 @@ impl PoolingBackend {
&**console,
&jwt,
)
.await?;
.await
.map_err(|e| AuthError::auth_failed(e.to_string()))?;
Ok(ComputeCredentials {
info: user_info.clone(),
@@ -145,7 +146,8 @@ impl PoolingBackend {
&StaticAuthRules,
&jwt,
)
.await?;
.await
.map_err(|e| AuthError::auth_failed(e.to_string()))?;
Ok(ComputeCredentials {
info: user_info.clone(),
@@ -205,7 +207,7 @@ impl PoolingBackend {
conn_info: ConnInfo,
) -> Result<http_conn_pool::Client<Send>, HttpConnError> {
info!("pool: looking for an existing connection");
if let Ok(Some(client)) = self.http_conn_pool.get(ctx, &conn_info) {
if let Some(client) = self.http_conn_pool.get(ctx, &conn_info) {
return Ok(client);
}
@@ -248,7 +250,7 @@ impl PoolingBackend {
&self,
ctx: &RequestMonitoring,
conn_info: ConnInfo,
) -> Result<Client<tokio_postgres::Client>, HttpConnError> {
) -> Result<LocalClient<tokio_postgres::Client>, HttpConnError> {
if let Some(client) = self.local_pool.get(ctx, &conn_info)? {
return Ok(client);
}

View File

@@ -18,9 +18,7 @@ use {
std::{sync::atomic, time::Duration},
};
use super::conn_pool_lib::{
Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, GlobalConnPool,
};
use super::conn_pool_lib::{Client, ClientInnerExt, ConnInfo, GlobalConnPool};
use crate::context::RequestMonitoring;
use crate::control_plane::messages::MetricsAuxInfo;
use crate::metrics::Metrics;
@@ -154,30 +152,53 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
}
.instrument(span));
let inner = ClientInnerCommon {
let inner = ClientInnerRemote {
inner: client,
session: tx,
cancel,
aux,
conn_id,
data: ClientDataEnum::Remote(ClientDataRemote {
session: tx,
cancel,
}),
};
Client::new(inner, conn_info, pool_clone)
}
pub(crate) struct ClientDataRemote {
pub(crate) struct ClientInnerRemote<C: ClientInnerExt> {
inner: C,
session: tokio::sync::watch::Sender<uuid::Uuid>,
cancel: CancellationToken,
aux: MetricsAuxInfo,
conn_id: uuid::Uuid,
}
impl ClientDataRemote {
pub fn session(&mut self) -> &mut tokio::sync::watch::Sender<uuid::Uuid> {
impl<C: ClientInnerExt> ClientInnerRemote<C> {
pub(crate) fn inner_mut(&mut self) -> &mut C {
&mut self.inner
}
pub(crate) fn inner(&self) -> &C {
&self.inner
}
pub(crate) fn session(&mut self) -> &mut tokio::sync::watch::Sender<uuid::Uuid> {
&mut self.session
}
pub fn cancel(&mut self) {
pub(crate) fn aux(&self) -> &MetricsAuxInfo {
&self.aux
}
pub(crate) fn get_conn_id(&self) -> uuid::Uuid {
self.conn_id
}
pub(crate) fn is_closed(&self) -> bool {
self.inner.is_closed()
}
}
impl<C: ClientInnerExt> Drop for ClientInnerRemote<C> {
fn drop(&mut self) {
// on client drop, tell the conn to shut down
self.cancel.cancel();
}
}
@@ -207,13 +228,15 @@ mod tests {
}
}
fn create_inner() -> ClientInnerCommon<MockClient> {
fn create_inner() -> ClientInnerRemote<MockClient> {
create_inner_with(MockClient::new(false))
}
fn create_inner_with(client: MockClient) -> ClientInnerCommon<MockClient> {
ClientInnerCommon {
fn create_inner_with(client: MockClient) -> ClientInnerRemote<MockClient> {
ClientInnerRemote {
inner: client,
session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()),
cancel: CancellationToken::new(),
aux: MetricsAuxInfo {
endpoint_id: (&EndpointId::from("endpoint")).into(),
project_id: (&ProjectId::from("project")).into(),
@@ -221,10 +244,6 @@ mod tests {
cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm,
},
conn_id: uuid::Uuid::new_v4(),
data: ClientDataEnum::Remote(ClientDataRemote {
session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()),
cancel: CancellationToken::new(),
}),
}
}
@@ -261,7 +280,7 @@ mod tests {
{
let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
assert_eq!(0, pool.get_global_connections_count());
client.inner().1.discard();
client.inner_mut().1.discard();
// Discard should not add the connection from the pool.
assert_eq!(0, pool.get_global_connections_count());
}

View File

@@ -11,12 +11,10 @@ use tokio_postgres::ReadyForQueryStatus;
use tracing::{debug, info, Span};
use super::backend::HttpConnError;
use super::conn_pool::ClientDataRemote;
use super::http_conn_pool::ClientDataHttp;
use super::local_conn_pool::ClientDataLocal;
use super::conn_pool::ClientInnerRemote;
use crate::auth::backend::ComputeUserInfo;
use crate::context::RequestMonitoring;
use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
use crate::control_plane::messages::ColdStartInfo;
use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
use crate::types::{DbName, EndpointCacheKey, RoleName};
use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
@@ -43,46 +41,8 @@ impl ConnInfo {
}
}
pub(crate) enum ClientDataEnum {
Remote(ClientDataRemote),
Local(ClientDataLocal),
#[allow(dead_code)]
Http(ClientDataHttp),
}
pub(crate) struct ClientInnerCommon<C: ClientInnerExt> {
pub(crate) inner: C,
pub(crate) aux: MetricsAuxInfo,
pub(crate) conn_id: uuid::Uuid,
pub(crate) data: ClientDataEnum, // custom client data like session, key, jti
}
impl<C: ClientInnerExt> Drop for ClientInnerCommon<C> {
fn drop(&mut self) {
match &mut self.data {
ClientDataEnum::Remote(remote_data) => {
remote_data.cancel();
}
ClientDataEnum::Local(local_data) => {
local_data.cancel();
}
ClientDataEnum::Http(_http_data) => (),
}
}
}
impl<C: ClientInnerExt> ClientInnerCommon<C> {
pub(crate) fn get_conn_id(&self) -> uuid::Uuid {
self.conn_id
}
pub(crate) fn get_data(&mut self) -> &mut ClientDataEnum {
&mut self.data
}
}
pub(crate) struct ConnPoolEntry<C: ClientInnerExt> {
pub(crate) conn: ClientInnerCommon<C>,
pub(crate) conn: ClientInnerRemote<C>,
pub(crate) _last_access: std::time::Instant,
}
@@ -95,33 +55,10 @@ pub(crate) struct EndpointConnPool<C: ClientInnerExt> {
_guard: HttpEndpointPoolsGuard<'static>,
global_connections_count: Arc<AtomicUsize>,
global_pool_size_max_conns: usize,
pool_name: String,
}
impl<C: ClientInnerExt> EndpointConnPool<C> {
pub(crate) fn new(
hmap: HashMap<(DbName, RoleName), DbUserConnPool<C>>,
tconns: usize,
max_conns_per_endpoint: usize,
global_connections_count: Arc<AtomicUsize>,
max_total_conns: usize,
pname: String,
) -> Self {
Self {
pools: hmap,
total_conns: tconns,
max_conns: max_conns_per_endpoint,
_guard: Metrics::get().proxy.http_endpoint_pools.guard(),
global_connections_count,
global_pool_size_max_conns: max_total_conns,
pool_name: pname,
}
}
pub(crate) fn get_conn_entry(
&mut self,
db_user: (DbName, RoleName),
) -> Option<ConnPoolEntry<C>> {
fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option<ConnPoolEntry<C>> {
let Self {
pools,
total_conns,
@@ -147,10 +84,9 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
..
} = self;
if let Some(pool) = pools.get_mut(&db_user) {
let old_len = pool.get_conns().len();
pool.get_conns()
.retain(|conn| conn.conn.get_conn_id() != conn_id);
let new_len = pool.get_conns().len();
let old_len = pool.conns.len();
pool.conns.retain(|conn| conn.conn.get_conn_id() != conn_id);
let new_len = pool.conns.len();
let removed = old_len - new_len;
if removed > 0 {
global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed);
@@ -167,26 +103,11 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
}
}
pub(crate) fn get_name(&self) -> &str {
&self.pool_name
}
pub(crate) fn get_pool(&self, db_user: (DbName, RoleName)) -> Option<&DbUserConnPool<C>> {
self.pools.get(&db_user)
}
pub(crate) fn get_pool_mut(
&mut self,
db_user: (DbName, RoleName),
) -> Option<&mut DbUserConnPool<C>> {
self.pools.get_mut(&db_user)
}
pub(crate) fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInnerCommon<C>) {
pub(crate) fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInnerRemote<C>) {
let conn_id = client.get_conn_id();
let pool_name = pool.read().get_name().to_string();
if client.inner.is_closed() {
info!(%conn_id, "{}: throwing away connection '{conn_info}' because connection is closed", pool_name);
if client.is_closed() {
info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
return;
}
@@ -197,7 +118,7 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
.load(atomic::Ordering::Relaxed)
>= global_max_conn
{
info!(%conn_id, "{}: throwing away connection '{conn_info}' because pool is full", pool_name);
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full");
return;
}
@@ -209,13 +130,13 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
if pool.total_conns < pool.max_conns {
let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default();
pool_entries.get_conns().push(ConnPoolEntry {
pool_entries.conns.push(ConnPoolEntry {
conn: client,
_last_access: std::time::Instant::now(),
});
returned = true;
per_db_size = pool_entries.get_conns().len();
per_db_size = pool_entries.conns.len();
pool.total_conns += 1;
pool.global_connections_count
@@ -232,9 +153,9 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
// do logging outside of the mutex
if returned {
info!(%conn_id, "{pool_name}: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
} else {
info!(%conn_id, "{pool_name}: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
}
}
}
@@ -255,39 +176,19 @@ impl<C: ClientInnerExt> Drop for EndpointConnPool<C> {
pub(crate) struct DbUserConnPool<C: ClientInnerExt> {
pub(crate) conns: Vec<ConnPoolEntry<C>>,
pub(crate) initialized: Option<bool>, // a bit ugly, exists only for local pools
}
impl<C: ClientInnerExt> Default for DbUserConnPool<C> {
fn default() -> Self {
Self {
conns: Vec::new(),
initialized: None,
}
Self { conns: Vec::new() }
}
}
pub(crate) trait DbUserConn<C: ClientInnerExt>: Default {
fn set_initialized(&mut self);
fn is_initialized(&self) -> bool;
fn clear_closed_clients(&mut self, conns: &mut usize) -> usize;
fn get_conn_entry(&mut self, conns: &mut usize) -> (Option<ConnPoolEntry<C>>, usize);
fn get_conns(&mut self) -> &mut Vec<ConnPoolEntry<C>>;
}
impl<C: ClientInnerExt> DbUserConn<C> for DbUserConnPool<C> {
fn set_initialized(&mut self) {
self.initialized = Some(true);
}
fn is_initialized(&self) -> bool {
self.initialized.unwrap_or(false)
}
impl<C: ClientInnerExt> DbUserConnPool<C> {
fn clear_closed_clients(&mut self, conns: &mut usize) -> usize {
let old_len = self.conns.len();
self.conns.retain(|conn| !conn.conn.inner.is_closed());
self.conns.retain(|conn| !conn.conn.is_closed());
let new_len = self.conns.len();
let removed = old_len - new_len;
@@ -295,7 +196,10 @@ impl<C: ClientInnerExt> DbUserConn<C> for DbUserConnPool<C> {
removed
}
fn get_conn_entry(&mut self, conns: &mut usize) -> (Option<ConnPoolEntry<C>>, usize) {
pub(crate) fn get_conn_entry(
&mut self,
conns: &mut usize,
) -> (Option<ConnPoolEntry<C>>, usize) {
let mut removed = self.clear_closed_clients(conns);
let conn = self.conns.pop();
if conn.is_some() {
@@ -311,10 +215,6 @@ impl<C: ClientInnerExt> DbUserConn<C> for DbUserConnPool<C> {
(conn, removed)
}
fn get_conns(&mut self) -> &mut Vec<ConnPoolEntry<C>> {
&mut self.conns
}
}
pub(crate) struct GlobalConnPool<C: ClientInnerExt> {
@@ -378,60 +278,6 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
self.config.pool_options.idle_timeout
}
pub(crate) fn get(
self: &Arc<Self>,
ctx: &RequestMonitoring,
conn_info: &ConnInfo,
) -> Result<Option<Client<C>>, HttpConnError> {
let mut client: Option<ClientInnerCommon<C>> = None;
let Some(endpoint) = conn_info.endpoint_cache_key() else {
return Ok(None);
};
let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint);
if let Some(entry) = endpoint_pool
.write()
.get_conn_entry(conn_info.db_and_user())
{
client = Some(entry.conn);
}
let endpoint_pool = Arc::downgrade(&endpoint_pool);
// ok return cached connection if found and establish a new one otherwise
if let Some(mut client) = client {
if client.inner.is_closed() {
info!("pool: cached connection '{conn_info}' is closed, opening a new one");
return Ok(None);
}
tracing::Span::current()
.record("conn_id", tracing::field::display(client.get_conn_id()));
tracing::Span::current().record(
"pid",
tracing::field::display(client.inner.get_process_id()),
);
info!(
cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
"pool: reusing connection '{conn_info}'"
);
match client.get_data() {
ClientDataEnum::Local(data) => {
data.session().send(ctx.session_id())?;
}
ClientDataEnum::Remote(data) => {
data.session().send(ctx.session_id())?;
}
ClientDataEnum::Http(_) => (),
}
ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
ctx.success();
return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
}
Ok(None)
}
pub(crate) fn shutdown(&self) {
// drops all strong references to endpoint-pools
self.global_pool.clear();
@@ -528,7 +374,6 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
_guard: Metrics::get().proxy.http_endpoint_pools.guard(),
global_connections_count: self.global_connections_count.clone(),
global_pool_size_max_conns: self.config.pool_options.max_total_conns,
pool_name: String::from("remote"),
}));
// find or create a pool for this endpoint
@@ -555,23 +400,55 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
pool
}
}
pub(crate) struct Client<C: ClientInnerExt> {
span: Span,
inner: Option<ClientInnerCommon<C>>,
conn_info: ConnInfo,
pool: Weak<RwLock<EndpointConnPool<C>>>,
}
pub(crate) fn get(
self: &Arc<Self>,
ctx: &RequestMonitoring,
conn_info: &ConnInfo,
) -> Result<Option<Client<C>>, HttpConnError> {
let mut client: Option<ClientInnerRemote<C>> = None;
let Some(endpoint) = conn_info.endpoint_cache_key() else {
return Ok(None);
};
pub(crate) struct Discard<'a, C: ClientInnerExt> {
conn_info: &'a ConnInfo,
pool: &'a mut Weak<RwLock<EndpointConnPool<C>>>,
let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint);
if let Some(entry) = endpoint_pool
.write()
.get_conn_entry(conn_info.db_and_user())
{
client = Some(entry.conn);
}
let endpoint_pool = Arc::downgrade(&endpoint_pool);
// ok return cached connection if found and establish a new one otherwise
if let Some(mut client) = client {
if client.is_closed() {
info!("pool: cached connection '{conn_info}' is closed, opening a new one");
return Ok(None);
}
tracing::Span::current()
.record("conn_id", tracing::field::display(client.get_conn_id()));
tracing::Span::current().record(
"pid",
tracing::field::display(client.inner().get_process_id()),
);
info!(
cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
"pool: reusing connection '{conn_info}'"
);
client.session().send(ctx.session_id())?;
ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
ctx.success();
return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
}
Ok(None)
}
}
impl<C: ClientInnerExt> Client<C> {
pub(crate) fn new(
inner: ClientInnerCommon<C>,
inner: ClientInnerRemote<C>,
conn_info: ConnInfo,
pool: Weak<RwLock<EndpointConnPool<C>>>,
) -> Self {
@@ -583,18 +460,7 @@ impl<C: ClientInnerExt> Client<C> {
}
}
pub(crate) fn client_inner(&mut self) -> (&mut ClientInnerCommon<C>, Discard<'_, C>) {
let Self {
inner,
pool,
conn_info,
span: _,
} = self;
let inner_m = inner.as_mut().expect("client inner should not be removed");
(inner_m, Discard { conn_info, pool })
}
pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) {
pub(crate) fn inner_mut(&mut self) -> (&mut C, Discard<'_, C>) {
let Self {
inner,
pool,
@@ -602,11 +468,12 @@ impl<C: ClientInnerExt> Client<C> {
span: _,
} = self;
let inner = inner.as_mut().expect("client inner should not be removed");
(&mut inner.inner, Discard { conn_info, pool })
let inner_ref = inner.inner_mut();
(inner_ref, Discard { conn_info, pool })
}
pub(crate) fn metrics(&self) -> Arc<MetricCounter> {
let aux = &self.inner.as_ref().unwrap().aux;
let aux = &self.inner.as_ref().unwrap().aux();
USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id,
branch_id: aux.branch_id,
@@ -631,6 +498,13 @@ impl<C: ClientInnerExt> Client<C> {
}
}
pub(crate) struct Client<C: ClientInnerExt> {
span: Span,
inner: Option<ClientInnerRemote<C>>,
conn_info: ConnInfo,
pool: Weak<RwLock<EndpointConnPool<C>>>,
}
impl<C: ClientInnerExt> Drop for Client<C> {
fn drop(&mut self) {
if let Some(drop) = self.do_drop() {
@@ -643,11 +517,10 @@ impl<C: ClientInnerExt> Deref for Client<C> {
type Target = C;
fn deref(&self) -> &Self::Target {
&self
.inner
self.inner
.as_ref()
.expect("client inner should not be removed")
.inner
.inner()
}
}
@@ -666,6 +539,11 @@ impl ClientInnerExt for tokio_postgres::Client {
}
}
pub(crate) struct Discard<'a, C: ClientInnerExt> {
conn_info: &'a ConnInfo,
pool: &'a mut Weak<RwLock<EndpointConnPool<C>>>,
}
impl<C: ClientInnerExt> Discard<'_, C> {
pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) {
let conn_info = &self.conn_info;

View File

@@ -10,7 +10,6 @@ use rand::Rng;
use tokio::net::TcpStream;
use tracing::{debug, error, info, info_span, Instrument};
use super::backend::HttpConnError;
use super::conn_pool_lib::{ClientInnerExt, ConnInfo};
use crate::context::RequestMonitoring;
use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
@@ -29,8 +28,6 @@ pub(crate) struct ConnPoolEntry<C: ClientInnerExt + Clone> {
aux: MetricsAuxInfo,
}
pub(crate) struct ClientDataHttp();
// Per-endpoint connection pool
// Number of open connections is limited by the `max_conns_per_endpoint`.
pub(crate) struct EndpointConnPool<C: ClientInnerExt + Clone> {
@@ -209,22 +206,14 @@ impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
}
}
#[expect(unused_results)]
pub(crate) fn get(
self: &Arc<Self>,
ctx: &RequestMonitoring,
conn_info: &ConnInfo,
) -> Result<Option<Client<C>>, HttpConnError> {
let result: Result<Option<Client<C>>, HttpConnError>;
let Some(endpoint) = conn_info.endpoint_cache_key() else {
result = Ok(None);
return result;
};
) -> Option<Client<C>> {
let endpoint = conn_info.endpoint_cache_key()?;
let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint);
let Some(client) = endpoint_pool.write().get_conn_entry() else {
result = Ok(None);
return result;
};
let client = endpoint_pool.write().get_conn_entry()?;
tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id));
info!(
@@ -233,7 +222,7 @@ impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
);
ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
ctx.success();
Ok(Some(Client::new(client.conn, client.aux)))
Some(Client::new(client.conn, client.aux))
}
fn get_or_create_endpoint_pool(
@@ -305,11 +294,6 @@ pub(crate) fn poll_http2_client(
conn_id,
aux: aux.clone(),
});
Metrics::get()
.proxy
.http_pool_opened_connections
.get_metric()
.inc();
Arc::downgrade(&pool)
}
@@ -322,7 +306,7 @@ pub(crate) fn poll_http2_client(
let res = connection.await;
match res {
Ok(()) => info!("connection closed"),
Err(e) => error!(%session_id, "connection error: {e:?}"),
Err(e) => error!(%session_id, "connection error: {}", e),
}
// remove from connection pool

View File

@@ -11,8 +11,7 @@
use std::collections::HashMap;
use std::pin::pin;
use std::sync::atomic::AtomicUsize;
use std::sync::Arc;
use std::sync::{Arc, Weak};
use std::task::{ready, Poll};
use std::time::Duration;
@@ -27,42 +26,177 @@ use signature::Signer;
use tokio::time::Instant;
use tokio_postgres::tls::NoTlsStream;
use tokio_postgres::types::ToSql;
use tokio_postgres::{AsyncMessage, Socket};
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, warn, Instrument};
use tracing::{error, info, info_span, warn, Instrument, Span};
use super::backend::HttpConnError;
use super::conn_pool_lib::{
Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, DbUserConn,
EndpointConnPool,
};
use super::conn_pool_lib::{ClientInnerExt, ConnInfo};
use crate::context::RequestMonitoring;
use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
use crate::metrics::Metrics;
use crate::types::{DbName, RoleName};
use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
pub(crate) const EXT_NAME: &str = "pg_session_jwt";
pub(crate) const EXT_VERSION: &str = "0.1.2";
pub(crate) const EXT_SCHEMA: &str = "auth";
pub(crate) struct ClientDataLocal {
session: tokio::sync::watch::Sender<uuid::Uuid>,
cancel: CancellationToken,
key: SigningKey,
jti: u64,
struct ConnPoolEntry<C: ClientInnerExt> {
conn: ClientInner<C>,
_last_access: std::time::Instant,
}
impl ClientDataLocal {
pub fn session(&mut self) -> &mut tokio::sync::watch::Sender<uuid::Uuid> {
&mut self.session
// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
// Number of open connections is limited by the `max_conns_per_endpoint`.
pub(crate) struct EndpointConnPool<C: ClientInnerExt> {
pools: HashMap<(DbName, RoleName), DbUserConnPool<C>>,
total_conns: usize,
max_conns: usize,
global_pool_size_max_conns: usize,
}
impl<C: ClientInnerExt> EndpointConnPool<C> {
fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option<ConnPoolEntry<C>> {
let Self {
pools, total_conns, ..
} = self;
pools
.get_mut(&db_user)
.and_then(|pool_entries| pool_entries.get_conn_entry(total_conns))
}
pub fn cancel(&mut self) {
self.cancel.cancel();
fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool {
let Self {
pools, total_conns, ..
} = self;
if let Some(pool) = pools.get_mut(&db_user) {
let old_len = pool.conns.len();
pool.conns.retain(|conn| conn.conn.conn_id != conn_id);
let new_len = pool.conns.len();
let removed = old_len - new_len;
if removed > 0 {
Metrics::get()
.proxy
.http_pool_opened_connections
.get_metric()
.dec_by(removed as i64);
}
*total_conns -= removed;
removed > 0
} else {
false
}
}
fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInner<C>) {
let conn_id = client.conn_id;
if client.is_closed() {
info!(%conn_id, "local_pool: throwing away connection '{conn_info}' because connection is closed");
return;
}
let global_max_conn = pool.read().global_pool_size_max_conns;
if pool.read().total_conns >= global_max_conn {
info!(%conn_id, "local_pool: throwing away connection '{conn_info}' because pool is full");
return;
}
// return connection to the pool
let mut returned = false;
let mut per_db_size = 0;
let total_conns = {
let mut pool = pool.write();
if pool.total_conns < pool.max_conns {
let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default();
pool_entries.conns.push(ConnPoolEntry {
conn: client,
_last_access: std::time::Instant::now(),
});
returned = true;
per_db_size = pool_entries.conns.len();
pool.total_conns += 1;
Metrics::get()
.proxy
.http_pool_opened_connections
.get_metric()
.inc();
}
pool.total_conns
};
// do logging outside of the mutex
if returned {
info!(%conn_id, "local_pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
} else {
info!(%conn_id, "local_pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
}
}
}
impl<C: ClientInnerExt> Drop for EndpointConnPool<C> {
fn drop(&mut self) {
if self.total_conns > 0 {
Metrics::get()
.proxy
.http_pool_opened_connections
.get_metric()
.dec_by(self.total_conns as i64);
}
}
}
pub(crate) struct DbUserConnPool<C: ClientInnerExt> {
conns: Vec<ConnPoolEntry<C>>,
// true if we have definitely installed the extension and
// granted the role access to the auth schema.
initialized: bool,
}
impl<C: ClientInnerExt> Default for DbUserConnPool<C> {
fn default() -> Self {
Self {
conns: Vec::new(),
initialized: false,
}
}
}
impl<C: ClientInnerExt> DbUserConnPool<C> {
fn clear_closed_clients(&mut self, conns: &mut usize) -> usize {
let old_len = self.conns.len();
self.conns.retain(|conn| !conn.conn.is_closed());
let new_len = self.conns.len();
let removed = old_len - new_len;
*conns -= removed;
removed
}
fn get_conn_entry(&mut self, conns: &mut usize) -> Option<ConnPoolEntry<C>> {
let mut removed = self.clear_closed_clients(conns);
let conn = self.conns.pop();
if conn.is_some() {
*conns -= 1;
removed += 1;
}
Metrics::get()
.proxy
.http_pool_opened_connections
.get_metric()
.dec_by(removed as i64);
conn
}
}
pub(crate) struct LocalConnPool<C: ClientInnerExt> {
global_pool: Arc<RwLock<EndpointConnPool<C>>>,
global_pool: RwLock<EndpointConnPool<C>>,
config: &'static crate::config::HttpConfig,
}
@@ -70,14 +204,12 @@ pub(crate) struct LocalConnPool<C: ClientInnerExt> {
impl<C: ClientInnerExt> LocalConnPool<C> {
pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc<Self> {
Arc::new(Self {
global_pool: Arc::new(RwLock::new(EndpointConnPool::new(
HashMap::new(),
0,
config.pool_options.max_conns_per_endpoint,
Arc::new(AtomicUsize::new(0)),
config.pool_options.max_total_conns,
String::from("local_pool"),
))),
global_pool: RwLock::new(EndpointConnPool {
pools: HashMap::new(),
total_conns: 0,
max_conns: config.pool_options.max_conns_per_endpoint,
global_pool_size_max_conns: config.pool_options.max_total_conns,
}),
config,
})
}
@@ -90,7 +222,7 @@ impl<C: ClientInnerExt> LocalConnPool<C> {
self: &Arc<Self>,
ctx: &RequestMonitoring,
conn_info: &ConnInfo,
) -> Result<Option<Client<C>>, HttpConnError> {
) -> Result<Option<LocalClient<C>>, HttpConnError> {
let client = self
.global_pool
.write()
@@ -98,14 +230,12 @@ impl<C: ClientInnerExt> LocalConnPool<C> {
.map(|entry| entry.conn);
// ok return cached connection if found and establish a new one otherwise
if let Some(mut client) = client {
if client.inner.is_closed() {
if let Some(client) = client {
if client.is_closed() {
info!("local_pool: cached connection '{conn_info}' is closed, opening a new one");
return Ok(None);
}
tracing::Span::current()
.record("conn_id", tracing::field::display(client.get_conn_id()));
tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id));
tracing::Span::current().record(
"pid",
tracing::field::display(client.inner.get_process_id()),
@@ -114,59 +244,47 @@ impl<C: ClientInnerExt> LocalConnPool<C> {
cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
"local_pool: reusing connection '{conn_info}'"
);
match client.get_data() {
ClientDataEnum::Local(data) => {
data.session().send(ctx.session_id())?;
}
ClientDataEnum::Remote(data) => {
data.session().send(ctx.session_id())?;
}
ClientDataEnum::Http(_) => (),
}
client.session.send(ctx.session_id())?;
ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
ctx.success();
return Ok(Some(Client::new(
return Ok(Some(LocalClient::new(
client,
conn_info.clone(),
Arc::downgrade(&self.global_pool),
Arc::downgrade(self),
)));
}
Ok(None)
}
pub(crate) fn initialized(self: &Arc<Self>, conn_info: &ConnInfo) -> bool {
if let Some(pool) = self.global_pool.read().get_pool(conn_info.db_and_user()) {
return pool.is_initialized();
}
false
self.global_pool
.read()
.pools
.get(&conn_info.db_and_user())
.map_or(false, |pool| pool.initialized)
}
pub(crate) fn set_initialized(self: &Arc<Self>, conn_info: &ConnInfo) {
if let Some(pool) = self
.global_pool
self.global_pool
.write()
.get_pool_mut(conn_info.db_and_user())
{
pool.set_initialized();
}
.pools
.entry(conn_info.db_and_user())
.or_default()
.initialized = true;
}
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn poll_client<C: ClientInnerExt>(
global_pool: Arc<LocalConnPool<C>>,
pub(crate) fn poll_client(
global_pool: Arc<LocalConnPool<tokio_postgres::Client>>,
ctx: &RequestMonitoring,
conn_info: ConnInfo,
client: C,
client: tokio_postgres::Client,
mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
key: SigningKey,
conn_id: uuid::Uuid,
aux: MetricsAuxInfo,
) -> Client<C> {
) -> LocalClient<tokio_postgres::Client> {
let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol());
let mut session_id = ctx.session_id();
let (tx, mut rx) = tokio::sync::watch::channel(session_id);
@@ -259,47 +377,111 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
}
.instrument(span));
let inner = ClientInnerCommon {
let inner = ClientInner {
inner: client,
session: tx,
cancel,
aux,
conn_id,
data: ClientDataEnum::Local(ClientDataLocal {
session: tx,
cancel,
key,
jti: 0,
}),
key,
jti: 0,
};
Client::new(
inner,
conn_info,
Arc::downgrade(&pool_clone.upgrade().unwrap().global_pool),
)
LocalClient::new(inner, conn_info, pool_clone)
}
impl ClientInnerCommon<tokio_postgres::Client> {
pub(crate) struct ClientInner<C: ClientInnerExt> {
inner: C,
session: tokio::sync::watch::Sender<uuid::Uuid>,
cancel: CancellationToken,
aux: MetricsAuxInfo,
conn_id: uuid::Uuid,
// needed for pg_session_jwt state
key: SigningKey,
jti: u64,
}
impl<C: ClientInnerExt> Drop for ClientInner<C> {
fn drop(&mut self) {
// on client drop, tell the conn to shut down
self.cancel.cancel();
}
}
impl<C: ClientInnerExt> ClientInner<C> {
pub(crate) fn is_closed(&self) -> bool {
self.inner.is_closed()
}
}
impl ClientInner<tokio_postgres::Client> {
pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> {
if let ClientDataEnum::Local(local_data) = &mut self.data {
local_data.jti += 1;
let token = resign_jwt(&local_data.key, payload, local_data.jti)?;
self.jti += 1;
let token = resign_jwt(&self.key, payload, self.jti)?;
// initiates the auth session
self.inner.simple_query("discard all").await?;
self.inner
.query(
"select auth.jwt_session_init($1)",
&[&token as &(dyn ToSql + Sync)],
)
.await?;
// initiates the auth session
self.inner.simple_query("discard all").await?;
self.inner
.query(
"select auth.jwt_session_init($1)",
&[&token as &(dyn ToSql + Sync)],
)
.await?;
let pid = self.inner.get_process_id();
info!(pid, jti = local_data.jti, "user session state init");
Ok(())
} else {
panic!("unexpected client data type");
let pid = self.inner.get_process_id();
info!(pid, jti = self.jti, "user session state init");
Ok(())
}
}
pub(crate) struct LocalClient<C: ClientInnerExt> {
span: Span,
inner: Option<ClientInner<C>>,
conn_info: ConnInfo,
pool: Weak<LocalConnPool<C>>,
}
pub(crate) struct Discard<'a, C: ClientInnerExt> {
conn_info: &'a ConnInfo,
pool: &'a mut Weak<LocalConnPool<C>>,
}
impl<C: ClientInnerExt> LocalClient<C> {
pub(self) fn new(
inner: ClientInner<C>,
conn_info: ConnInfo,
pool: Weak<LocalConnPool<C>>,
) -> Self {
Self {
inner: Some(inner),
span: Span::current(),
conn_info,
pool,
}
}
pub(crate) fn client_inner(&mut self) -> (&mut ClientInner<C>, Discard<'_, C>) {
let Self {
inner,
pool,
conn_info,
span: _,
} = self;
let inner_m = inner.as_mut().expect("client inner should not be removed");
(inner_m, Discard { conn_info, pool })
}
pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) {
let Self {
inner,
pool,
conn_info,
span: _,
} = self;
let inner = inner.as_mut().expect("client inner should not be removed");
(&mut inner.inner, Discard { conn_info, pool })
}
}
/// implements relatively efficient in-place json object key upserting
@@ -365,6 +547,58 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String {
jwt
}
impl<C: ClientInnerExt> LocalClient<C> {
pub(crate) fn metrics(&self) -> Arc<MetricCounter> {
let aux = &self.inner.as_ref().unwrap().aux;
USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id,
branch_id: aux.branch_id,
})
}
fn do_drop(&mut self) -> Option<impl FnOnce() + use<C>> {
let conn_info = self.conn_info.clone();
let client = self
.inner
.take()
.expect("client inner should not be removed");
if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() {
let current_span = self.span.clone();
// return connection to the pool
return Some(move || {
let _span = current_span.enter();
EndpointConnPool::put(&conn_pool.global_pool, &conn_info, client);
});
}
None
}
}
impl<C: ClientInnerExt> Drop for LocalClient<C> {
fn drop(&mut self) {
if let Some(drop) = self.do_drop() {
tokio::task::spawn_blocking(drop);
}
}
}
impl<C: ClientInnerExt> Discard<'_, C> {
pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) {
let conn_info = &self.conn_info;
if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
info!(
"local_pool: throwing away connection '{conn_info}' because connection is not idle"
);
}
}
pub(crate) fn discard(&mut self) {
let conn_info = &self.conn_info;
if std::mem::take(self.pool).strong_count() > 0 {
info!("local_pool: throwing away connection '{conn_info}' because connection is potentially in a broken state");
}
}
}
#[cfg(test)]
mod tests {
use p256::ecdsa::SigningKey;

View File

@@ -44,10 +44,10 @@ use tracing::{info, warn, Instrument};
use utils::http::error::ApiError;
use crate::cancellation::CancellationHandlerMain;
use crate::config::{ProxyConfig, ProxyProtocolV2};
use crate::config::ProxyConfig;
use crate::context::RequestMonitoring;
use crate::metrics::Metrics;
use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectHeader, ConnectionInfo};
use crate::protocol2::{read_proxy_protocol, ChainRW};
use crate::proxy::run_until_cancelled;
use crate::rate_limiter::EndpointRateLimiter;
use crate::serverless::backend::PoolingBackend;
@@ -180,7 +180,7 @@ pub async fn task_main(
peer_addr,
))
.await;
let Some((conn, conn_info)) = startup_result else {
let Some((conn, peer_addr)) = startup_result else {
return;
};
@@ -192,7 +192,7 @@ pub async fn task_main(
endpoint_rate_limiter,
conn_token,
conn,
conn_info,
peer_addr,
session_id,
))
.await;
@@ -240,7 +240,7 @@ async fn connection_startup(
session_id: uuid::Uuid,
conn: TcpStream,
peer_addr: SocketAddr,
) -> Option<(AsyncRW, ConnectionInfo)> {
) -> Option<(AsyncRW, IpAddr)> {
// handle PROXY protocol
let (conn, peer) = match read_proxy_protocol(conn).await {
Ok(c) => c,
@@ -250,37 +250,17 @@ async fn connection_startup(
}
};
let conn_info = match peer {
// our load balancers will not send any more data. let's just exit immediately
ConnectHeader::Local => {
tracing::debug!("healthcheck received");
return None;
}
ConnectHeader::Missing if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
tracing::warn!("missing required proxy protocol header");
return None;
}
ConnectHeader::Proxy(_) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
tracing::warn!("proxy protocol header not supported");
return None;
}
ConnectHeader::Proxy(info) => info,
ConnectHeader::Missing => ConnectionInfo {
addr: peer_addr,
extra: None,
},
};
let has_private_peer_addr = match conn_info.addr.ip() {
let peer_addr = peer.unwrap_or(peer_addr).ip();
let has_private_peer_addr = match peer_addr {
IpAddr::V4(ip) => ip.is_private(),
IpAddr::V6(_) => false,
};
info!(?session_id, %conn_info, "accepted new TCP connection");
info!(?session_id, %peer_addr, "accepted new TCP connection");
// try upgrade to TLS, but with a timeout.
let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await {
Ok(Ok(conn)) => {
info!(?session_id, %conn_info, "accepted new TLS connection");
info!(?session_id, %peer_addr, "accepted new TLS connection");
conn
}
// The handshake failed
@@ -288,7 +268,7 @@ async fn connection_startup(
if !has_private_peer_addr {
Metrics::get().proxy.tls_handshake_failures.inc();
}
warn!(?session_id, %conn_info, "failed to accept TLS connection: {e:?}");
warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
return None;
}
// The handshake timed out
@@ -296,12 +276,12 @@ async fn connection_startup(
if !has_private_peer_addr {
Metrics::get().proxy.tls_handshake_failures.inc();
}
warn!(?session_id, %conn_info, "failed to accept TLS connection: {e:?}");
warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
return None;
}
};
Some((conn, conn_info))
Some((conn, peer_addr))
}
/// Handles HTTP connection
@@ -317,7 +297,7 @@ async fn connection_handler(
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
cancellation_token: CancellationToken,
conn: AsyncRW,
conn_info: ConnectionInfo,
peer_addr: IpAddr,
session_id: uuid::Uuid,
) {
let session_id = AtomicTake::new(session_id);
@@ -326,7 +306,6 @@ async fn connection_handler(
let http_cancellation_token = CancellationToken::new();
let _cancel_connection = http_cancellation_token.clone().drop_guard();
let conn_info2 = conn_info.clone();
let server = Builder::new(TokioExecutor::new());
let conn = server.serve_connection_with_upgrades(
hyper_util::rt::TokioIo::new(conn),
@@ -361,7 +340,7 @@ async fn connection_handler(
connections.clone(),
cancellation_handler.clone(),
session_id,
conn_info2.clone(),
peer_addr,
http_request_token,
endpoint_rate_limiter.clone(),
)
@@ -386,7 +365,7 @@ async fn connection_handler(
// On cancellation, trigger the HTTP connection handler to shut down.
let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await {
Either::Left((_cancelled, mut conn)) => {
tracing::debug!(%conn_info, "cancelling connection");
tracing::debug!(%peer_addr, "cancelling connection");
conn.as_mut().graceful_shutdown();
conn.await
}
@@ -394,8 +373,8 @@ async fn connection_handler(
};
match res {
Ok(()) => tracing::info!(%conn_info, "HTTP connection closed"),
Err(e) => tracing::warn!(%conn_info, "HTTP connection error {e}"),
Ok(()) => tracing::info!(%peer_addr, "HTTP connection closed"),
Err(e) => tracing::warn!(%peer_addr, "HTTP connection error {e}"),
}
}
@@ -407,7 +386,7 @@ async fn request_handler(
ws_connections: TaskTracker,
cancellation_handler: Arc<CancellationHandlerMain>,
session_id: uuid::Uuid,
conn_info: ConnectionInfo,
peer_addr: IpAddr,
// used to cancel in-flight HTTP requests. not used to cancel websockets
http_cancellation_token: CancellationToken,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
@@ -425,7 +404,7 @@ async fn request_handler(
{
let ctx = RequestMonitoring::new(
session_id,
conn_info,
peer_addr,
crate::metrics::Protocol::Ws,
&config.region,
);
@@ -460,7 +439,7 @@ async fn request_handler(
} else if request.uri().path() == "/sql" && *request.method() == Method::POST {
let ctx = RequestMonitoring::new(
session_id,
conn_info,
peer_addr,
crate::metrics::Protocol::Http,
&config.region,
);

Some files were not shown because too many files have changed in this diff Show More