Merge pull request #10354 from neondatabase/rc/release-compute/2025-01-10

Tristan Partin, 2025-01-10 22:00:23 -06:00, committed by GitHub
120 changed files with 4973 additions and 1627 deletions

.github/file-filters.yaml vendored Normal file

@@ -0,0 +1,12 @@
rust_code: ['**/*.rs', '**/Cargo.toml', '**/Cargo.lock']
v14: ['vendor/postgres-v14/**', 'Makefile', 'pgxn/**']
v15: ['vendor/postgres-v15/**', 'Makefile', 'pgxn/**']
v16: ['vendor/postgres-v16/**', 'Makefile', 'pgxn/**']
v17: ['vendor/postgres-v17/**', 'Makefile', 'pgxn/**']
rebuild_neon_extra:
- .github/workflows/neon_extra_builds.yml
rebuild_macos:
- .github/workflows/build-macos.yml
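# Each key above becomes a 'true'/'false' output of dorny/paths-filter, and
# its `changes` output lists the matched filter names as a JSON array; the
# workflow jobs that consume this file read both to decide what to rebuild.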

.github/workflows/build-macos.yml vendored Normal file

@@ -0,0 +1,241 @@
name: Check neon with macOS builds
on:
workflow_call:
inputs:
pg_versions:
description: "Array of the pg versions to build for, for example: ['v14', 'v17']"
type: string
default: '[]'
required: false
rebuild_rust_code:
description: "Rebuild Rust code"
type: boolean
default: false
required: false
rebuild_everything:
description: "If true, rebuild for all versions"
type: boolean
default: false
required: false
env:
RUST_BACKTRACE: 1
COPT: '-Werror'
# TODO: move `check-*` and `files-changed` jobs to the "Caller" Workflow
# This matters because GitHub has limitations:
# - You can connect up to four levels of workflows
# - You can call a maximum of 20 unique reusable workflows from a single workflow file.
# https://docs.github.com/en/actions/sharing-automations/reusing-workflows#limitations
jobs:
build-pgxn:
if: |
(inputs.pg_versions != '[]' || inputs.rebuild_everything) && (
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
)
timeout-minutes: 30
runs-on: macos-15
strategy:
matrix:
postgres-version: ${{ inputs.rebuild_everything && fromJson('["v14", "v15", "v16", "v17"]') || fromJSON(inputs.pg_versions) }}
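# `cond && a || b` is the GitHub-expressions stand-in for a ternary: rebuild
# all four versions when rebuild_everything is set, otherwise use the
# caller-supplied list.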
env:
# Use release build only, to have less debug info around
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
steps:
- name: Checkout main repo
uses: actions/checkout@v4
- name: Set pg ${{ matrix.postgres-version }} for caching
id: pg_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-${{ matrix.postgres-version }}) | tee -a "${GITHUB_OUTPUT}"
- name: Cache postgres ${{ matrix.postgres-version }} build
id: cache_pg
uses: actions/cache@v4
with:
path: pg_install/${{ matrix.postgres-version }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ matrix.postgres-version }}-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Checkout submodule vendor/postgres-${{ matrix.postgres-version }}
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
git submodule init vendor/postgres-${{ matrix.postgres-version }}
git submodule update --depth 1 --recursive
- name: Install build dependencies
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
brew install flex bison openssl protobuf icu4c
- name: Set extra env for macOS
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
- name: Build Postgres ${{ matrix.postgres-version }}
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
make postgres-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu)
- name: Build Neon Pg Ext ${{ matrix.postgres-version }}
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
make "neon-pg-ext-${{ matrix.postgres-version }}" -j$(sysctl -n hw.ncpu)
- name: Get postgres headers ${{ matrix.postgres-version }}
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
make postgres-headers-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu)
build-walproposer-lib:
if: |
(inputs.pg_versions != '[]' || inputs.rebuild_everything) && (
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
)
timeout-minutes: 30
runs-on: macos-15
needs: [build-pgxn]
env:
# Use release build only, to have less debug info around
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
steps:
- name: Checkout main repo
uses: actions/checkout@v4
- name: Set pg v17 for caching
id: pg_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}"
- name: Cache postgres v17 build
id: cache_pg
uses: actions/cache@v4
with:
path: pg_install/v17
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache walproposer-lib
id: cache_walproposer_lib
uses: actions/cache@v4
with:
path: pg_install/build/walproposer-lib
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Checkout submodule vendor/postgres-v17
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
run: |
git submodule init vendor/postgres-v17
git submodule update --depth 1 --recursive
- name: Install build dependencies
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
run: |
brew install flex bison openssl protobuf icu4c
- name: Set extra env for macOS
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
- name: Build walproposer-lib (only for v17)
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
run:
make walproposer-lib -j$(sysctl -n hw.ncpu)
cargo-build:
if: |
(inputs.pg_versions != '[]' || inputs.rebuild_rust_code || inputs.rebuild_everything) && (
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
)
timeout-minutes: 30
runs-on: macos-15
needs: [build-pgxn, build-walproposer-lib]
env:
# Use release build only, to have less debug info around
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
steps:
- name: Checkout main repo
uses: actions/checkout@v4
with:
submodules: true
- name: Set pg v14 for caching
id: pg_rev_v14
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) | tee -a "${GITHUB_OUTPUT}"
- name: Set pg v15 for caching
id: pg_rev_v15
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) | tee -a "${GITHUB_OUTPUT}"
- name: Set pg v16 for caching
id: pg_rev_v16
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) | tee -a "${GITHUB_OUTPUT}"
- name: Set pg v17 for caching
id: pg_rev_v17
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}"
- name: Cache postgres v14 build
id: cache_pg
uses: actions/cache@v4
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v14-${{ steps.pg_rev_v14.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_v15
uses: actions/cache@v4
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v15-${{ steps.pg_rev_v15.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_v16
uses: actions/cache@v4
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v16-${{ steps.pg_rev_v16.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v17 build
id: cache_pg_v17
uses: actions/cache@v4
with:
path: pg_install/v17
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache cargo deps (only for v17)
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
!~/.cargo/registry/src
~/.cargo/git
target
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
- name: Cache walproposer-lib
id: cache_walproposer_lib
uses: actions/cache@v4
with:
path: pg_install/build/walproposer-lib
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Install build dependencies
run: |
brew install flex bison openssl protobuf icu4c
- name: Set extra env for macOS
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
- name: Run cargo build (only for v17)
run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu)
- name: Check that no warnings are produced (only for v17)
run: ./run_clippy.sh


@@ -31,19 +31,15 @@ jobs:
uses: ./.github/workflows/build-build-tools-image.yml
secrets: inherit
check-macos-build:
needs: [ check-permissions ]
if: |
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
timeout-minutes: 90
runs-on: macos-15
env:
# Use release build only, to have less debug info around
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
files-changed:
name: Detect what files changed
runs-on: ubuntu-22.04
timeout-minutes: 3
outputs:
v17: ${{ steps.files_changed.outputs.v17 }}
postgres_changes: ${{ steps.postgres_changes.outputs.changes }}
rebuild_rust_code: ${{ steps.files_changed.outputs.rust_code }}
rebuild_everything: ${{ steps.files_changed.outputs.rebuild_neon_extra || steps.files_changed.outputs.rebuild_macos }}
steps:
- name: Checkout
@@ -51,106 +47,45 @@ jobs:
with:
submodules: true
- name: Install macOS postgres dependencies
run: brew install flex bison openssl protobuf icu4c
- name: Set pg 14 revision for caching
id: pg_v14_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
- name: Set pg 15 revision for caching
id: pg_v15_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
- name: Set pg 16 revision for caching
id: pg_v16_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
- name: Set pg 17 revision for caching
id: pg_v17_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
- name: Check for Postgres changes
uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3
id: files_changed
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
token: ${{ github.token }}
filters: .github/file-filters.yaml
base: ${{ github.event_name != 'pull_request' && (github.event.merge_group.base_ref || github.ref_name) || '' }}
ref: ${{ github.event_name != 'pull_request' && (github.event.merge_group.head_ref || github.ref) || '' }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v17 build
id: cache_pg_17
uses: actions/cache@v4
with:
path: pg_install/v17
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Set extra env for macOS
- name: Keep only the v-strings for the build matrix
id: postgres_changes
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
v_strings_only_as_json_array=$(echo ${{ steps.files_changed.outputs.changes }} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c)
echo "changes=${v_strings_only_as_json_array}" | tee -a "${GITHUB_OUTPUT}"
- name: Cache cargo deps
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
!~/.cargo/registry/src
~/.cargo/git
target
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: make postgres-v14 -j$(sysctl -n hw.ncpu)
- name: Build postgres v15
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: make postgres-v15 -j$(sysctl -n hw.ncpu)
- name: Build postgres v16
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: make postgres-v16 -j$(sysctl -n hw.ncpu)
- name: Build postgres v17
if: steps.cache_pg_17.outputs.cache-hit != 'true'
run: make postgres-v17 -j$(sysctl -n hw.ncpu)
- name: Build neon extensions
run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
- name: Build walproposer-lib
run: make walproposer-lib -j$(sysctl -n hw.ncpu)
- name: Run cargo build
run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release
- name: Check that no warnings are produced
run: ./run_clippy.sh
check-macos-build:
needs: [ check-permissions, files-changed ]
if: |
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
uses: ./.github/workflows/build-macos.yml
with:
pg_versions: ${{ needs.files-changed.outputs.postgres_changes }}
rebuild_rust_code: ${{ needs.files-changed.outputs.rebuild_rust_code }}
rebuild_everything: ${{ fromJson(needs.files-changed.outputs.rebuild_everything) }}
gather-rust-build-stats:
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions, build-build-tools-image, files-changed ]
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: write
if: |
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
(needs.files-changed.outputs.v17 == 'true' || needs.files-changed.outputs.rebuild_everything == 'true') && (
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
)
runs-on: [ self-hosted, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm

Cargo.lock generated

@@ -718,13 +718,13 @@ dependencies = [
[[package]]
name = "axum"
version = "0.7.5"
version = "0.7.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf"
checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
dependencies = [
"async-trait",
"axum-core",
"base64 0.21.1",
"base64 0.22.1",
"bytes",
"futures-util",
"http 1.1.0",
@@ -746,8 +746,8 @@ dependencies = [
"sha1",
"sync_wrapper 1.0.1",
"tokio",
"tokio-tungstenite",
"tower",
"tokio-tungstenite 0.24.0",
"tower 0.5.2",
"tower-layer",
"tower-service",
"tracing",
@@ -1267,6 +1267,7 @@ dependencies = [
"aws-config",
"aws-sdk-kms",
"aws-sdk-s3",
"axum",
"base64 0.13.1",
"bytes",
"camino",
@@ -1277,7 +1278,7 @@ dependencies = [
"fail",
"flate2",
"futures",
"hyper 0.14.30",
"http 1.1.0",
"metrics",
"nix 0.27.1",
"notify",
@@ -1303,6 +1304,8 @@ dependencies = [
"tokio-postgres",
"tokio-stream",
"tokio-util",
"tower 0.5.2",
"tower-http",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
@@ -1650,6 +1653,20 @@ dependencies = [
"parking_lot_core 0.9.8",
]
[[package]]
name = "dashmap"
version = "6.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
dependencies = [
"cfg-if",
"crossbeam-utils",
"hashbrown 0.14.5",
"lock_api",
"once_cell",
"parking_lot_core 0.9.8",
]
[[package]]
name = "data-encoding"
version = "2.4.0"
@@ -1949,6 +1966,15 @@ dependencies = [
"syn 2.0.90",
]
[[package]]
name = "env_filter"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0"
dependencies = [
"log",
]
[[package]]
name = "env_logger"
version = "0.10.2"
@@ -1962,6 +1988,16 @@ dependencies = [
"termcolor",
]
[[package]]
name = "env_logger"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d"
dependencies = [
"env_filter",
"log",
]
[[package]]
name = "equator"
version = "0.2.2"
@@ -2720,7 +2756,7 @@ dependencies = [
"pin-project-lite",
"socket2",
"tokio",
"tower",
"tower 0.4.13",
"tower-service",
"tracing",
]
@@ -2945,6 +2981,28 @@ dependencies = [
"str_stack",
]
[[package]]
name = "inferno"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75a5d75fee4d36809e6b021e4b96b686e763d365ffdb03af2bd00786353f84fe"
dependencies = [
"ahash",
"clap",
"crossbeam-channel",
"crossbeam-utils",
"dashmap 6.1.0",
"env_logger 0.11.2",
"indexmap 2.0.1",
"itoa",
"log",
"num-format",
"once_cell",
"quick-xml 0.37.1",
"rgb",
"str_stack",
]
[[package]]
name = "inotify"
version = "0.9.6"
@@ -3152,7 +3210,7 @@ version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2"
dependencies = [
"dashmap",
"dashmap 5.5.0",
"hashbrown 0.13.2",
]
@@ -3260,9 +3318,9 @@ checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40"
[[package]]
name = "matchit"
version = "0.8.2"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed"
checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"
[[package]]
name = "md-5"
@@ -3690,23 +3748,23 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "opentelemetry"
version = "0.26.0"
version = "0.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17"
checksum = "ab70038c28ed37b97d8ed414b6429d343a8bbf44c9f79ec854f3a643029ba6d7"
dependencies = [
"futures-core",
"futures-sink",
"js-sys",
"once_cell",
"pin-project-lite",
"thiserror",
"tracing",
]
[[package]]
name = "opentelemetry-http"
version = "0.26.0"
version = "0.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6351496aeaa49d7c267fb480678d85d1cd30c5edb20b497c48c56f62a8c14b99"
checksum = "10a8a7f5f6ba7c1b286c2fbca0454eaba116f63bbe69ed250b642d36fbb04d80"
dependencies = [
"async-trait",
"bytes",
@@ -3717,9 +3775,9 @@ dependencies = [
[[package]]
name = "opentelemetry-otlp"
version = "0.26.0"
version = "0.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29e1f9c8b032d4f635c730c0efcf731d5e2530ea13fa8bef7939ddc8420696bd"
checksum = "91cf61a1868dacc576bf2b2a1c3e9ab150af7272909e80085c3173384fe11f76"
dependencies = [
"async-trait",
"futures-core",
@@ -3735,9 +3793,9 @@ dependencies = [
[[package]]
name = "opentelemetry-proto"
version = "0.26.1"
version = "0.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9d3968ce3aefdcca5c27e3c4ea4391b37547726a70893aab52d3de95d5f8b34"
checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6"
dependencies = [
"opentelemetry",
"opentelemetry_sdk",
@@ -3747,22 +3805,21 @@ dependencies = [
[[package]]
name = "opentelemetry-semantic-conventions"
version = "0.26.0"
version = "0.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db945c1eaea8ac6a9677185357480d215bb6999faa9f691d0c4d4d641eab7a09"
checksum = "bc1b6902ff63b32ef6c489e8048c5e253e2e4a803ea3ea7e783914536eb15c52"
[[package]]
name = "opentelemetry_sdk"
version = "0.26.0"
version = "0.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3"
checksum = "231e9d6ceef9b0b2546ddf52335785ce41252bc7474ee8ba05bfad277be13ab8"
dependencies = [
"async-trait",
"futures-channel",
"futures-executor",
"futures-util",
"glob",
"once_cell",
"opentelemetry",
"percent-encoding",
"rand 0.8.5",
@@ -3770,6 +3827,7 @@ dependencies = [
"thiserror",
"tokio",
"tokio-stream",
"tracing",
]
[[package]]
@@ -4418,7 +4476,7 @@ dependencies = [
"bytes",
"crc32c",
"criterion",
"env_logger",
"env_logger 0.10.2",
"log",
"memoffset 0.9.0",
"once_cell",
@@ -4459,7 +4517,7 @@ dependencies = [
"cfg-if",
"criterion",
"findshlibs",
"inferno",
"inferno 0.11.21",
"libc",
"log",
"nix 0.26.4",
@@ -4685,9 +4743,9 @@ dependencies = [
"clap",
"compute_api",
"consumption_metrics",
"dashmap",
"dashmap 5.5.0",
"ecdsa 0.16.9",
"env_logger",
"env_logger 0.10.2",
"fallible-iterator",
"flate2",
"framed-websockets",
@@ -4758,7 +4816,7 @@ dependencies = [
"tokio-postgres",
"tokio-postgres2",
"tokio-rustls 0.26.0",
"tokio-tungstenite",
"tokio-tungstenite 0.21.0",
"tokio-util",
"tracing",
"tracing-subscriber",
@@ -4794,6 +4852,15 @@ dependencies = [
"serde",
]
[[package]]
name = "quick-xml"
version = "0.37.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f22f29bdff3987b4d8632ef95fd6424ec7e4e0a57e2f4fc63e489e75357f6a03"
dependencies = [
"memchr",
]
[[package]]
name = "quote"
version = "1.0.37"
@@ -5178,15 +5245,15 @@ dependencies = [
[[package]]
name = "reqwest-tracing"
version = "0.5.4"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff82cf5730a1311fb9413b0bc2b8e743e0157cd73f010ab4ec374a923873b6a2"
checksum = "73e6153390585f6961341b50e5a1931d6be6dee4292283635903c26ef9d980d2"
dependencies = [
"anyhow",
"async-trait",
"getrandom 0.2.11",
"http 1.1.0",
"matchit 0.8.2",
"matchit 0.8.4",
"opentelemetry",
"reqwest",
"reqwest-middleware",
@@ -6800,7 +6867,19 @@ dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite",
"tungstenite 0.21.0",
]
[[package]]
name = "tokio-tungstenite"
version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edc5f74e248dc973e0dbb7b74c7e0d6fcc301c694ff50049504004ef4d0cdcd9"
dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.24.0",
]
[[package]]
@@ -6881,7 +6960,7 @@ dependencies = [
"tokio",
"tokio-rustls 0.26.0",
"tokio-stream",
"tower",
"tower 0.4.13",
"tower-layer",
"tower-service",
"tracing",
@@ -6922,16 +7001,49 @@ dependencies = [
]
[[package]]
name = "tower-layer"
version = "0.3.2"
name = "tower"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0"
checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
dependencies = [
"futures-core",
"futures-util",
"pin-project-lite",
"sync_wrapper 1.0.1",
"tokio",
"tower-layer",
"tower-service",
"tracing",
]
[[package]]
name = "tower-http"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697"
dependencies = [
"bitflags 2.4.1",
"bytes",
"http 1.1.0",
"http-body 1.0.0",
"pin-project-lite",
"tower-layer",
"tower-service",
"tracing",
"uuid",
]
[[package]]
name = "tower-layer"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
[[package]]
name = "tower-service"
version = "0.3.2"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
[[package]]
name = "tracing"
@@ -7000,9 +7112,9 @@ dependencies = [
[[package]]
name = "tracing-opentelemetry"
version = "0.27.0"
version = "0.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b"
checksum = "97a971f6058498b5c0f1affa23e7ea202057a7301dbff68e968b2d578bcbd053"
dependencies = [
"js-sys",
"once_cell",
@@ -7086,6 +7198,24 @@ dependencies = [
"utf-8",
]
[[package]]
name = "tungstenite"
version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18e5b8366ee7a95b16d32197d0b2604b43a0be89dc5fac9f8e96ccafbaedda8a"
dependencies = [
"byteorder",
"bytes",
"data-encoding",
"http 1.1.0",
"httparse",
"log",
"rand 0.8.5",
"sha1",
"thiserror",
"utf-8",
]
[[package]]
name = "twox-hash"
version = "1.6.3"
@@ -7253,6 +7383,7 @@ dependencies = [
"hex-literal",
"humantime",
"hyper 0.14.30",
"inferno 0.12.0",
"itertools 0.10.5",
"jemalloc_pprof",
"jsonwebtoken",
@@ -7356,7 +7487,7 @@ dependencies = [
"anyhow",
"camino-tempfile",
"clap",
"env_logger",
"env_logger 0.10.2",
"log",
"postgres",
"postgres_ffi",
@@ -7867,7 +7998,8 @@ dependencies = [
"tokio-util",
"toml_edit",
"tonic",
"tower",
"tower 0.4.13",
"tower 0.5.2",
"tracing",
"tracing-core",
"url",


@@ -65,7 +65,7 @@ aws-smithy-types = "1.2"
aws-credential-types = "1.2.0"
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
aws-types = "1.3"
axum = { version = "0.7.5", features = ["ws"] }
axum = { version = "0.7.9", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
bindgen = "0.70"
@@ -110,6 +110,7 @@ hyper-util = "0.1"
tokio-tungstenite = "0.21.0"
indexmap = "2"
indoc = "2"
inferno = "0.12.0"
ipnet = "2.10.0"
itertools = "0.10"
itoa = "1.0.11"
@@ -126,10 +127,10 @@ notify = "6.0.0"
num_cpus = "1.15"
num-traits = "0.2.15"
once_cell = "1.13"
opentelemetry = "0.26"
opentelemetry_sdk = "0.26"
opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.26"
opentelemetry = "0.27"
opentelemetry_sdk = "0.27"
opentelemetry-otlp = { version = "0.27", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.27"
parking_lot = "0.12"
parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
@@ -143,7 +144,7 @@ rand = "0.8"
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] }
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] }
reqwest-middleware = "0.4"
reqwest-retry = "0.7"
routerify = "3"
@@ -187,10 +188,12 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] }
toml = "0.8"
toml_edit = "0.22"
tonic = {version = "0.12.3", features = ["tls", "tls-roots"]}
tower-service = "0.3.2"
tower = { version = "0.5.2", default-features = false }
tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
tower-service = "0.3.3"
tracing = "0.1"
tracing-error = "0.2"
tracing-opentelemetry = "0.27"
tracing-opentelemetry = "0.28"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"
twox-hash = { version = "1.6.3", default-features = false }


@@ -103,11 +103,6 @@ RUN mkdir -p /data/.neon/ && \
> /data/.neon/pageserver.toml && \
chown -R neon:neon /data/.neon
# When running a binary that links with libpq, default to using our most recent postgres version. Binaries
# that want a particular postgres version will select it explicitly: this is just a default.
ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib
VOLUME ["/data"]
USER neon
EXPOSE 6400


@@ -258,7 +258,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.83.0
ENV RUSTC_VERSION=1.84.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
ARG RUSTFILT_VERSION=0.2.1


@@ -1167,22 +1167,13 @@ FROM rust-extensions-build AS pg-mooncake-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# The topmost commit in the `neon` branch at the time of writing this
# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/
# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af
ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \
'v14') \
echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \
esac && \
git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \
cd pg_mooncake-src && \
git checkout "${PG_MOONCAKE_VERSION}" && \
git submodule update --init --depth 1 --recursive && \
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \
RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \
echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \
mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \
make release -j $(getconf _NPROCESSORS_ONLN) && \
make install -j $(getconf _NPROCESSORS_ONLN) && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control
#########################################################################################


@@ -15,6 +15,7 @@ aws-config.workspace = true
aws-sdk-s3.workspace = true
aws-sdk-kms.workspace = true
anyhow.workspace = true
axum = { workspace = true, features = [] }
camino.workspace = true
chrono.workspace = true
cfg-if.workspace = true
@@ -22,7 +23,7 @@ clap.workspace = true
fail.workspace = true
flate2.workspace = true
futures.workspace = true
hyper0 = { workspace = true, features = ["full"] }
http.workspace = true
metrics.workspace = true
nix.workspace = true
notify.workspace = true
@@ -37,6 +38,8 @@ serde_with.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
tar.workspace = true
tower.workspace = true
tower-http.workspace = true
reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true


@@ -60,7 +60,7 @@ use compute_tools::compute::{
};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version_string;
use compute_tools::http::api::launch_http_server;
use compute_tools::http::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
@@ -111,11 +111,6 @@ fn main() -> Result<()> {
fn init() -> Result<(String, clap::ArgMatches)> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
opentelemetry::global::set_error_handler(|err| {
tracing::info!("OpenTelemetry error: {err}");
})
.expect("global error handler lock poisoned");
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
thread::spawn(move || {
for sig in signals.forever() {
@@ -493,7 +488,10 @@ fn start_postgres(
let mut pg = None;
if !prestartup_failed {
pg = match compute.start_compute() {
Ok(pg) => Some(pg),
Ok(pg) => {
info!(postmaster_pid = %pg.0.id(), "Postgres was started");
Some(pg)
}
Err(err) => {
error!("could not start the compute node: {:#}", err);
compute.set_failed_status(err);
@@ -591,6 +589,8 @@ fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
// propagate to Postgres and it will be shut down as well.
let mut exit_code = None;
if let Some((mut pg, logs_handle)) = pg {
info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit");
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");


@@ -36,11 +36,11 @@ pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<Cat
#[derive(Debug, thiserror::Error)]
pub enum SchemaDumpError {
#[error("Database does not exist.")]
#[error("database does not exist")]
DatabaseDoesNotExist,
#[error("Failed to execute pg_dump.")]
#[error("failed to execute pg_dump")]
IO(#[from] std::io::Error),
#[error("Unexpected error.")]
#[error("unexpected I/O error")]
Unexpected,
}
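The reworded variants follow the common Rust convention that error messages are lowercase fragments without trailing periods, because callers usually chain them with added context. A small sketch of why that composes better (assumes the anyhow crate; not part of this diff):
fn main() {
    let io = std::io::Error::new(std::io::ErrorKind::PermissionDenied, "permission denied");
    // anyhow's alternate formatting joins the chain with ": ", so lowercase,
    // period-free fragments read as one sentence:
    // "could not dump schema: failed to execute pg_dump: permission denied"
    let err = anyhow::Error::new(io)
        .context("failed to execute pg_dump")
        .context("could not dump schema");
    println!("{err:#}");
}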


@@ -15,7 +15,7 @@ use std::time::Instant;
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use compute_api::spec::{PgIdent, Role};
use compute_api::spec::{Database, PgIdent, Role};
use futures::future::join_all;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
@@ -45,8 +45,10 @@ use crate::spec_apply::ApplySpecPhase::{
DropInvalidDatabases, DropRoles, HandleNeonExtension, HandleOtherExtensions,
RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
};
use crate::spec_apply::PerDatabasePhase;
use crate::spec_apply::PerDatabasePhase::{
ChangeSchemaPerms, DeleteDBRoleReferences, HandleAnonExtension,
ChangeSchemaPerms, DeleteDBRoleReferences, DropSubscriptionsForDeletedDatabases,
HandleAnonExtension,
};
use crate::spec_apply::{apply_operations, MutableApplyContext, DB};
use crate::sync_sk::{check_if_synced, ping_safekeeper};
@@ -834,7 +836,7 @@ impl ComputeNode {
conf
}
async fn get_maintenance_client(
pub async fn get_maintenance_client(
conf: &tokio_postgres::Config,
) -> Result<tokio_postgres::Client> {
let mut conf = conf.clone();
@@ -943,6 +945,78 @@ impl ComputeNode {
dbs: databases,
}));
// Apply a special pre-drop-database phase.
// NOTE: we reuse the RunInEachDatabase phase code for parallelism
// and connection management, but we don't actually run it in *each*
// database, only in the databases we're about to drop.
info!("Applying PerDatabase (pre-dropdb) phase");
let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));
// Run the phase for each database that we're about to drop.
let db_processes = spec
.delta_operations
.iter()
.flatten()
.filter_map(move |op| {
if op.action.as_str() == "delete_db" {
Some(op.name.clone())
} else {
None
}
})
.map(|dbname| {
let spec = spec.clone();
let ctx = ctx.clone();
let jwks_roles = jwks_roles.clone();
let mut conf = conf.as_ref().clone();
let concurrency_token = concurrency_token.clone();
// We only need the dbname field for this phase, so set the other fields to dummy values
let db = DB::UserDB(Database {
name: dbname.clone(),
owner: "cloud_admin".to_string(),
options: None,
restrict_conn: false,
invalid: false,
});
debug!("Applying per-database phases for Database {:?}", &db);
match &db {
DB::SystemDB => {}
DB::UserDB(db) => {
conf.dbname(db.name.as_str());
}
}
let conf = Arc::new(conf);
let fut = Self::apply_spec_sql_db(
spec.clone(),
conf,
ctx.clone(),
jwks_roles.clone(),
concurrency_token.clone(),
db,
[DropSubscriptionsForDeletedDatabases].to_vec(),
);
Ok(spawn(fut))
})
.collect::<Vec<Result<_, anyhow::Error>>>();
for process in db_processes.into_iter() {
let handle = process?;
if let Err(e) = handle.await? {
// Handle the error case where the database does not exist
// We do not check whether the DB exists in the deletion phase,
// so we shouldn't be strict about it in pre-deletion cleanup either.
if e.to_string().contains("does not exist") {
warn!("Error dropping subscription: {}", e);
} else {
return Err(e);
}
};
}
for phase in [
CreateSuperUser,
DropInvalidDatabases,
@@ -962,7 +1036,7 @@ impl ComputeNode {
.await?;
}
info!("Applying RunInEachDatabase phase");
info!("Applying RunInEachDatabase2 phase");
let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));
let db_processes = spec
@@ -997,6 +1071,12 @@ impl ComputeNode {
jwks_roles.clone(),
concurrency_token.clone(),
db,
[
DeleteDBRoleReferences,
ChangeSchemaPerms,
HandleAnonExtension,
]
.to_vec(),
);
Ok(spawn(fut))
@@ -1043,16 +1123,13 @@ impl ComputeNode {
jwks_roles: Arc<HashSet<String>>,
concurrency_token: Arc<tokio::sync::Semaphore>,
db: DB,
subphases: Vec<PerDatabasePhase>,
) -> Result<()> {
let _permit = concurrency_token.acquire().await?;
let mut client_conn = None;
for subphase in [
DeleteDBRoleReferences,
ChangeSchemaPerms,
HandleAnonExtension,
] {
for subphase in subphases {
apply_operations(
spec.clone(),
ctx.clone(),

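The pre-drop phase above fans out one task per doomed database while a shared semaphore caps how many run at once. A minimal standalone sketch of that pattern (database names and the task body are placeholders, not compute_ctl code):
use std::sync::Arc;
use tokio::sync::Semaphore;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let concurrency_token = Arc::new(Semaphore::new(4)); // at most 4 databases at a time
    let dbs = vec!["db_a".to_string(), "db_b".to_string(), "db_c".to_string()];
    let mut handles = Vec::new();
    for dbname in dbs {
        let sem = concurrency_token.clone();
        handles.push(tokio::spawn(async move {
            let _permit = sem.acquire().await?; // held until the task finishes
            // the per-database phases would run here
            println!("applying pre-drop phase to {dbname}");
            Ok::<_, anyhow::Error>(())
        }));
    }
    for handle in handles {
        handle.await??; // surface both join errors and task errors
    }
    Ok(())
}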

@@ -1,606 +0,0 @@
use std::convert::Infallible;
use std::net::IpAddr;
use std::net::Ipv6Addr;
use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
use crate::catalog::SchemaDumpError;
use crate::catalog::{get_database_schema, get_dbs_and_roles};
use crate::compute::forward_termination_signal;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use crate::installed_extensions;
use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
use compute_api::responses::{
ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
SetRoleGrantsResponse,
};
use anyhow::Result;
use hyper::header::CONTENT_TYPE;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use metrics::proto::MetricFamily;
use metrics::Encoder;
use metrics::TextEncoder;
use tokio::task;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, warn};
use tracing_utils::http::OtelName;
use utils::failpoint_support::failpoints_handler;
use utils::http::error::ApiError;
use utils::http::request::must_get_query_param;
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
ComputeStatusResponse {
start_time: state.start_time,
tenant: state
.pspec
.as_ref()
.map(|pspec| pspec.tenant_id.to_string()),
timeline: state
.pspec
.as_ref()
.map(|pspec| pspec.timeline_id.to_string()),
status: state.status,
last_active: state.last_active,
error: state.error.clone(),
}
}
// Service function to handle all available routes.
async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
//
// NOTE: The URI path is currently included in traces. That's OK because
// it doesn't contain any variable parts or sensitive information. But
// please keep that in mind if you change the routing here.
//
match (req.method(), req.uri().path()) {
// Serialized compute state.
(&Method::GET, "/status") => {
debug!("serving /status GET request");
let state = compute.state.lock().unwrap();
let status_response = status_response_from_state(&state);
Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
}
// Startup metrics in JSON format. Keep /metrics reserved for a possible
// future use for Prometheus metrics format.
(&Method::GET, "/metrics.json") => {
info!("serving /metrics.json GET request");
let metrics = compute.state.lock().unwrap().metrics.clone();
Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
}
// Prometheus metrics
(&Method::GET, "/metrics") => {
debug!("serving /metrics GET request");
// When we call TextEncoder::encode() below, it will immediately
// return an error if a metric family has no metrics, so we need to
// preemptively filter out metric families with no metrics.
let metrics = installed_extensions::collect()
.into_iter()
.filter(|m| !m.get_metric().is_empty())
.collect::<Vec<MetricFamily>>();
let encoder = TextEncoder::new();
let mut buffer = vec![];
if let Err(err) = encoder.encode(&metrics, &mut buffer) {
let msg = format!("error handling /metrics request: {err}");
error!(msg);
return render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR);
}
match Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, encoder.format_type())
.body(Body::from(buffer))
{
Ok(response) => response,
Err(err) => {
let msg = format!("error handling /metrics request: {err}");
error!(msg);
render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
// Collect Postgres current usage insights
(&Method::GET, "/insights") => {
info!("serving /insights GET request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!("compute is not running, current status: {:?}", status);
error!(msg);
return Response::new(Body::from(msg));
}
let insights = compute.collect_insights().await;
Response::new(Body::from(insights))
}
(&Method::POST, "/check_writability") => {
info!("serving /check_writability POST request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for check_writability request: {:?}",
status
);
error!(msg);
return Response::new(Body::from(msg));
}
let res = crate::checker::check_writability(compute).await;
match res {
Ok(_) => Response::new(Body::from("true")),
Err(e) => {
error!("check_writability failed: {}", e);
Response::new(Body::from(e.to_string()))
}
}
}
(&Method::POST, "/extensions") => {
info!("serving /extensions POST request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for extensions request: {:?}",
status
);
error!(msg);
return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
}
let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
let request = serde_json::from_slice::<ExtensionInstallRequest>(&request).unwrap();
let res = compute
.install_extension(&request.extension, &request.database, request.version)
.await;
match res {
Ok(version) => render_json(Body::from(
serde_json::to_string(&ExtensionInstallResult {
extension: request.extension,
version,
})
.unwrap(),
)),
Err(e) => {
error!("install_extension failed: {}", e);
render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
(&Method::GET, "/info") => {
let num_cpus = num_cpus::get_physical();
info!("serving /info GET request. num_cpus: {}", num_cpus);
Response::new(Body::from(
serde_json::json!({
"num_cpus": num_cpus,
})
.to_string(),
))
}
// Accept spec in JSON format and request compute configuration. If
// anything goes wrong after we set the compute status to `ConfigurationPending`
// and update compute state with new spec, we basically leave compute
// in the potentially wrong state. That said, it's control-plane's
// responsibility to watch compute state after reconfiguration request
// and to clean restart in case of errors.
(&Method::POST, "/configure") => {
info!("serving /configure POST request");
match handle_configure_request(req, compute).await {
Ok(msg) => Response::new(Body::from(msg)),
Err((msg, code)) => {
error!("error handling /configure request: {msg}");
render_json_error(&msg, code)
}
}
}
(&Method::POST, "/terminate") => {
info!("serving /terminate POST request");
match handle_terminate_request(compute).await {
Ok(()) => Response::new(Body::empty()),
Err((msg, code)) => {
error!("error handling /terminate request: {msg}");
render_json_error(&msg, code)
}
}
}
(&Method::GET, "/dbs_and_roles") => {
info!("serving /dbs_and_roles GET request",);
match get_dbs_and_roles(compute).await {
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
Err(_) => {
render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
(&Method::GET, "/database_schema") => {
let database = match must_get_query_param(&req, "database") {
Err(e) => return e.into_response(),
Ok(database) => database,
};
info!("serving /database_schema GET request with database: {database}",);
match get_database_schema(compute, &database).await {
Ok(res) => render_plain(Body::wrap_stream(res)),
Err(SchemaDumpError::DatabaseDoesNotExist) => {
render_json_error("database does not exist", StatusCode::NOT_FOUND)
}
Err(e) => {
error!("can't get schema dump: {}", e);
render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
(&Method::POST, "/grants") => {
info!("serving /grants POST request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for set_role_grants request: {:?}",
status
);
error!(msg);
return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
}
let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
let request = serde_json::from_slice::<SetRoleGrantsRequest>(&request).unwrap();
let res = compute
.set_role_grants(
&request.database,
&request.schema,
&request.privileges,
&request.role,
)
.await;
match res {
Ok(()) => render_json(Body::from(
serde_json::to_string(&SetRoleGrantsResponse {
database: request.database,
schema: request.schema,
role: request.role,
privileges: request.privileges,
})
.unwrap(),
)),
Err(e) => render_json_error(
&format!("could not grant role privileges to the schema: {e}"),
// TODO: can we filter on role/schema not found errors
// and return appropriate error code?
StatusCode::INTERNAL_SERVER_ERROR,
),
}
}
// get the list of installed extensions
// currently only used in python tests
// TODO: call it from cplane
(&Method::GET, "/installed_extensions") => {
info!("serving /installed_extensions GET request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for extensions request: {:?}",
status
);
error!(msg);
return Response::new(Body::from(msg));
}
let conf = compute.get_conn_conf(None);
let res =
task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf))
.await
.unwrap();
match res {
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
Err(e) => render_json_error(
&format!("could not get list of installed extensions: {}", e),
StatusCode::INTERNAL_SERVER_ERROR,
),
}
}
(&Method::POST, "/failpoints") if cfg!(feature = "testing") => {
match failpoints_handler(req, CancellationToken::new()).await {
Ok(r) => r,
Err(ApiError::BadRequest(e)) => {
render_json_error(&e.to_string(), StatusCode::BAD_REQUEST)
}
Err(_) => {
render_json_error("Internal server error", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
// download extension files from remote extension storage on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route);
info!("req.uri {:?}", req.uri());
// don't even try to download extensions
// if no remote storage is configured
if compute.ext_remote_storage.is_none() {
info!("no extensions remote storage configured");
let mut resp = Response::new(Body::from("no remote storage configured"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return resp;
}
let mut is_library = false;
if let Some(params) = req.uri().query() {
info!("serving {:?} POST request with params: {}", route, params);
if params == "is_library=true" {
is_library = true;
} else {
let mut resp = Response::new(Body::from("Wrong request parameters"));
*resp.status_mut() = StatusCode::BAD_REQUEST;
return resp;
}
}
let filename = route.split('/').last().unwrap().to_string();
info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
// get ext_name and path from spec
// don't lock compute_state for too long
let ext = {
let compute_state = compute.state.lock().unwrap();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
let spec = &pspec.spec;
// debug only
info!("spec: {:?}", spec);
let remote_extensions = match spec.remote_extensions.as_ref() {
Some(r) => r,
None => {
info!("no remote extensions spec was provided");
let mut resp = Response::new(Body::from("no remote storage configured"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return resp;
}
};
remote_extensions.get_ext(
&filename,
is_library,
&compute.build_tag,
&compute.pgversion,
)
};
match ext {
Ok((ext_name, ext_path)) => {
match compute.download_extension(ext_name, ext_path).await {
Ok(_) => Response::new(Body::from("OK")),
Err(e) => {
error!("extension download failed: {}", e);
let mut resp = Response::new(Body::from(e.to_string()));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
}
}
Err(e) => {
warn!("extension download failed to find extension: {}", e);
let mut resp = Response::new(Body::from("failed to find file"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
}
}
// Return the `404 Not Found` for any other routes.
_ => {
let mut not_found = Response::new(Body::from("404 Not Found"));
*not_found.status_mut() = StatusCode::NOT_FOUND;
not_found
}
}
}
async fn handle_configure_request(
req: Request<Body>,
compute: &Arc<ComputeNode>,
) -> Result<String, (String, StatusCode)> {
if !compute.live_config_allowed {
return Err((
"live configuration is not allowed for this compute node".to_string(),
StatusCode::PRECONDITION_FAILED,
));
}
let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap();
let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap();
if let Ok(request) = serde_json::from_str::<ConfigurationRequest>(&spec_raw) {
let spec = request.spec;
let parsed_spec = match ParsedSpec::try_from(spec) {
Ok(ps) => ps,
Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
};
// XXX: wrap state update under lock in code blocks. Otherwise,
// we will try to `Send` `mut state` into the spawned thread
// below, which will cause this error:
// ```
// error: future cannot be sent between threads safely
// ```
{
let mut state = compute.state.lock().unwrap();
if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for configuration request: {:?}",
state.status.clone()
);
return Err((msg, StatusCode::PRECONDITION_FAILED));
}
state.pspec = Some(parsed_spec);
state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
drop(state);
info!("set new spec and notified waiters");
}
// Spawn a blocking thread to wait for compute to become Running.
// This is needed so we do not block the main pool of workers and
// can serve other requests while some particular request
// is waiting for compute to finish configuration.
let c = compute.clone();
task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Running {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become Running, current status: {:?}",
state.status
);
if state.status == ComputeStatus::Failed {
let err = state.error.as_ref().map_or("unknown error", |x| x);
let msg = format!("compute configuration failed: {:?}", err);
return Err((msg, StatusCode::INTERNAL_SERVER_ERROR));
}
}
Ok(())
})
.await
.unwrap()?;
// Return current compute state if everything went well.
let state = compute.state.lock().unwrap().clone();
let status_response = status_response_from_state(&state);
Ok(serde_json::to_string(&status_response).unwrap())
} else {
Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST))
}
}
fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
let error = GenericAPIError {
error: e.to_string(),
};
Response::builder()
.status(status)
.header(CONTENT_TYPE, "application/json")
.body(Body::from(serde_json::to_string(&error).unwrap()))
.unwrap()
}
fn render_json(body: Body) -> Response<Body> {
Response::builder()
.header(CONTENT_TYPE, "application/json")
.body(body)
.unwrap()
}
fn render_plain(body: Body) -> Response<Body> {
Response::builder()
.header(CONTENT_TYPE, "text/plain")
.body(body)
.unwrap()
}
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
{
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::Terminated {
return Ok(());
}
if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for termination request: {}",
state.status
);
return Err((msg, StatusCode::PRECONDITION_FAILED));
}
state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
drop(state);
}
forward_termination_signal();
info!("sent signal and notified waiters");
// Spawn a blocking thread to wait for compute to become Terminated.
// This is needed so we do not block the main pool of workers and
// can serve other requests while some particular request
// is waiting for compute to finish termination.
let c = compute.clone();
task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Terminated {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become {}, current status: {:?}",
ComputeStatus::Terminated,
state.status
);
}
Ok(())
})
.await
.unwrap()?;
info!("terminated Postgres");
Ok(())
}
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
#[tokio::main]
async fn serve(port: u16, state: Arc<ComputeNode>) {
// this usually binds to both IPv4 and IPv6 on linux
// see e.g. https://github.com/rust-lang/rust/pull/34440
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
let make_service = make_service_fn(move |_conn| {
let state = state.clone();
async move {
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
let state = state.clone();
async move {
Ok::<_, Infallible>(
// NOTE: We include the URI path in the string. It
// doesn't contain any variable parts or sensitive
// information in this API.
tracing_utils::http::tracing_handler(
req,
|req| routes(req, &state),
OtelName::UriPath,
)
.await,
)
}
}))
}
});
info!("starting HTTP server on {}", addr);
let server = Server::bind(&addr).serve(make_service);
// Run this server forever
if let Err(e) = server.await {
error!("server error: {}", e);
}
}
/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
let state = Arc::clone(state);
Ok(thread::Builder::new()
.name("http-endpoint".into())
.spawn(move || serve(port, state))?)
}


@@ -0,0 +1,48 @@
use std::ops::{Deref, DerefMut};
use axum::{
async_trait,
extract::{rejection::JsonRejection, FromRequest, Request},
};
use compute_api::responses::GenericAPIError;
use http::StatusCode;
/// Custom `Json` extractor, so that we can format errors into
/// `JsonResponse<GenericAPIError>`.
#[derive(Debug, Clone, Copy, Default)]
pub(crate) struct Json<T>(pub T);
#[async_trait]
impl<S, T> FromRequest<S> for Json<T>
where
axum::Json<T>: FromRequest<S, Rejection = JsonRejection>,
S: Send + Sync,
{
type Rejection = (StatusCode, axum::Json<GenericAPIError>);
async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
match axum::Json::<T>::from_request(req, state).await {
Ok(value) => Ok(Self(value.0)),
Err(rejection) => Err((
rejection.status(),
axum::Json(GenericAPIError {
error: rejection.body_text().to_lowercase(),
}),
)),
}
}
}
impl<T> Deref for Json<T> {
type Target = T;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<T> DerefMut for Json<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
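The wrapper delegates to the stock axum extractor and only reshapes the rejection, so handlers use it exactly like axum::Json. A hypothetical usage sketch (the request type and route are illustrative, and the custom Json extractor above is assumed to be in scope):
use axum::{response::IntoResponse, routing::post, Router};
use http::StatusCode;
use serde::Deserialize;

#[derive(Deserialize)]
struct InstallRequest {
    extension: String,
    database: String,
}

// With the custom extractor, a malformed body yields a JSON GenericAPIError
// with the status code axum would have chosen, not a plain-text rejection.
async fn install(Json(req): Json<InstallRequest>) -> impl IntoResponse {
    (StatusCode::OK, format!("{} installed into {}", req.extension, req.database))
}

fn router() -> Router {
    Router::new().route("/extensions", post(install))
}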


@@ -0,0 +1,7 @@
pub(crate) mod json;
pub(crate) mod path;
pub(crate) mod query;
pub(crate) use json::Json;
pub(crate) use path::Path;
pub(crate) use query::Query;


@@ -0,0 +1,48 @@
use std::ops::{Deref, DerefMut};
use axum::{
async_trait,
extract::{rejection::PathRejection, FromRequestParts},
};
use compute_api::responses::GenericAPIError;
use http::{request::Parts, StatusCode};
/// Custom `Path` extractor, so that we can format errors into
/// `JsonResponse<GenericAPIError>`.
#[derive(Debug, Clone, Copy, Default)]
pub(crate) struct Path<T>(pub T);
#[async_trait]
impl<S, T> FromRequestParts<S> for Path<T>
where
axum::extract::Path<T>: FromRequestParts<S, Rejection = PathRejection>,
S: Send + Sync,
{
type Rejection = (StatusCode, axum::Json<GenericAPIError>);
async fn from_request_parts(parts: &mut Parts, state: &S) -> Result<Self, Self::Rejection> {
match axum::extract::Path::<T>::from_request_parts(parts, state).await {
Ok(value) => Ok(Self(value.0)),
Err(rejection) => Err((
rejection.status(),
axum::Json(GenericAPIError {
error: rejection.body_text().to_ascii_lowercase(),
}),
)),
}
}
}
impl<T> Deref for Path<T> {
type Target = T;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<T> DerefMut for Path<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}


@@ -0,0 +1,48 @@
use std::ops::{Deref, DerefMut};
use axum::{
async_trait,
extract::{rejection::QueryRejection, FromRequestParts},
};
use compute_api::responses::GenericAPIError;
use http::{request::Parts, StatusCode};
/// Custom `Query` extractor, so that we can format errors into
/// `JsonResponse<GenericAPIError>`.
#[derive(Debug, Clone, Copy, Default)]
pub(crate) struct Query<T>(pub T);
#[async_trait]
impl<S, T> FromRequestParts<S> for Query<T>
where
axum::extract::Query<T>: FromRequestParts<S, Rejection = QueryRejection>,
S: Send + Sync,
{
type Rejection = (StatusCode, axum::Json<GenericAPIError>);
async fn from_request_parts(parts: &mut Parts, state: &S) -> Result<Self, Self::Rejection> {
match axum::extract::Query::<T>::from_request_parts(parts, state).await {
Ok(value) => Ok(Self(value.0)),
Err(rejection) => Err((
rejection.status(),
axum::Json(GenericAPIError {
error: rejection.body_text().to_ascii_lowercase(),
}),
)),
}
}
}
impl<T> Deref for Query<T> {
type Target = T;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<T> DerefMut for Query<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}

View File

@@ -1 +1,56 @@
pub mod api;
use axum::{body::Body, response::Response};
use compute_api::responses::{ComputeStatus, GenericAPIError};
use http::{header::CONTENT_TYPE, StatusCode};
use serde::Serialize;
use tracing::error;
pub use server::launch_http_server;
mod extract;
mod routes;
mod server;
/// Convenience response builder for JSON responses
struct JsonResponse;
impl JsonResponse {
/// Helper for actually creating a response
fn create_response(code: StatusCode, body: impl Serialize) -> Response {
Response::builder()
.status(code)
.header(CONTENT_TYPE.as_str(), "application/json")
.body(Body::from(serde_json::to_string(&body).unwrap()))
.unwrap()
}
/// Create a successful response
pub(self) fn success(code: StatusCode, body: impl Serialize) -> Response {
assert!({
let code = code.as_u16();
(200..300).contains(&code)
});
Self::create_response(code, body)
}
/// Create an error response
pub(self) fn error(code: StatusCode, error: impl ToString) -> Response {
assert!(code.as_u16() >= 400);
let message = error.to_string();
error!(message);
Self::create_response(code, &GenericAPIError { error: message })
}
/// Create an error response related to the compute being in an invalid state
pub(self) fn invalid_status(status: ComputeStatus) -> Response {
Self::create_response(
StatusCode::PRECONDITION_FAILED,
&GenericAPIError {
error: format!("invalid compute status: {status}"),
},
)
}
}
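A hypothetical handler sketch showing how the three helpers combine; `do_work` is made up for illustration and assumed to return `Result<impl Serialize, anyhow::Error>`:

use std::sync::Arc;
use axum::extract::State;
use crate::compute::ComputeNode;

async fn example(State(compute): State<Arc<ComputeNode>>) -> Response {
    let status = compute.get_status();
    if status != ComputeStatus::Running {
        // 412 Precondition Failed with a JSON body naming the invalid status
        return JsonResponse::invalid_status(status);
    }
    match do_work().await {
        Ok(result) => JsonResponse::success(StatusCode::OK, result),
        Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
    }
}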

View File

@@ -37,7 +37,7 @@ paths:
schema:
$ref: "#/components/schemas/ComputeMetrics"
/metrics
/metrics:
get:
tags:
- Info

View File

@@ -0,0 +1,20 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::responses::ComputeStatus;
use http::StatusCode;
use crate::{checker::check_writability, compute::ComputeNode, http::JsonResponse};
/// Check that the compute is currently running and that Postgres is writable.
pub(in crate::http) async fn is_writable(State(compute): State<Arc<ComputeNode>>) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
}
match check_writability(&compute).await {
Ok(_) => JsonResponse::success(StatusCode::OK, true),
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
}
}

View File

@@ -0,0 +1,91 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::{
requests::ConfigurationRequest,
responses::{ComputeStatus, ComputeStatusResponse},
};
use http::StatusCode;
use tokio::task;
use tracing::info;
use crate::{
compute::{ComputeNode, ParsedSpec},
http::{extract::Json, JsonResponse},
};
// Accept a spec in JSON format and request compute configuration. If anything
// goes wrong after we set the compute status to `ConfigurationPending` and
// update the compute state with the new spec, we leave the compute in a
// potentially inconsistent state. That said, it's the control plane's
// responsibility to watch the compute state after a reconfiguration request
// and to perform a clean restart in case of errors.
pub(in crate::http) async fn configure(
State(compute): State<Arc<ComputeNode>>,
request: Json<ConfigurationRequest>,
) -> Response {
if !compute.live_config_allowed {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"live configuration is not allowed for this compute node".to_string(),
);
}
let pspec = match ParsedSpec::try_from(request.spec.clone()) {
Ok(p) => p,
Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
};
// XXX: wrap state update under lock in a code block. Otherwise, we will try
// to `Send` `mut state` into the spawned thread below, which will cause
// the following rustc error:
//
// error: future cannot be sent between threads safely
{
let mut state = compute.state.lock().unwrap();
if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
return JsonResponse::invalid_status(state.status);
}
state.pspec = Some(pspec);
state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
drop(state);
}
// Spawn a blocking thread to wait for the compute to become Running. This is
// needed so we do not block the main pool of workers and can keep serving
// other requests while this particular request waits for the compute to
// finish configuration.
let c = compute.clone();
let completed = task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Running {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become {}, current status: {}",
ComputeStatus::Running,
state.status
);
if state.status == ComputeStatus::Failed {
let err = state.error.as_ref().map_or("unknown error", |x| x);
let msg = format!("compute configuration failed: {:?}", err);
return Err(msg);
}
}
Ok(())
})
.await
.unwrap();
if let Err(e) = completed {
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e);
}
// Return current compute state if everything went well.
let state = compute.state.lock().unwrap().clone();
let body = ComputeStatusResponse::from(&state);
JsonResponse::success(StatusCode::OK, body)
}
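The wait above is the standard `Mutex` + `Condvar` idiom; a self-contained sketch of the idiom, detached from `ComputeNode`, looks like this:

use std::sync::{Condvar, Mutex};

fn wait_until_ready(lock: &Mutex<bool>, cvar: &Condvar) {
    let mut ready = lock.lock().unwrap();
    while !*ready {
        // `wait` atomically releases the lock and re-acquires it on wakeup;
        // the surrounding loop guards against spurious wakeups.
        ready = cvar.wait(ready).unwrap();
    }
}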

View File

@@ -0,0 +1,34 @@
use std::sync::Arc;
use axum::{body::Body, extract::State, response::Response};
use http::{header::CONTENT_TYPE, StatusCode};
use serde::Deserialize;
use crate::{
catalog::{get_database_schema, SchemaDumpError},
compute::ComputeNode,
http::{extract::Query, JsonResponse},
};
#[derive(Debug, Clone, Deserialize)]
pub(in crate::http) struct DatabaseSchemaParams {
database: String,
}
/// Get a schema dump of the requested database.
pub(in crate::http) async fn get_schema_dump(
params: Query<DatabaseSchemaParams>,
State(compute): State<Arc<ComputeNode>>,
) -> Response {
match get_database_schema(&compute, &params.database).await {
Ok(schema) => Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE.as_str(), "application/json")
.body(Body::from_stream(schema))
.unwrap(),
Err(SchemaDumpError::DatabaseDoesNotExist) => {
JsonResponse::error(StatusCode::NOT_FOUND, SchemaDumpError::DatabaseDoesNotExist)
}
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
}
}

View File

@@ -0,0 +1,16 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use http::StatusCode;
use crate::{catalog::get_dbs_and_roles, compute::ComputeNode, http::JsonResponse};
/// Get the databases and roles from the compute.
pub(in crate::http) async fn get_catalog_objects(
State(compute): State<Arc<ComputeNode>>,
) -> Response {
match get_dbs_and_roles(&compute).await {
Ok(catalog_objects) => JsonResponse::success(StatusCode::OK, catalog_objects),
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
}
}

View File

@@ -0,0 +1,67 @@
use std::sync::Arc;
use axum::{
extract::State,
response::{IntoResponse, Response},
};
use http::StatusCode;
use serde::Deserialize;
use crate::{
compute::ComputeNode,
http::{
extract::{Path, Query},
JsonResponse,
},
};
#[derive(Debug, Clone, Deserialize)]
pub(in crate::http) struct ExtensionServerParams {
is_library: Option<bool>,
}
/// Download a remote extension.
pub(in crate::http) async fn download_extension(
Path(filename): Path<String>,
params: Query<ExtensionServerParams>,
State(compute): State<Arc<ComputeNode>>,
) -> Response {
// Don't even try to download extensions if no remote storage is configured
if compute.ext_remote_storage.is_none() {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"remote storage is not configured",
);
}
let ext = {
let state = compute.state.lock().unwrap();
let pspec = state.pspec.as_ref().unwrap();
let spec = &pspec.spec;
let remote_extensions = match spec.remote_extensions.as_ref() {
Some(r) => r,
None => {
return JsonResponse::error(
StatusCode::CONFLICT,
"information about remote extensions is unavailable",
);
}
};
remote_extensions.get_ext(
&filename,
params.is_library.unwrap_or(false),
&compute.build_tag,
&compute.pgversion,
)
};
match ext {
Ok((ext_name, ext_path)) => match compute.download_extension(ext_name, ext_path).await {
Ok(_) => StatusCode::OK.into_response(),
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
},
Err(e) => JsonResponse::error(StatusCode::NOT_FOUND, e),
}
}

View File

@@ -0,0 +1,45 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::{
requests::ExtensionInstallRequest,
responses::{ComputeStatus, ExtensionInstallResponse},
};
use http::StatusCode;
use crate::{
compute::ComputeNode,
http::{extract::Json, JsonResponse},
};
/// Install an extension.
pub(in crate::http) async fn install_extension(
State(compute): State<Arc<ComputeNode>>,
request: Json<ExtensionInstallRequest>,
) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
}
match compute
.install_extension(
&request.extension,
&request.database,
request.version.to_string(),
)
.await
{
Ok(version) => JsonResponse::success(
StatusCode::CREATED,
Some(ExtensionInstallResponse {
extension: request.extension.clone(),
version,
}),
),
Err(e) => JsonResponse::error(
StatusCode::INTERNAL_SERVER_ERROR,
format!("failed to install extension: {e}"),
),
}
}

View File

@@ -0,0 +1,35 @@
use axum::response::{IntoResponse, Response};
use http::StatusCode;
use tracing::info;
use utils::failpoint_support::{apply_failpoint, ConfigureFailpointsRequest};
use crate::http::{extract::Json, JsonResponse};
/// Configure failpoints for testing purposes.
pub(in crate::http) async fn configure_failpoints(
failpoints: Json<ConfigureFailpointsRequest>,
) -> Response {
if !fail::has_failpoints() {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"Cannot manage failpoints because neon was compiled without failpoints support",
);
}
for fp in &*failpoints {
info!("cfg failpoint: {} {}", fp.name, fp.actions);
// We recognize one extra "action" that's not natively recognized
// by the failpoints crate: exit, to immediately kill the process
let cfg_result = apply_failpoint(&fp.name, &fp.actions);
if let Err(e) = cfg_result {
return JsonResponse::error(
StatusCode::BAD_REQUEST,
format!("failed to configure failpoints: {e}"),
);
}
}
StatusCode::OK.into_response()
}

View File

@@ -0,0 +1,48 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::{
requests::SetRoleGrantsRequest,
responses::{ComputeStatus, SetRoleGrantsResponse},
};
use http::StatusCode;
use crate::{
compute::ComputeNode,
http::{extract::Json, JsonResponse},
};
/// Add grants for a role.
pub(in crate::http) async fn add_grant(
State(compute): State<Arc<ComputeNode>>,
request: Json<SetRoleGrantsRequest>,
) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
}
match compute
.set_role_grants(
&request.database,
&request.schema,
&request.privileges,
&request.role,
)
.await
{
Ok(()) => JsonResponse::success(
StatusCode::CREATED,
Some(SetRoleGrantsResponse {
database: request.database.clone(),
schema: request.schema.clone(),
role: request.role.clone(),
privileges: request.privileges.clone(),
}),
),
Err(e) => JsonResponse::error(
StatusCode::INTERNAL_SERVER_ERROR,
format!("failed to grant role privileges to the schema: {e}"),
),
}
}

View File

@@ -0,0 +1,11 @@
use axum::response::Response;
use compute_api::responses::InfoResponse;
use http::StatusCode;
use crate::http::JsonResponse;
/// Get information about the physical characteristics of the compute.
pub(in crate::http) async fn get_info() -> Response {
let num_cpus = num_cpus::get_physical();
JsonResponse::success(StatusCode::OK, &InfoResponse { num_cpus })
}

View File

@@ -0,0 +1,18 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::responses::ComputeStatus;
use http::StatusCode;
use crate::{compute::ComputeNode, http::JsonResponse};
/// Collect current Postgres usage insights.
pub(in crate::http) async fn get_insights(State(compute): State<Arc<ComputeNode>>) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
}
let insights = compute.collect_insights().await;
JsonResponse::success(StatusCode::OK, insights)
}

View File

@@ -0,0 +1,33 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::responses::ComputeStatus;
use http::StatusCode;
use tokio::task;
use crate::{compute::ComputeNode, http::JsonResponse, installed_extensions};
/// Get a list of installed extensions.
pub(in crate::http) async fn get_installed_extensions(
State(compute): State<Arc<ComputeNode>>,
) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
}
let conf = compute.get_conn_conf(None);
let res = task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf))
.await
.unwrap();
match res {
Ok(installed_extensions) => {
JsonResponse::success(StatusCode::OK, Some(installed_extensions))
}
Err(e) => JsonResponse::error(
StatusCode::INTERNAL_SERVER_ERROR,
format!("failed to get list of installed extensions: {e}"),
),
}
}

View File

@@ -0,0 +1,32 @@
use axum::{body::Body, response::Response};
use http::header::CONTENT_TYPE;
use http::StatusCode;
use metrics::proto::MetricFamily;
use metrics::Encoder;
use metrics::TextEncoder;
use crate::{http::JsonResponse, installed_extensions};
/// Expose Prometheus metrics.
pub(in crate::http) async fn get_metrics() -> Response {
// When we call TextEncoder::encode() below, it will immediately return an
// error if a metric family has no metrics, so we need to preemptively
// filter out metric families with no metrics.
let metrics = installed_extensions::collect()
.into_iter()
.filter(|m| !m.get_metric().is_empty())
.collect::<Vec<MetricFamily>>();
let encoder = TextEncoder::new();
let mut buffer = vec![];
if let Err(e) = encoder.encode(&metrics, &mut buffer) {
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e);
}
Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, encoder.format_type())
.body(Body::from(buffer))
.unwrap()
}

View File

@@ -0,0 +1,12 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use http::StatusCode;
use crate::{compute::ComputeNode, http::JsonResponse};
/// Get startup metrics.
pub(in crate::http) async fn get_metrics(State(compute): State<Arc<ComputeNode>>) -> Response {
let metrics = compute.state.lock().unwrap().metrics.clone();
JsonResponse::success(StatusCode::OK, metrics)
}

View File

@@ -0,0 +1,38 @@
use compute_api::responses::ComputeStatusResponse;
use crate::compute::ComputeState;
pub(in crate::http) mod check_writability;
pub(in crate::http) mod configure;
pub(in crate::http) mod database_schema;
pub(in crate::http) mod dbs_and_roles;
pub(in crate::http) mod extension_server;
pub(in crate::http) mod extensions;
pub(in crate::http) mod failpoints;
pub(in crate::http) mod grants;
pub(in crate::http) mod info;
pub(in crate::http) mod insights;
pub(in crate::http) mod installed_extensions;
pub(in crate::http) mod metrics;
pub(in crate::http) mod metrics_json;
pub(in crate::http) mod status;
pub(in crate::http) mod terminate;
impl From<&ComputeState> for ComputeStatusResponse {
fn from(state: &ComputeState) -> Self {
ComputeStatusResponse {
start_time: state.start_time,
tenant: state
.pspec
.as_ref()
.map(|pspec| pspec.tenant_id.to_string()),
timeline: state
.pspec
.as_ref()
.map(|pspec| pspec.timeline_id.to_string()),
status: state.status,
last_active: state.last_active,
error: state.error.clone(),
}
}
}

View File

@@ -0,0 +1,14 @@
use std::{ops::Deref, sync::Arc};
use axum::{extract::State, http::StatusCode, response::Response};
use compute_api::responses::ComputeStatusResponse;
use crate::{compute::ComputeNode, http::JsonResponse};
/// Retrieve the state of the compute.
pub(in crate::http) async fn get_status(State(compute): State<Arc<ComputeNode>>) -> Response {
let state = compute.state.lock().unwrap();
let body = ComputeStatusResponse::from(state.deref());
JsonResponse::success(StatusCode::OK, body)
}

View File

@@ -0,0 +1,58 @@
use std::sync::Arc;
use axum::{
extract::State,
response::{IntoResponse, Response},
};
use compute_api::responses::ComputeStatus;
use http::StatusCode;
use tokio::task;
use tracing::info;
use crate::{
compute::{forward_termination_signal, ComputeNode},
http::JsonResponse,
};
/// Terminate the compute.
pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>) -> Response {
{
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::Terminated {
return StatusCode::CREATED.into_response();
}
if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
return JsonResponse::invalid_status(state.status);
}
state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
drop(state);
}
forward_termination_signal();
info!("sent signal and notified waiters");
// Spawn a blocking thread to wait for the compute to become Terminated.
// This is needed so we do not block the main pool of workers and can
// keep serving other requests while this particular request waits for
// the compute to finish terminating.
let c = compute.clone();
task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Terminated {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become {}, current status: {:?}",
ComputeStatus::Terminated,
state.status
);
}
})
.await
.unwrap();
info!("terminated Postgres");
StatusCode::OK.into_response()
}

View File

@@ -0,0 +1,165 @@
use std::{
net::{IpAddr, Ipv6Addr, SocketAddr},
sync::{
atomic::{AtomicU64, Ordering},
Arc,
},
thread,
time::Duration,
};
use anyhow::Result;
use axum::{
response::{IntoResponse, Response},
routing::{get, post},
Router,
};
use http::StatusCode;
use tokio::net::TcpListener;
use tower::ServiceBuilder;
use tower_http::{
request_id::{MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer},
trace::TraceLayer,
};
use tracing::{debug, error, info, Span};
use super::routes::{
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
grants, info as info_route, insights, installed_extensions, metrics, metrics_json, status,
terminate,
};
use crate::compute::ComputeNode;
async fn handle_404() -> Response {
StatusCode::NOT_FOUND.into_response()
}
#[derive(Clone, Default)]
struct ComputeMakeRequestId(Arc<AtomicU64>);
impl MakeRequestId for ComputeMakeRequestId {
fn make_request_id<B>(
&mut self,
_request: &http::Request<B>,
) -> Option<tower_http::request_id::RequestId> {
let request_id = self
.0
.fetch_add(1, Ordering::SeqCst)
.to_string()
.parse()
.unwrap();
Some(RequestId::new(request_id))
}
}
/// Run the HTTP server and wait on it forever.
#[tokio::main]
async fn serve(port: u16, compute: Arc<ComputeNode>) {
const X_REQUEST_ID: &str = "x-request-id";
let mut app = Router::new()
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route(
"/extension_server/*filename",
post(extension_server::download_extension),
)
.route("/extensions", post(extensions::install_extension))
.route("/grants", post(grants::add_grant))
.route("/info", get(info_route::get_info))
.route("/insights", get(insights::get_insights))
.route(
"/installed_extensions",
get(installed_extensions::get_installed_extensions),
)
.route("/metrics", get(metrics::get_metrics))
.route("/metrics.json", get(metrics_json::get_metrics))
.route("/status", get(status::get_status))
.route("/terminate", post(terminate::terminate))
.fallback(handle_404)
.layer(
ServiceBuilder::new()
.layer(SetRequestIdLayer::x_request_id(
ComputeMakeRequestId::default(),
))
.layer(
TraceLayer::new_for_http()
.on_request(|request: &http::Request<_>, _span: &Span| {
let request_id = request
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
match request.uri().path() {
"/metrics" => {
debug!(%request_id, "{} {}", request.method(), request.uri())
}
_ => info!(%request_id, "{} {}", request.method(), request.uri()),
};
})
.on_response(
|response: &http::Response<_>, latency: Duration, _span: &Span| {
let request_id = response
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
info!(
%request_id,
code = response.status().as_u16(),
latency = latency.as_millis()
)
},
),
)
.layer(PropagateRequestIdLayer::x_request_id()),
)
.with_state(compute);
// Add in any testing support
if cfg!(feature = "testing") {
use super::routes::failpoints;
app = app.route("/failpoints", post(failpoints::configure_failpoints))
}
// This usually binds to both IPv4 and IPv6 on Linux, see
// https://github.com/rust-lang/rust/pull/34440 for more information
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
let listener = match TcpListener::bind(&addr).await {
Ok(listener) => listener,
Err(e) => {
error!(
"failed to bind the compute_ctl HTTP server to port {}: {}",
port, e
);
return;
}
};
if let Ok(local_addr) = listener.local_addr() {
info!("compute_ctl HTTP server listening on {}", local_addr);
} else {
info!("compute_ctl HTTP server listening on port {}", port);
}
if let Err(e) = axum::serve(listener, app).await {
error!("compute_ctl HTTP server error: {}", e);
}
}
/// Launch a separate HTTP server thread and return its `JoinHandle`.
pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
let state = Arc::clone(state);
Ok(thread::Builder::new()
.name("http-server".into())
.spawn(move || serve(port, state))?)
}
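A hypothetical caller sketch; the port number is arbitrary:

fn start(compute: Arc<ComputeNode>) -> Result<()> {
    // `serve` is annotated with #[tokio::main], so the spawned thread owns its
    // own runtime and the caller is free to continue with other work.
    let handle = launch_http_server(3080, &compute)?;
    handle.join().expect("http server thread panicked");
    Ok(())
}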

View File

@@ -3,8 +3,6 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
extern crate hyper0 as hyper;
pub mod checker;
pub mod config;
pub mod configurator;

View File

@@ -1,6 +1,6 @@
use anyhow::{Context, Result};
use fail::fail_point;
use postgres::Client;
use postgres::{Client, Transaction};
use tracing::info;
/// Runs a series of migrations on a target database
@@ -20,11 +20,9 @@ impl<'m> MigrationRunner<'m> {
/// Get the current value neon_migration.migration_id
fn get_migration_id(&mut self) -> Result<i64> {
let query = "SELECT id FROM neon_migration.migration_id";
let row = self
.client
.query_one(query, &[])
.context("run_migrations get migration_id")?;
.query_one("SELECT id FROM neon_migration.migration_id", &[])?;
Ok(row.get::<&str, i64>("id"))
}
@@ -34,7 +32,7 @@ impl<'m> MigrationRunner<'m> {
/// This function has a fail point called compute-migration, which can be
/// used if you would like to fail the application of a series of migrations
/// at some point.
fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
fn update_migration_id(txn: &mut Transaction, migration_id: i64) -> Result<()> {
// We use this fail point in order to check that failing in the
// middle of applying a series of migrations fails in an expected
// manner
@@ -55,12 +53,11 @@ impl<'m> MigrationRunner<'m> {
}
}
self.client
.query(
"UPDATE neon_migration.migration_id SET id = $1",
&[&migration_id],
)
.context("run_migrations update id")?;
txn.query(
"UPDATE neon_migration.migration_id SET id = $1",
&[&migration_id],
)
.with_context(|| format!("update neon_migration.migration_id to {migration_id}"))?;
Ok(())
}
@@ -81,53 +78,50 @@ impl<'m> MigrationRunner<'m> {
Ok(())
}
/// Run the configrured set of migrations
/// Run an individual migration
fn run_migration(txn: &mut Transaction, migration_id: i64, migration: &str) -> Result<()> {
if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", migration_id);
// Even though we are skipping the migration, we still update the
// migration ID: it keeps the logic easy to follow when reasoning
// about the state of a cluster.
Self::update_migration_id(txn, migration_id)?;
} else {
info!("Running migration id={}:\n{}\n", migration_id, migration);
txn.simple_query(migration)
.with_context(|| format!("apply migration {migration_id}"))?;
Self::update_migration_id(txn, migration_id)?;
}
Ok(())
}
/// Run the configured set of migrations
pub fn run_migrations(mut self) -> Result<()> {
self.prepare_database()?;
self.prepare_database()
.context("prepare database to handle migrations")?;
let mut current_migration = self.get_migration_id()? as usize;
while current_migration < self.migrations.len() {
macro_rules! migration_id {
($cm:expr) => {
($cm + 1) as i64
};
}
// The index lags the migration ID by 1, so the current migration
// ID is also the next index
let migration_id = (current_migration + 1) as i64;
let migration = self.migrations[current_migration];
let mut txn = self
.client
.transaction()
.with_context(|| format!("begin transaction for migration {migration_id}"))?;
if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", migration_id!(current_migration));
Self::run_migration(&mut txn, migration_id, self.migrations[current_migration])
.with_context(|| format!("running migration {migration_id}"))?;
// Even though we are skipping the migration, updating the
// migration ID should help keep logic easy to understand when
// trying to understand the state of a cluster.
self.update_migration_id(migration_id!(current_migration))?;
} else {
info!(
"Running migration id={}:\n{}\n",
migration_id!(current_migration),
migration
);
txn.commit()
.with_context(|| format!("commit transaction for migration {migration_id}"))?;
self.client
.simple_query("BEGIN")
.context("begin migration")?;
self.client.simple_query(migration).with_context(|| {
format!(
"run_migrations migration id={}",
migration_id!(current_migration)
)
})?;
self.update_migration_id(migration_id!(current_migration))?;
self.client
.simple_query("COMMIT")
.context("commit migration")?;
info!("Finished migration id={}", migration_id!(current_migration));
}
info!("Finished migration id={}", migration_id);
current_migration += 1;
}
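Condensed, the new per-migration transaction flow amounts to the sketch below; the `apply_all` wrapper and `migrations` slice are hypothetical, and the `-- SKIP` branch is omitted:

fn apply_all(client: &mut postgres::Client, migrations: &[&str]) -> anyhow::Result<()> {
    let row = client.query_one("SELECT id FROM neon_migration.migration_id", &[])?;
    let mut current = row.get::<&str, i64>("id") as usize;
    while current < migrations.len() {
        // Migration IDs are 1-based, so the array index lags the ID by one.
        let id = (current + 1) as i64;
        let mut txn = client.transaction()?;
        txn.simple_query(migrations[current])?;
        txn.query("UPDATE neon_migration.migration_id SET id = $1", &[&id])?;
        txn.commit()?;
        current += 1;
    }
    Ok(())
}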

View File

@@ -47,6 +47,7 @@ pub enum PerDatabasePhase {
DeleteDBRoleReferences,
ChangeSchemaPerms,
HandleAnonExtension,
DropSubscriptionsForDeletedDatabases,
}
#[derive(Clone, Debug)]
@@ -74,7 +75,7 @@ pub struct MutableApplyContext {
pub dbs: HashMap<String, Database>,
}
/// Appply the operations that belong to the given spec apply phase.
/// Apply the operations that belong to the given spec apply phase.
///
/// Commands within a single phase are executed in order of Iterator yield.
/// Commands of ApplySpecPhase::RunInEachDatabase will execute in the database
@@ -326,13 +327,12 @@ async fn get_operations<'a>(
// Use FORCE to drop database even if there are active connections.
// We run this from `cloud_admin`, so it should have enough privileges.
//
// NB: there could be other db states, which prevent us from dropping
// the database. For example, if db is used by any active subscription
// or replication slot.
// TODO: deal with it once we allow logical replication. Proper fix should
// involve returning an error code to the control plane, so it could
// figure out that this is a non-retryable error, return it to the user
// and fail operation permanently.
// Such cases are handled in the DropSubscriptionsForDeletedDatabases
// phase. We do all the cleanup before actually dropping the database.
let drop_db_query: String = format!(
"DROP DATABASE IF EXISTS {} WITH (FORCE)",
&op.name.pg_quote()
@@ -444,6 +444,30 @@ async fn get_operations<'a>(
}
ApplySpecPhase::RunInEachDatabase { db, subphase } => {
match subphase {
PerDatabasePhase::DropSubscriptionsForDeletedDatabases => {
match &db {
DB::UserDB(db) => {
let drop_subscription_query: String = format!(
include_str!("sql/drop_subscription_for_drop_dbs.sql"),
datname_str = escape_literal(&db.name),
);
let operations = vec![Operation {
query: drop_subscription_query,
comment: Some(format!(
"optionally dropping subscriptions for DB {}",
db.name,
)),
}]
.into_iter();
Ok(Box::new(operations))
}
// skip this cleanup for the system databases
// because users can't drop them
DB::SystemDB => Ok(Box::new(empty())),
}
}
PerDatabasePhase::DeleteDBRoleReferences => {
let ctx = ctx.read().await;
@@ -474,7 +498,19 @@ async fn get_operations<'a>(
),
comment: None,
},
// Revoke some potentially blocking privileges (Neon-specific currently)
Operation {
query: format!(
include_str!("sql/pre_drop_role_revoke_privileges.sql"),
role_name = quoted,
),
comment: None,
},
// This will now only drop the role's privileges
// TODO: this is obviously not 100% true because of the above case;
// there could still be some privileges that are not revoked. Maybe this
// only drops privileges that were granted *by* this role, not *to* this
// role, but this has to be checked.
Operation {
query: format!("DROP OWNED BY {}", quoted),
comment: None,

View File

@@ -0,0 +1,11 @@
DO $$
DECLARE
subname TEXT;
BEGIN
FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP
EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname);
EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
END LOOP;
END;
$$;

View File

@@ -0,0 +1,28 @@
SET SESSION ROLE neon_superuser;
DO $$
DECLARE
schema TEXT;
revoke_query TEXT;
BEGIN
FOR schema IN
SELECT schema_name
FROM information_schema.schemata
-- So far, we have only had issues with the 'public' schema, probably because we do some
-- additional grants, e.g., automatically making the DB owner the owner of the 'public'
-- schema (when created via the API).
-- See https://github.com/neondatabase/cloud/issues/13582 for the context.
-- Still, keep the loop because i) it efficiently handles the case where there is no
-- 'public' schema, and ii) it's easy to add more schemas to the list if needed.
WHERE schema_name IN ('public')
LOOP
revoke_query := format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;',
schema
);
EXECUTE revoke_query;
END LOOP;
END;
$$;
RESET ROLE;

View File

@@ -62,7 +62,7 @@ use crate::local_env::LocalEnv;
use crate::postgresql_conf::PostgresConf;
use crate::storage_controller::StorageController;
use compute_api::responses::{ComputeState, ComputeStatus};
use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
// contents of an endpoint.json file
@@ -739,7 +739,7 @@ impl Endpoint {
}
// Call the /status HTTP API
pub async fn get_status(&self) -> Result<ComputeState> {
pub async fn get_status(&self) -> Result<ComputeStatusResponse> {
let client = reqwest::Client::new();
let response = client

View File

@@ -1035,7 +1035,15 @@ async fn main() -> anyhow::Result<()> {
resp.sort_by(|a, b| a.id.cmp(&b.id));
let mut table = comfy_table::Table::new();
table.set_header(["Id", "Version", "Host", "Port", "Http Port", "AZ Id"]);
table.set_header([
"Id",
"Version",
"Host",
"Port",
"Http Port",
"AZ Id",
"Scheduling",
]);
for sk in resp {
table.add_row([
format!("{}", sk.id),
@@ -1043,7 +1051,8 @@ async fn main() -> anyhow::Result<()> {
sk.host,
format!("{}", sk.port),
format!("{}", sk.http_port),
sk.availability_zone_id.to_string(),
sk.availability_zone_id.clone(),
String::from(sk.scheduling_policy),
]);
}
println!("{table}");

View File

@@ -15,6 +15,17 @@ pub struct GenericAPIError {
pub error: String,
}
#[derive(Debug, Clone, Serialize)]
pub struct InfoResponse {
pub num_cpus: usize,
}
#[derive(Debug, Clone, Serialize)]
pub struct ExtensionInstallResponse {
pub extension: PgIdent,
pub version: ExtVersion,
}
/// Response of the /status API
#[derive(Serialize, Debug, Deserialize)]
#[serde(rename_all = "snake_case")]
@@ -28,16 +39,6 @@ pub struct ComputeStatusResponse {
pub error: Option<String>,
}
#[derive(Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub struct ComputeState {
pub status: ComputeStatus,
/// Timestamp of the last Postgres activity
#[serde(serialize_with = "rfc3339_serialize")]
pub last_active: Option<DateTime<Utc>>,
pub error: Option<String>,
}
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ComputeStatus {
@@ -78,7 +79,7 @@ impl Display for ComputeStatus {
}
}
fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
pub fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{

View File

@@ -320,6 +320,38 @@ impl From<NodeSchedulingPolicy> for String {
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum SkSchedulingPolicy {
Active,
Disabled,
Decomissioned,
}
impl FromStr for SkSchedulingPolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(match s {
"active" => Self::Active,
"disabled" => Self::Disabled,
"decomissioned" => Self::Decomissioned,
_ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
})
}
}
impl From<SkSchedulingPolicy> for String {
fn from(value: SkSchedulingPolicy) -> String {
use SkSchedulingPolicy::*;
match value {
Active => "active",
Disabled => "disabled",
Decomissioned => "decomissioned",
}
.to_string()
}
}
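A quick round-trip sketch of the new policy type, written as a hypothetical test:

#[test]
fn sk_scheduling_policy_round_trip() {
    use std::str::FromStr;

    let p = SkSchedulingPolicy::from_str("disabled").unwrap();
    assert_eq!(String::from(p), "disabled");
    // Unknown states are rejected with an error rather than silently defaulted.
    assert!(SkSchedulingPolicy::from_str("paused").is_err());
}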
/// Controls how tenant shards are mapped to locations on pageservers, e.g. whether
/// to create secondary locations.
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
@@ -387,6 +419,7 @@ pub struct SafekeeperDescribeResponse {
pub port: i32,
pub http_port: i32,
pub availability_zone_id: String,
pub scheduling_policy: SkSchedulingPolicy,
}
#[cfg(test)]

View File

@@ -1460,75 +1460,91 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
// difference in the responses between V1 and V2.
//
#[derive(Clone, Copy)]
// V3 of the protocol adds a request ID to all requests. The request ID, along with the
// other fields from the request, is also included in the response, which allows us to
// verify that a response matches our request. We copy the request fields into the response
// to make this check more reliable: the request ID is formed from the process ID and a
// local counter, so in principle there can be duplicate request IDs if a PID is reused.
//
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum PagestreamProtocolVersion {
V2,
V3,
}
#[derive(Debug, PartialEq, Eq)]
pub type RequestId = u64;
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct PagestreamRequest {
pub reqid: RequestId,
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
}
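A hypothetical illustration of the "process ID plus local counter" scheme the comment above describes; this is not the actual compute-side implementation:

use std::sync::atomic::{AtomicU64, Ordering};

fn make_reqid(counter: &AtomicU64) -> u64 {
    let pid = std::process::id() as u64; // PIDs can be reused across restarts
    let n = counter.fetch_add(1, Ordering::Relaxed) & 0xFFFF_FFFF;
    // Upper 32 bits: PID; lower 32 bits: counter. Request IDs are therefore not
    // guaranteed globally unique, which is why responses echo all header fields.
    (pid << 32) | n
}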
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct PagestreamExistsRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub hdr: PagestreamRequest,
pub rel: RelTag,
}
#[derive(Debug, PartialEq, Eq)]
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct PagestreamNblocksRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub hdr: PagestreamRequest,
pub rel: RelTag,
}
#[derive(Debug, PartialEq, Eq)]
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct PagestreamGetPageRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub hdr: PagestreamRequest,
pub rel: RelTag,
pub blkno: u32,
}
#[derive(Debug, PartialEq, Eq)]
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct PagestreamDbSizeRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub hdr: PagestreamRequest,
pub dbnode: u32,
}
#[derive(Debug, PartialEq, Eq)]
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct PagestreamGetSlruSegmentRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub hdr: PagestreamRequest,
pub kind: u8,
pub segno: u32,
}
#[derive(Debug)]
pub struct PagestreamExistsResponse {
pub req: PagestreamExistsRequest,
pub exists: bool,
}
#[derive(Debug)]
pub struct PagestreamNblocksResponse {
pub req: PagestreamNblocksRequest,
pub n_blocks: u32,
}
#[derive(Debug)]
pub struct PagestreamGetPageResponse {
pub req: PagestreamGetPageRequest,
pub page: Bytes,
}
#[derive(Debug)]
pub struct PagestreamGetSlruSegmentResponse {
pub req: PagestreamGetSlruSegmentRequest,
pub segment: Bytes,
}
#[derive(Debug)]
pub struct PagestreamErrorResponse {
pub req: PagestreamRequest,
pub message: String,
}
#[derive(Debug)]
pub struct PagestreamDbSizeResponse {
pub req: PagestreamDbSizeRequest,
pub db_size: i64,
}
@@ -1545,15 +1561,16 @@ pub struct TenantHistorySize {
impl PagestreamFeMessage {
/// Serialize a compute -> pageserver message. This is currently only used in testing
/// tools. Always uses protocol version 2.
/// tools. Always uses protocol version 3.
pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
match self {
Self::Exists(req) => {
bytes.put_u8(0);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -1562,8 +1579,9 @@ impl PagestreamFeMessage {
Self::Nblocks(req) => {
bytes.put_u8(1);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -1572,8 +1590,9 @@ impl PagestreamFeMessage {
Self::GetPage(req) => {
bytes.put_u8(2);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -1583,15 +1602,17 @@ impl PagestreamFeMessage {
Self::DbSize(req) => {
bytes.put_u8(3);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u32(req.dbnode);
}
Self::GetSlruSegment(req) => {
bytes.put_u8(4);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u8(req.kind);
bytes.put_u32(req.segno);
}
@@ -1600,21 +1621,35 @@ impl PagestreamFeMessage {
bytes.into()
}
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
pub fn parse<R: std::io::Read>(
body: &mut R,
protocol_version: PagestreamProtocolVersion,
) -> anyhow::Result<PagestreamFeMessage> {
// these correspond to the NeonMessageTag enum in pagestore_client.h
//
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
let msg_tag = body.read_u8()?;
// these header fields are the same for every request type
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
let (reqid, request_lsn, not_modified_since) = match protocol_version {
PagestreamProtocolVersion::V2 => (
0,
Lsn::from(body.read_u64::<BigEndian>()?),
Lsn::from(body.read_u64::<BigEndian>()?),
),
PagestreamProtocolVersion::V3 => (
body.read_u64::<BigEndian>()?,
Lsn::from(body.read_u64::<BigEndian>()?),
Lsn::from(body.read_u64::<BigEndian>()?),
),
};
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
request_lsn,
not_modified_since,
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
@@ -1623,8 +1658,11 @@ impl PagestreamFeMessage {
},
})),
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
request_lsn,
not_modified_since,
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
@@ -1633,8 +1671,11 @@ impl PagestreamFeMessage {
},
})),
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
request_lsn,
not_modified_since,
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
@@ -1644,14 +1685,20 @@ impl PagestreamFeMessage {
blkno: body.read_u32::<BigEndian>()?,
})),
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
request_lsn,
not_modified_since,
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
dbnode: body.read_u32::<BigEndian>()?,
})),
4 => Ok(PagestreamFeMessage::GetSlruSegment(
PagestreamGetSlruSegmentRequest {
request_lsn,
not_modified_since,
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
kind: body.read_u8()?,
segno: body.read_u32::<BigEndian>()?,
},
@@ -1662,43 +1709,114 @@ impl PagestreamFeMessage {
}
impl PagestreamBeMessage {
pub fn serialize(&self) -> Bytes {
pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes {
let mut bytes = BytesMut::new();
use PagestreamBeMessageTag as Tag;
match self {
Self::Exists(resp) => {
bytes.put_u8(Tag::Exists as u8);
bytes.put_u8(resp.exists as u8);
}
match protocol_version {
PagestreamProtocolVersion::V2 => {
match self {
Self::Exists(resp) => {
bytes.put_u8(Tag::Exists as u8);
bytes.put_u8(resp.exists as u8);
}
Self::Nblocks(resp) => {
bytes.put_u8(Tag::Nblocks as u8);
bytes.put_u32(resp.n_blocks);
}
Self::Nblocks(resp) => {
bytes.put_u8(Tag::Nblocks as u8);
bytes.put_u32(resp.n_blocks);
}
Self::GetPage(resp) => {
bytes.put_u8(Tag::GetPage as u8);
bytes.put(&resp.page[..]);
}
Self::GetPage(resp) => {
bytes.put_u8(Tag::GetPage as u8);
bytes.put(&resp.page[..])
}
Self::Error(resp) => {
bytes.put_u8(Tag::Error as u8);
bytes.put(resp.message.as_bytes());
bytes.put_u8(0); // null terminator
}
Self::DbSize(resp) => {
bytes.put_u8(Tag::DbSize as u8);
bytes.put_i64(resp.db_size);
}
Self::Error(resp) => {
bytes.put_u8(Tag::Error as u8);
bytes.put(resp.message.as_bytes());
bytes.put_u8(0); // null terminator
}
Self::DbSize(resp) => {
bytes.put_u8(Tag::DbSize as u8);
bytes.put_i64(resp.db_size);
}
Self::GetSlruSegment(resp) => {
bytes.put_u8(Tag::GetSlruSegment as u8);
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
Self::GetSlruSegment(resp) => {
bytes.put_u8(Tag::GetSlruSegment as u8);
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
}
}
}
PagestreamProtocolVersion::V3 => {
match self {
Self::Exists(resp) => {
bytes.put_u8(Tag::Exists as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u32(resp.req.rel.spcnode);
bytes.put_u32(resp.req.rel.dbnode);
bytes.put_u32(resp.req.rel.relnode);
bytes.put_u8(resp.req.rel.forknum);
bytes.put_u8(resp.exists as u8);
}
Self::Nblocks(resp) => {
bytes.put_u8(Tag::Nblocks as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u32(resp.req.rel.spcnode);
bytes.put_u32(resp.req.rel.dbnode);
bytes.put_u32(resp.req.rel.relnode);
bytes.put_u8(resp.req.rel.forknum);
bytes.put_u32(resp.n_blocks);
}
Self::GetPage(resp) => {
bytes.put_u8(Tag::GetPage as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u32(resp.req.rel.spcnode);
bytes.put_u32(resp.req.rel.dbnode);
bytes.put_u32(resp.req.rel.relnode);
bytes.put_u8(resp.req.rel.forknum);
bytes.put_u32(resp.req.blkno);
bytes.put(&resp.page[..])
}
Self::Error(resp) => {
bytes.put_u8(Tag::Error as u8);
bytes.put_u64(resp.req.reqid);
bytes.put_u64(resp.req.request_lsn.0);
bytes.put_u64(resp.req.not_modified_since.0);
bytes.put(resp.message.as_bytes());
bytes.put_u8(0); // null terminator
}
Self::DbSize(resp) => {
bytes.put_u8(Tag::DbSize as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u32(resp.req.dbnode);
bytes.put_i64(resp.db_size);
}
Self::GetSlruSegment(resp) => {
bytes.put_u8(Tag::GetSlruSegment as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u8(resp.req.kind);
bytes.put_u32(resp.req.segno);
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
}
}
}
}
bytes.into()
}
@@ -1710,38 +1828,131 @@ impl PagestreamBeMessage {
let ok =
match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
Tag::Exists => {
let exists = buf.read_u8()?;
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let rel = RelTag {
spcnode: buf.read_u32::<BigEndian>()?,
dbnode: buf.read_u32::<BigEndian>()?,
relnode: buf.read_u32::<BigEndian>()?,
forknum: buf.read_u8()?,
};
let exists = buf.read_u8()? != 0;
Self::Exists(PagestreamExistsResponse {
exists: exists != 0,
req: PagestreamExistsRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel,
},
exists,
})
}
Tag::Nblocks => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let rel = RelTag {
spcnode: buf.read_u32::<BigEndian>()?,
dbnode: buf.read_u32::<BigEndian>()?,
relnode: buf.read_u32::<BigEndian>()?,
forknum: buf.read_u8()?,
};
let n_blocks = buf.read_u32::<BigEndian>()?;
Self::Nblocks(PagestreamNblocksResponse { n_blocks })
Self::Nblocks(PagestreamNblocksResponse {
req: PagestreamNblocksRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel,
},
n_blocks,
})
}
Tag::GetPage => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let rel = RelTag {
spcnode: buf.read_u32::<BigEndian>()?,
dbnode: buf.read_u32::<BigEndian>()?,
relnode: buf.read_u32::<BigEndian>()?,
forknum: buf.read_u8()?,
};
let blkno = buf.read_u32::<BigEndian>()?;
let mut page = vec![0; 8192]; // TODO: use MaybeUninit
buf.read_exact(&mut page)?;
PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
Self::GetPage(PagestreamGetPageResponse {
req: PagestreamGetPageRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel,
blkno,
},
page: page.into(),
})
}
Tag::Error => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let mut msg = Vec::new();
buf.read_until(0, &mut msg)?;
let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
let rust_str = cstring.to_str()?;
PagestreamBeMessage::Error(PagestreamErrorResponse {
Self::Error(PagestreamErrorResponse {
req: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
message: rust_str.to_owned(),
})
}
Tag::DbSize => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let dbnode = buf.read_u32::<BigEndian>()?;
let db_size = buf.read_i64::<BigEndian>()?;
Self::DbSize(PagestreamDbSizeResponse { db_size })
Self::DbSize(PagestreamDbSizeResponse {
req: PagestreamDbSizeRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
dbnode,
},
db_size,
})
}
Tag::GetSlruSegment => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let kind = buf.read_u8()?;
let segno = buf.read_u32::<BigEndian>()?;
let n_blocks = buf.read_u32::<BigEndian>()?;
let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
buf.read_exact(&mut segment)?;
Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
req: PagestreamGetSlruSegmentRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
kind,
segno,
},
segment: segment.into(),
})
}
@@ -1780,8 +1991,11 @@ mod tests {
// Test serialization/deserialization of PagestreamFeMessage
let messages = vec![
PagestreamFeMessage::Exists(PagestreamExistsRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
},
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -1790,8 +2004,11 @@ mod tests {
},
}),
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(4),
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(4),
not_modified_since: Lsn(4),
},
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -1800,8 +2017,11 @@ mod tests {
},
}),
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
},
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -1811,14 +2031,19 @@ mod tests {
blkno: 7,
}),
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
},
dbnode: 7,
}),
];
for msg in messages {
let bytes = msg.serialize();
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
let reconstructed =
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3)
.unwrap();
assert!(msg == reconstructed);
}
}

View File

@@ -115,13 +115,15 @@ fn default_max_keys_per_list_response() -> Option<i32> {
}
fn default_azure_conn_pool_size() -> usize {
// Conservative default: no connection pooling. At time of writing this is the Azure
// SDK's default as well, due to historic reports of hard-to-reproduce issues
// By default, the Azure SDK does no connection pooling, due to historic reports of hard-to-reproduce issues
// (https://github.com/hyperium/hyper/issues/2312)
//
// However, using connection pooling is important to avoid exhausting client ports when
// doing huge numbers of requests (https://github.com/neondatabase/cloud/issues/20971)
0
//
// We therefore enable a modest pool size by default: this may be configured to zero if
// issues like the alleged upstream hyper issue appear.
8
}
impl Debug for S3Config {

View File

@@ -38,7 +38,6 @@ pub mod http;
use opentelemetry::trace::TracerProvider;
use opentelemetry::KeyValue;
use opentelemetry_sdk::Resource;
use tracing::Subscriber;
use tracing_subscriber::registry::LookupSpan;
use tracing_subscriber::Layer;
@@ -121,7 +120,10 @@ where
S: Subscriber + for<'span> LookupSpan<'span>,
{
// Sets up exporter from the OTEL_EXPORTER_* environment variables.
let exporter = opentelemetry_otlp::new_exporter().http();
let exporter = opentelemetry_otlp::SpanExporter::builder()
.with_http()
.build()
.expect("could not initialize opentelemetry exporter");
// TODO: opentelemetry::global::set_error_handler() with custom handler that
// bypasses default tracing layers, but logs regular looking log
@@ -132,17 +134,13 @@ where
opentelemetry_sdk::propagation::TraceContextPropagator::new(),
);
let tracer = opentelemetry_otlp::new_pipeline()
.tracing()
.with_exporter(exporter)
.with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource(
Resource::new(vec![KeyValue::new(
opentelemetry_semantic_conventions::resource::SERVICE_NAME,
service_name,
)]),
))
.install_batch(opentelemetry_sdk::runtime::Tokio)
.expect("could not initialize opentelemetry exporter")
let tracer = opentelemetry_sdk::trace::TracerProvider::builder()
.with_batch_exporter(exporter, opentelemetry_sdk::runtime::Tokio)
.with_resource(opentelemetry_sdk::Resource::new(vec![KeyValue::new(
opentelemetry_semantic_conventions::resource::SERVICE_NAME,
service_name,
)]))
.build()
.tracer("global");
tracing_opentelemetry::layer().with_tracer(tracer)

View File

@@ -26,6 +26,7 @@ git-version.workspace = true
hex = { workspace = true, features = ["serde"] }
humantime.workspace = true
hyper0 = { workspace = true, features = ["full"] }
inferno.workspace = true
itertools.workspace = true
fail.workspace = true
futures = { workspace = true }

View File

@@ -417,6 +417,7 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
enum Format {
Jemalloc,
Pprof,
Svg,
}
// Parameters.
@@ -424,9 +425,24 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
None => Format::Pprof,
Some("jemalloc") => Format::Jemalloc,
Some("pprof") => Format::Pprof,
Some("svg") => Format::Svg,
Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
};
// Functions and mappings to strip when symbolizing pprof profiles. If true,
// also remove child frames.
static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
vec![
(Regex::new("^__rust").unwrap(), false),
(Regex::new("^_start$").unwrap(), false),
(Regex::new("^irallocx_prof").unwrap(), true),
(Regex::new("^prof_alloc_prep").unwrap(), true),
(Regex::new("^std::rt::lang_start").unwrap(), false),
(Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
]
});
const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"];
// Obtain profiler handle.
let mut prof_ctl = jemalloc_pprof::PROF_CTL
.as_ref()
@@ -464,24 +480,9 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
// Symbolize the profile.
// TODO: consider moving this upstream to jemalloc_pprof and avoiding the
// serialization roundtrip.
static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
// Functions to strip from profiles. If true, also remove child frames.
vec![
(Regex::new("^__rust").unwrap(), false),
(Regex::new("^_start$").unwrap(), false),
(Regex::new("^irallocx_prof").unwrap(), true),
(Regex::new("^prof_alloc_prep").unwrap(), true),
(Regex::new("^std::rt::lang_start").unwrap(), false),
(Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
]
});
let profile = pprof::decode(&bytes)?;
let profile = pprof::symbolize(profile)?;
let profile = pprof::strip_locations(
profile,
&["libc", "libgcc", "pthread", "vdso"],
&STRIP_FUNCTIONS,
);
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
pprof::encode(&profile)
})
.await
@@ -494,6 +495,27 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
.body(Body::from(data))
.map_err(|err| ApiError::InternalServerError(err.into()))
}
Format::Svg => {
let body = tokio::task::spawn_blocking(move || {
let bytes = prof_ctl.dump_pprof()?;
let profile = pprof::decode(&bytes)?;
let profile = pprof::symbolize(profile)?;
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
let mut opts = inferno::flamegraph::Options::default();
opts.title = "Heap inuse".to_string();
opts.count_name = "bytes".to_string();
pprof::flamegraph(profile, &mut opts)
})
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
Response::builder()
.status(200)
.header(CONTENT_TYPE, "image/svg+xml")
.body(Body::from(body))
.map_err(|err| ApiError::InternalServerError(err.into()))
}
}
}

View File

@@ -1,8 +1,9 @@
use anyhow::bail;
use flate2::write::{GzDecoder, GzEncoder};
use flate2::Compression;
use itertools::Itertools as _;
use once_cell::sync::Lazy;
use pprof::protos::{Function, Line, Message as _, Profile};
use pprof::protos::{Function, Line, Location, Message as _, Profile};
use regex::Regex;
use std::borrow::Cow;
@@ -188,3 +189,59 @@ pub fn strip_locations(
profile
}
/// Generates an SVG flamegraph from a symbolized pprof profile.
pub fn flamegraph(
profile: Profile,
opts: &mut inferno::flamegraph::Options,
) -> anyhow::Result<Vec<u8>> {
if profile.mapping.iter().any(|m| !m.has_functions) {
bail!("profile not symbolized");
}
// Index locations, functions, and strings.
let locations: HashMap<u64, Location> =
profile.location.into_iter().map(|l| (l.id, l)).collect();
let functions: HashMap<u64, Function> =
profile.function.into_iter().map(|f| (f.id, f)).collect();
let strings = profile.string_table;
// Resolve stacks as function names, and sum sample values per stack. Also reverse the stack,
// since inferno expects it bottom-up.
let mut stacks: HashMap<Vec<&str>, i64> = HashMap::new();
for sample in profile.sample {
let mut stack = Vec::with_capacity(sample.location_id.len());
for location in sample.location_id.into_iter().rev() {
let Some(location) = locations.get(&location) else {
bail!("missing location {location}");
};
for line in location.line.iter().rev() {
let Some(function) = functions.get(&line.function_id) else {
bail!("missing function {}", line.function_id);
};
let Some(name) = strings.get(function.name as usize) else {
bail!("missing string {}", function.name);
};
stack.push(name.as_str());
}
}
let Some(&value) = sample.value.first() else {
bail!("missing value");
};
*stacks.entry(stack).or_default() += value;
}
// Construct stack lines for inferno.
let lines = stacks
.into_iter()
.map(|(stack, value)| (stack.into_iter().join(";"), value))
.map(|(stack, value)| format!("{stack} {value}"))
.sorted()
.collect_vec();
// Construct the flamegraph.
let mut bytes = Vec::new();
let lines = lines.iter().map(|line| line.as_str());
inferno::flamegraph::from_lines(opts, lines, &mut bytes)?;
Ok(bytes)
}
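
For context, a minimal usage sketch of the folded-stack format this function feeds to inferno (frame names invented; the Options fields mirror the SVG handler above):

// inferno consumes "folded" lines: semicolon-joined frames, bottom frame
// first, followed by a space and the sample value for that stack.
fn render_demo_flamegraph() -> anyhow::Result<Vec<u8>> {
    let lines = [
        "main;handle_request;allocate 4096",
        "main;handle_request;serialize 1024",
        "main;idle 512",
    ];
    let mut opts = inferno::flamegraph::Options::default();
    opts.title = "Heap inuse".to_string();
    opts.count_name = "bytes".to_string();
    let mut svg = Vec::new();
    inferno::flamegraph::from_lines(&mut opts, lines.iter().copied(), &mut svg)?;
    Ok(svg)
}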

View File

@@ -96,7 +96,11 @@ impl<T: Send> Sender<T> {
}
}
State::SenderWaitsForReceiverToConsume(_data) => {
// We shouldn't be polled again until the receiver has consumed the value and woken us.
// SAFETY: send is single threaded due to `&mut self` requirement,
// therefore register is not concurrent.
unsafe {
self.state.wake_sender.register(cx.waker());
}
Poll::Pending
}
State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)),
@@ -449,4 +453,38 @@ mod tests {
let err = recv_task.await.unwrap().expect_err("should error");
assert!(matches!(err, RecvError::SenderGone));
}
#[tokio::test(start_paused = true)]
async fn test_receiver_drop_while_waiting_for_receiver_to_consume_unblocks_sender() {
let (mut sender, receiver) = channel();
let state = receiver.state.clone();
sender.send((), |_, _| unreachable!()).await.unwrap();
assert!(matches!(&*state.value.lock().unwrap(), &State::HasData(_)));
let unmergeable = sender.send((), |_, _| Err(()));
let mut unmergeable = std::pin::pin!(unmergeable);
tokio::select! {
_ = tokio::time::sleep(FOREVER) => {},
_ = &mut unmergeable => {
panic!("unmergeable should not complete");
},
}
assert!(matches!(
&*state.value.lock().unwrap(),
&State::SenderWaitsForReceiverToConsume(_)
));
drop(receiver);
assert!(matches!(
&*state.value.lock().unwrap(),
&State::ReceiverGone
));
unmergeable.await.unwrap_err();
}
}

View File

@@ -95,6 +95,14 @@ impl InterpretedWalRecord {
&& self.metadata_record.is_none()
&& matches!(self.flush_uncommitted, FlushUncommittedRecords::No)
}
/// Checks if the WAL record is observed (i.e. contains only metadata
/// for observed values)
pub fn is_observed(&self) -> bool {
self.batch.is_observed()
&& self.metadata_record.is_none()
&& matches!(self.flush_uncommitted, FlushUncommittedRecords::No)
}
}
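
For clarity, the batch-level predicates used here (defined in the SerializedValueBatch hunk below) partition a batch as follows; this table is a summary, not source text:

has_data | metadata empty | classification
---------|----------------|----------------------------------------------
false    | true           | is_empty: nothing to ingest
false    | false          | is_observed: metadata-only records (shard 0)
true     | either         | batch carries real value data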
/// The interpreted part of the Postgres WAL record which requires metadata

View File

@@ -501,6 +501,11 @@ impl SerializedValueBatch {
!self.has_data() && self.metadata.is_empty()
}
/// Checks if the batch contains only observed values
pub fn is_observed(&self) -> bool {
!self.has_data() && !self.metadata.is_empty()
}
/// Checks if the batch contains data
///
/// Note that if this returns false, it may still contain observed values or

View File

@@ -60,7 +60,7 @@ impl Client {
) -> anyhow::Result<PagestreamClient> {
let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
.client
.copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}"))
.copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}"))
.await?;
let Client {
cancel_on_client_drop,

View File

@@ -2,7 +2,7 @@ use anyhow::Context;
use camino::Utf8PathBuf;
use pageserver_api::key::Key;
use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::models::PagestreamGetPageRequest;
use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
use pageserver_api::shard::TenantShardId;
use tokio_util::sync::CancellationToken;
@@ -322,12 +322,15 @@ async fn main_impl(
.to_rel_block()
.expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
hdr: PagestreamRequest {
reqid: 0,
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
},
not_modified_since: r.timeline_lsn,
},
not_modified_since: r.timeline_lsn,
rel: rel_tag,
blkno: block_no,
}

View File

@@ -53,12 +53,10 @@ project_build_tag!(BUILD_TAG);
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
// TODO: disabled because concurrent CPU profiles cause seg faults. See:
// https://github.com/neondatabase/neon/issues/10225.
//#[allow(non_upper_case_globals)]
//#[export_name = "malloc_conf"]
//pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
#[allow(non_upper_case_globals)]
#[export_name = "malloc_conf"]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
const PID_FILE_NAME: &str = "pageserver.pid";

View File

@@ -1854,6 +1854,7 @@ pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
pub(crate) enum ComputeCommandKind {
PageStreamV3,
PageStreamV2,
Basebackup,
Fullbackup,
@@ -2337,13 +2338,15 @@ macro_rules! redo_bytes_histogram_count_buckets {
pub(crate) struct WalIngestMetrics {
pub(crate) bytes_received: IntCounter,
pub(crate) records_received: IntCounter,
pub(crate) records_observed: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
pub(crate) clear_vm_bits_unknown: IntCounterVec,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
WalIngestMetrics {
bytes_received: register_int_counter!(
"pageserver_wal_ingest_bytes_received",
"Bytes of WAL ingested from safekeepers",
@@ -2354,6 +2357,11 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
"Number of WAL records received from safekeepers"
)
.expect("failed to define a metric"),
records_observed: register_int_counter!(
"pageserver_wal_ingest_records_observed",
"Number of WAL records observed from safekeepers. These are metadata only records for shard 0."
)
.expect("failed to define a metric"),
records_committed: register_int_counter!(
"pageserver_wal_ingest_records_committed",
"Number of WAL records which resulted in writes to pageserver storage"
@@ -2375,6 +2383,7 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
&["entity"],
)
.expect("failed to define a metric"),
}
});
pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {

View File

@@ -17,7 +17,7 @@ use pageserver_api::models::{
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest,
PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse,
PagestreamProtocolVersion,
PagestreamProtocolVersion, PagestreamRequest,
};
use pageserver_api::shard::TenantShardId;
use postgres_backend::{
@@ -67,7 +67,7 @@ use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::{basebackup, timed_after_cancellation};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::reltag::SlruKind;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
@@ -537,6 +537,23 @@ impl From<WaitLsnError> for QueryError {
}
}
#[derive(thiserror::Error, Debug)]
struct BatchedPageStreamError {
req: PagestreamRequest,
err: PageStreamError,
}
impl std::fmt::Display for BatchedPageStreamError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.err.fmt(f)
}
}
struct BatchedGetPageRequest {
req: PagestreamGetPageRequest,
timer: SmgrOpTimer,
}
enum BatchedFeMessage {
Exists {
span: Span,
@@ -554,7 +571,7 @@ enum BatchedFeMessage {
span: Span,
shard: timeline::handle::Handle<TenantManagerTypes>,
effective_request_lsn: Lsn,
pages: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>,
pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
},
DbSize {
span: Span,
@@ -570,7 +587,7 @@ enum BatchedFeMessage {
},
RespondError {
span: Span,
error: PageStreamError,
error: BatchedPageStreamError,
},
}
@@ -595,12 +612,15 @@ impl BatchedFeMessage {
BatchedFeMessage::GetPage { shard, pages, .. } => (
shard,
pages.len(),
itertools::Either::Right(pages.iter_mut().map(|(_, _, timer)| timer)),
itertools::Either::Right(pages.iter_mut().map(|p| &mut p.timer)),
),
BatchedFeMessage::RespondError { .. } => return Ok(()),
};
let throttled = tokio::select! {
throttled = shard.pagestream_throttle.throttle(tokens) => { throttled }
_ = shard.cancel.cancelled() => {
return Err(QueryError::Shutdown);
}
_ = cancel.cancelled() => {
return Err(QueryError::Shutdown);
}
@@ -654,6 +674,7 @@ impl PageServerHandler {
)
}
#[allow(clippy::too_many_arguments)]
async fn pagestream_read_message<IO>(
pgb: &mut PostgresBackendReader<IO>,
tenant_id: TenantId,
@@ -661,6 +682,7 @@ impl PageServerHandler {
timeline_handles: &mut TimelineHandles,
cancel: &CancellationToken,
ctx: &RequestContext,
protocol_version: PagestreamProtocolVersion,
parent_span: Span,
) -> Result<Option<BatchedFeMessage>, QueryError>
where
@@ -695,11 +717,12 @@ impl PageServerHandler {
fail::fail_point!("ps::handle-pagerequest-message");
// parse request
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
let neon_fe_msg =
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
let batched_msg = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
let shard = timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Zero)
.instrument(span.clone()) // sets `shard_id` field
@@ -715,7 +738,7 @@ impl PageServerHandler {
}
}
PagestreamFeMessage::Nblocks(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
let shard = timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Zero)
.instrument(span.clone()) // sets `shard_id` field
@@ -731,7 +754,7 @@ impl PageServerHandler {
}
}
PagestreamFeMessage::DbSize(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn);
let shard = timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Zero)
.instrument(span.clone()) // sets `shard_id` field
@@ -747,7 +770,7 @@ impl PageServerHandler {
}
}
PagestreamFeMessage::GetSlruSegment(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn);
let shard = timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Zero)
.instrument(span.clone()) // sets `shard_id` field
@@ -762,25 +785,23 @@ impl PageServerHandler {
req,
}
}
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
request_lsn,
not_modified_since,
rel,
blkno,
}) => {
let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %request_lsn);
PagestreamFeMessage::GetPage(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %req.hdr.request_lsn);
macro_rules! respond_error {
($error:expr) => {{
let error = BatchedFeMessage::RespondError {
span,
error: $error,
error: BatchedPageStreamError {
req: req.hdr,
err: $error,
},
};
Ok(Some(error))
}};
}
let key = rel_block_to_key(rel, blkno);
let key = rel_block_to_key(req.rel, req.blkno);
let shard = match timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Page(key))
.instrument(span.clone()) // sets `shard_id` field
@@ -814,8 +835,8 @@ impl PageServerHandler {
let effective_request_lsn = match Self::wait_or_get_last_lsn(
&shard,
request_lsn,
not_modified_since,
req.hdr.request_lsn,
req.hdr.not_modified_since,
&shard.get_latest_gc_cutoff_lsn(),
ctx,
)
@@ -831,7 +852,7 @@ impl PageServerHandler {
span,
shard,
effective_request_lsn,
pages: smallvec::smallvec![(rel, blkno, timer)],
pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }],
}
}
};
@@ -910,6 +931,7 @@ impl PageServerHandler {
pgb_writer: &mut PostgresBackend<IO>,
batch: BatchedFeMessage,
cancel: &CancellationToken,
protocol_version: PagestreamProtocolVersion,
ctx: &RequestContext,
) -> Result<(), QueryError>
where
@@ -917,7 +939,7 @@ impl PageServerHandler {
{
// invoke handler function
let (handler_results, span): (
Vec<Result<(PagestreamBeMessage, SmgrOpTimer), PageStreamError>>,
Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>>,
_,
) = match batch {
BatchedFeMessage::Exists {
@@ -932,7 +954,8 @@ impl PageServerHandler {
.handle_get_rel_exists_request(&shard, &req, ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))],
.map(|msg| (msg, timer))
.map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
span,
)
}
@@ -948,7 +971,8 @@ impl PageServerHandler {
.handle_get_nblocks_request(&shard, &req, ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))],
.map(|msg| (msg, timer))
.map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
span,
)
}
@@ -990,7 +1014,8 @@ impl PageServerHandler {
.handle_db_size_request(&shard, &req, ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))],
.map(|msg| (msg, timer))
.map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
span,
)
}
@@ -1006,7 +1031,8 @@ impl PageServerHandler {
.handle_get_slru_segment_request(&shard, &req, ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))],
.map(|msg| (msg, timer))
.map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
span,
)
}
@@ -1022,7 +1048,7 @@ impl PageServerHandler {
// Other handler errors are sent back as an error message and we stay in pagestream protocol.
for handler_result in handler_results {
let (response_msg, timer) = match handler_result {
Err(e) => match &e {
Err(e) => match &e.err {
PageStreamError::Shutdown => {
// If we fail to fulfil a request during shutdown, which may be _because_ of
// shutdown, then do not send the error to the client. Instead just drop the
@@ -1041,13 +1067,14 @@ impl PageServerHandler {
// print the all details to the log with {:#}, but for the client the
// error message is enough. Do not log if shutting down, as the anyhow::Error
// here includes cancellation which is not an error.
let full = utils::error::report_compact_sources(&e);
let full = utils::error::report_compact_sources(&e.err);
span.in_scope(|| {
error!("error reading relation or page version: {full:#}")
});
(
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
req: e.req,
message: e.err.to_string(),
}),
None, // TODO: measure errors
)
@@ -1060,7 +1087,9 @@ impl PageServerHandler {
// marshal & transmit response message
//
pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
pgb_writer.write_message_noflush(&BeMessage::CopyData(
&response_msg.serialize(protocol_version),
))?;
// We purposefully don't count flush time into the timer.
//
@@ -1123,7 +1152,7 @@ impl PageServerHandler {
pgb: &mut PostgresBackend<IO>,
tenant_id: TenantId,
timeline_id: TimelineId,
_protocol_version: PagestreamProtocolVersion,
protocol_version: PagestreamProtocolVersion,
ctx: RequestContext,
) -> Result<(), QueryError>
where
@@ -1163,6 +1192,7 @@ impl PageServerHandler {
timeline_handles,
request_span,
pipelining_config,
protocol_version,
&ctx,
)
.await
@@ -1175,6 +1205,7 @@ impl PageServerHandler {
timeline_id,
timeline_handles,
request_span,
protocol_version,
&ctx,
)
.await
@@ -1201,6 +1232,7 @@ impl PageServerHandler {
timeline_id: TimelineId,
mut timeline_handles: TimelineHandles,
request_span: Span,
protocol_version: PagestreamProtocolVersion,
ctx: &RequestContext,
) -> (
(PostgresBackendReader<IO>, TimelineHandles),
@@ -1218,6 +1250,7 @@ impl PageServerHandler {
&mut timeline_handles,
&cancel,
ctx,
protocol_version,
request_span.clone(),
)
.await;
@@ -1238,7 +1271,7 @@ impl PageServerHandler {
}
let err = self
.pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx)
.pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx)
.await;
match err {
Ok(()) => {}
@@ -1261,6 +1294,7 @@ impl PageServerHandler {
mut timeline_handles: TimelineHandles,
request_span: Span,
pipelining_config: PageServicePipeliningConfigPipelined,
protocol_version: PagestreamProtocolVersion,
ctx: &RequestContext,
) -> (
(PostgresBackendReader<IO>, TimelineHandles),
@@ -1358,6 +1392,7 @@ impl PageServerHandler {
&mut timeline_handles,
&cancel_batcher,
&ctx,
protocol_version,
request_span.clone(),
)
.await;
@@ -1403,8 +1438,14 @@ impl PageServerHandler {
batch
.throttle_and_record_start_processing(&self.cancel)
.await?;
self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
.await?;
self.pagesteam_handle_batched_message(
pgb_writer,
batch,
&cancel,
protocol_version,
&ctx,
)
.await?;
}
}
});
@@ -1578,8 +1619,8 @@ impl PageServerHandler {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.request_lsn,
req.not_modified_since,
req.hdr.request_lsn,
req.hdr.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
@@ -1590,6 +1631,7 @@ impl PageServerHandler {
.await?;
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
req: *req,
exists,
}))
}
@@ -1604,8 +1646,8 @@ impl PageServerHandler {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.request_lsn,
req.not_modified_since,
req.hdr.request_lsn,
req.hdr.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
@@ -1616,6 +1658,7 @@ impl PageServerHandler {
.await?;
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
req: *req,
n_blocks,
}))
}
@@ -1630,8 +1673,8 @@ impl PageServerHandler {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.request_lsn,
req.not_modified_since,
req.hdr.request_lsn,
req.hdr.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
@@ -1643,6 +1686,7 @@ impl PageServerHandler {
let db_size = total_blocks as i64 * BLCKSZ as i64;
Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
req: *req,
db_size,
}))
}
@@ -1652,9 +1696,9 @@ impl PageServerHandler {
&mut self,
timeline: &Timeline,
effective_lsn: Lsn,
requests: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>,
requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
ctx: &RequestContext,
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), PageStreamError>> {
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
debug_assert_current_span_has_tenant_and_timeline_id();
timeline
@@ -1663,7 +1707,7 @@ impl PageServerHandler {
let results = timeline
.get_rel_page_at_lsn_batched(
requests.iter().map(|(reltag, blkno, _)| (reltag, blkno)),
requests.iter().map(|p| (&p.req.rel, &p.req.blkno)),
effective_lsn,
ctx,
)
@@ -1675,16 +1719,20 @@ impl PageServerHandler {
requests
.into_iter()
.zip(results.into_iter())
.map(|((_, _, timer), res)| {
.map(|(req, res)| {
res.map(|page| {
(
PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse {
req: req.req,
page,
}),
timer,
req.timer,
)
})
.map_err(PageStreamError::from)
.map_err(|e| BatchedPageStreamError {
err: PageStreamError::from(e),
req: req.req.hdr,
})
}),
)
}
@@ -1699,8 +1747,8 @@ impl PageServerHandler {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.request_lsn,
req.not_modified_since,
req.hdr.request_lsn,
req.hdr.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
@@ -1711,7 +1759,7 @@ impl PageServerHandler {
let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?;
Ok(PagestreamBeMessage::GetSlruSegment(
PagestreamGetSlruSegmentResponse { segment },
PagestreamGetSlruSegmentResponse { req: *req, segment },
))
}
@@ -1906,6 +1954,7 @@ struct FullBackupCmd {
struct PageStreamCmd {
tenant_id: TenantId,
timeline_id: TimelineId,
protocol_version: PagestreamProtocolVersion,
}
/// `lease lsn tenant timeline lsn`
@@ -1926,7 +1975,7 @@ enum PageServiceCmd {
}
impl PageStreamCmd {
fn parse(query: &str) -> anyhow::Result<Self> {
fn parse(query: &str, protocol_version: PagestreamProtocolVersion) -> anyhow::Result<Self> {
let parameters = query.split_whitespace().collect_vec();
if parameters.len() != 2 {
bail!(
@@ -1941,6 +1990,7 @@ impl PageStreamCmd {
Ok(Self {
tenant_id,
timeline_id,
protocol_version,
})
}
}
@@ -2078,7 +2128,14 @@ impl PageServiceCmd {
bail!("cannot parse query: {query}")
};
match cmd.to_ascii_lowercase().as_str() {
"pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(other)?)),
"pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(
other,
PagestreamProtocolVersion::V2,
)?)),
"pagestream_v3" => Ok(Self::PageStream(PageStreamCmd::parse(
other,
PagestreamProtocolVersion::V3,
)?)),
"basebackup" => Ok(Self::BaseBackup(BaseBackupCmd::parse(other)?)),
"fullbackup" => Ok(Self::FullBackup(FullBackupCmd::parse(other)?)),
"lease" => {
@@ -2160,25 +2217,21 @@ where
PageServiceCmd::PageStream(PageStreamCmd {
tenant_id,
timeline_id,
protocol_version,
}) => {
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
let command_kind = match protocol_version {
PagestreamProtocolVersion::V2 => ComputeCommandKind::PageStreamV2,
PagestreamProtocolVersion::V3 => ComputeCommandKind::PageStreamV3,
};
COMPUTE_COMMANDS_COUNTERS.for_command(command_kind).inc();
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::PageStreamV2)
.inc();
self.handle_pagerequests(
pgb,
tenant_id,
timeline_id,
PagestreamProtocolVersion::V2,
ctx,
)
.await?;
self.handle_pagerequests(pgb, tenant_id, timeline_id, protocol_version, ctx)
.await?;
}
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
@@ -2357,7 +2410,8 @@ mod tests {
cmd,
PageServiceCmd::PageStream(PageStreamCmd {
tenant_id,
timeline_id
timeline_id,
protocol_version: PagestreamProtocolVersion::V2,
})
);
let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id}")).unwrap();

View File

@@ -627,7 +627,7 @@ impl Timeline {
// cannot overflow, high and low are both smaller than u64::MAX / 2
let mid = (high + low) / 2;
let cmp = self
let cmp = match self
.is_latest_commit_timestamp_ge_than(
search_timestamp,
Lsn(mid * 8),
@@ -635,7 +635,16 @@ impl Timeline {
&mut found_larger,
ctx,
)
.await?;
.await
{
Ok(res) => res,
Err(PageReconstructError::MissingKey(e)) => {
warn!("Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. Last error: {:#}", e);
// Return that we didn't find any requests smaller than the LSN, and logging the error.
return Ok(LsnForTimestamp::Past(min_lsn));
}
Err(e) => return Err(e),
};
if cmp {
high = mid;
@@ -643,6 +652,7 @@ impl Timeline {
low = mid + 1;
}
}
// If `found_smaller == true`, then `low == t + 1`, where `t` is the target LSN:
// the LSN of the last commit record at or before `search_timestamp`.
// Subtract one from `low` to get `t`.
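
A standalone sketch of the same invariant (hypothetical, simplified; the real code probes commit timestamps at Lsn(mid * 8)): find the last position that is still strictly before the search timestamp.

fn last_below(mut low: u64, mut high: u64, ge_timestamp: impl Fn(u64) -> bool) -> Option<u64> {
    let start = low;
    while low < high {
        // Cannot overflow for the ranges used above (both < u64::MAX / 2).
        let mid = (high + low) / 2;
        if ge_timestamp(mid) {
            high = mid; // mid is at or past the timestamp: search below
        } else {
            low = mid + 1; // mid is before the timestamp: search above
        }
    }
    // The loop ends with low == t + 1, matching the comment above.
    (low > start).then(|| low - 1)
}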

View File

@@ -48,6 +48,7 @@ use timeline::compaction::GcCompactJob;
use timeline::compaction::ScheduledCompactionTask;
use timeline::import_pgdata;
use timeline::offload::offload_timeline;
use timeline::offload::OffloadError;
use timeline::CompactFlags;
use timeline::CompactOptions;
use timeline::CompactionError;
@@ -2039,7 +2040,7 @@ impl Tenant {
) -> Result<Arc<Timeline>, TimelineArchivalError> {
info!("unoffloading timeline");
// We activate the timeline below manually, so this must be called on an active timeline.
// We activate the timeline below manually, so this must be called on an active tenant.
// We expect callers of this function to ensure this.
match self.current_state() {
TenantState::Activating { .. }
@@ -3100,9 +3101,17 @@ impl Tenant {
};
has_pending_task |= pending_task_left.unwrap_or(false);
if pending_task_left == Some(false) && *can_offload {
offload_timeline(self, timeline)
pausable_failpoint!("before-timeline-auto-offload");
match offload_timeline(self, timeline)
.instrument(info_span!("offload_timeline", %timeline_id))
.await?;
.await
{
Err(OffloadError::NotArchived) => {
// Ignore this, we likely raced with unarchival
Ok(())
}
other => other,
}?;
}
}

View File

@@ -304,6 +304,15 @@ pub enum WaitCompletionError {
#[derive(Debug, thiserror::Error)]
#[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")]
pub struct UploadQueueNotReadyError;
#[derive(Debug, thiserror::Error)]
pub enum ShutdownIfArchivedError {
#[error(transparent)]
NotInitialized(NotInitialized),
#[error("timeline is not archived")]
NotArchived,
}
/// Behavioral modes that enable seamless live migration.
///
/// See docs/rfcs/028-pageserver-migration.md to understand how these fit in.
@@ -816,6 +825,55 @@ impl RemoteTimelineClient {
Ok(need_wait)
}
/// Shuts the timeline client down, but only if the timeline is archived.
///
/// This function and [`Self::schedule_index_upload_for_timeline_archival_state`] use the
/// same lock to prevent races between unarchival and offloading: unarchival requires the
/// upload queue to be initialized, and leaves behind an upload queue where either dirty
/// or clean has archived_at of `None`. Offloading leaves behind an uninitialized upload
/// queue.
pub(crate) async fn shutdown_if_archived(
self: &Arc<Self>,
) -> Result<(), ShutdownIfArchivedError> {
{
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard
.initialized_mut()
.map_err(ShutdownIfArchivedError::NotInitialized)?;
match (
upload_queue.dirty.archived_at.is_none(),
upload_queue.clean.0.archived_at.is_none(),
) {
// The expected case: the timeline is archived and we don't want to unarchive
(false, false) => {}
(true, false) => {
tracing::info!("can't shut down timeline: timeline slated for unarchival");
return Err(ShutdownIfArchivedError::NotArchived);
}
(dirty_archived, true) => {
tracing::info!(%dirty_archived, "can't shut down timeline: timeline not archived in remote storage");
return Err(ShutdownIfArchivedError::NotArchived);
}
}
// Set the shutting_down flag while the guard from the archival check is held.
// This prevents a race with unarchival, as initialized_mut will not return
// an upload queue from this point.
// Also launch the queued tasks like shutdown() does.
if !upload_queue.shutting_down {
upload_queue.shutting_down = true;
upload_queue.queued_operations.push_back(UploadOp::Shutdown);
// this operation is not counted similar to Barrier
self.launch_queued_tasks(upload_queue);
}
}
self.shutdown().await;
Ok(())
}
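
A minimal sketch of the locking protocol described in the doc comment above (assumed, simplified types; not the actual RemoteTimelineClient): the archived check and the shutting_down flag are both handled under one lock guard, so unarchival cannot re-initialize the queue between the two steps.

use std::sync::Mutex;

struct UploadQueueStub {
    archived: bool,
    shutting_down: bool,
}

fn shutdown_if_archived_stub(queue: &Mutex<UploadQueueStub>) -> Result<(), &'static str> {
    let mut q = queue.lock().unwrap();
    if !q.archived {
        return Err("not archived");
    }
    // Still holding the same guard as the check above: no race window.
    q.shutting_down = true;
    Ok(())
}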
/// Launch an index-file upload operation in the background, setting `import_pgdata` field.
pub(crate) fn schedule_index_upload_for_import_pgdata_state_update(
self: &Arc<Self>,

View File

@@ -194,7 +194,9 @@ impl DeleteTimelineFlow {
super::debug_assert_current_span_has_tenant_and_timeline_id();
let allow_offloaded_children = false;
let (timeline, mut guard) = Self::prepare(tenant, timeline_id, allow_offloaded_children)?;
let set_stopping = true;
let (timeline, mut guard) =
Self::prepare(tenant, timeline_id, allow_offloaded_children, set_stopping)?;
guard.mark_in_progress()?;
@@ -334,6 +336,7 @@ impl DeleteTimelineFlow {
tenant: &Tenant,
timeline_id: TimelineId,
allow_offloaded_children: bool,
set_stopping: bool,
) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
// Note the interaction between this guard and deletion guard.
// Here we attempt to lock deletion guard when we're holding a lock on timelines.
@@ -389,8 +392,10 @@ impl DeleteTimelineFlow {
}
};
if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
timeline.set_state(TimelineState::Stopping);
if set_stopping {
if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
timeline.set_state(TimelineState::Stopping);
}
}
Ok((timeline, delete_lock_guard))

View File

@@ -1,10 +1,11 @@
use std::sync::Arc;
use pageserver_api::models::TenantState;
use pageserver_api::models::{TenantState, TimelineState};
use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
use super::Timeline;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};
#[derive(thiserror::Error, Debug)]
@@ -36,28 +37,29 @@ pub(crate) async fn offload_timeline(
tracing::info!("offloading archived timeline");
let allow_offloaded_children = true;
let (timeline, guard) =
DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children)
.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
let set_stopping = false;
let (timeline, guard) = DeleteTimelineFlow::prepare(
tenant,
timeline.timeline_id,
allow_offloaded_children,
set_stopping,
)
.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
let TimelineOrOffloaded::Timeline(timeline) = timeline else {
tracing::error!("timeline already offloaded, but given timeline object");
return Ok(());
};
let is_archived = timeline.is_archived();
match is_archived {
Some(true) => (),
Some(false) => {
tracing::warn!("tried offloading a non-archived timeline");
return Err(OffloadError::NotArchived);
}
None => {
// This is legal: calls to this function can race with the timeline shutting down
tracing::info!("tried offloading a timeline whose remote storage is not initialized");
return Err(OffloadError::Cancelled);
match timeline.remote_client.shutdown_if_archived().await {
Ok(()) => {}
Err(ShutdownIfArchivedError::NotInitialized(_)) => {
// Either the timeline is being deleted, the operation is being retried, or we are shutting down.
// Don't return cancelled here to keep it idempotent.
}
Err(ShutdownIfArchivedError::NotArchived) => return Err(OffloadError::NotArchived),
}
timeline.set_state(TimelineState::Stopping);
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
timeline.shutdown(super::ShutdownMode::Reload).await;

View File

@@ -319,27 +319,11 @@ pub(super) async fn handle_walreceiver_connection(
return Ok(());
}
async fn commit(
modification: &mut DatadirModification<'_>,
uncommitted: &mut u64,
filtered: &mut u64,
ctx: &RequestContext,
) -> anyhow::Result<()> {
WAL_INGEST
.records_committed
.inc_by(*uncommitted - *filtered);
modification.commit(ctx).await?;
*uncommitted = 0;
*filtered = 0;
Ok(())
}
let status_update = match replication_message {
ReplicationMessage::RawInterpretedWalRecords(raw) => {
WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64);
let mut uncommitted_records = 0;
let mut filtered_records = 0;
// This is the end LSN of the raw WAL from which the records
// were interpreted.
@@ -380,31 +364,23 @@ pub(super) async fn handle_walreceiver_connection(
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
&& uncommitted_records > 0
{
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
modification.commit(&ctx).await?;
uncommitted_records = 0;
}
let local_next_record_lsn = interpreted.next_record_lsn;
let ingested = walingest
if interpreted.is_observed() {
WAL_INGEST.records_observed.inc();
}
walingest
.ingest_record(interpreted, &mut modification, &ctx)
.await
.with_context(|| {
format!("could not ingest record at {local_next_record_lsn}")
})?;
if !ingested {
tracing::debug!(
"ingest: filtered out record @ LSN {local_next_record_lsn}"
);
WAL_INGEST.records_filtered.inc();
filtered_records += 1;
}
uncommitted_records += 1;
// FIXME: this cannot be made pausable_failpoint without fixing the
@@ -418,13 +394,8 @@ pub(super) async fn handle_walreceiver_connection(
|| modification.approx_pending_bytes()
> DatadirModification::MAX_PENDING_BYTES
{
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
modification.commit(&ctx).await?;
uncommitted_records = 0;
}
}
@@ -442,13 +413,7 @@ pub(super) async fn handle_walreceiver_connection(
if uncommitted_records > 0 || needs_last_record_lsn_advance {
// Commit any uncommitted records
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
modification.commit(&ctx).await?;
}
if !caught_up && streaming_lsn >= end_of_wal {
@@ -469,6 +434,21 @@ pub(super) async fn handle_walreceiver_connection(
}
ReplicationMessage::XLogData(xlog_data) => {
async fn commit(
modification: &mut DatadirModification<'_>,
uncommitted: &mut u64,
filtered: &mut u64,
ctx: &RequestContext,
) -> anyhow::Result<()> {
WAL_INGEST
.records_committed
.inc_by(*uncommitted - *filtered);
modification.commit(ctx).await?;
*uncommitted = 0;
*filtered = 0;
Ok(())
}
// Pass the WAL data to the decoder, and see if we can decode
// more records as a result.
let data = xlog_data.data();

View File

@@ -556,6 +556,9 @@ pageserver_connect(shardno_t shard_no, int elevel)
switch (neon_protocol_version)
{
case 3:
pagestream_query = psprintf("pagestream_v3 %s %s", neon_tenant, neon_timeline);
break;
case 2:
pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
break;
@@ -1135,9 +1138,9 @@ pg_init_libpagestore(void)
"Version of compute<->page server protocol",
NULL,
&neon_protocol_version,
2, /* use protocol version 2 */
2, /* min */
2, /* max */
2, /* use protocol version 2 */
2, /* min */
3, /* max */
PGC_SU_BACKEND,
0, /* no flags required */
NULL, NULL, NULL);

View File

@@ -44,10 +44,15 @@ typedef enum
T_NeonGetSlruSegmentResponse,
} NeonMessageTag;
typedef uint64 NeonRequestId;
/* base struct for c-style inheritance */
typedef struct
{
NeonMessageTag tag;
NeonRequestId reqid;
XLogRecPtr lsn;
XLogRecPtr not_modified_since;
} NeonMessage;
#define messageTag(m) (((const NeonMessage *)(m))->tag)
@@ -67,6 +72,7 @@ typedef enum {
SLRU_MULTIXACT_OFFSETS
} SlruKind;
/*--
* supertype of all the Neon*Request structs below.
*
@@ -87,37 +93,37 @@ typedef enum {
*
* These structs describe the V2 versions of these requests. (The old, now-defunct
* V1 protocol contained just one LSN and a boolean 'latest' flag.)
*
* V3 of the protocol adds a request ID to every request. The response echoes this
* request ID, together with the other request fields, which lets the client verify
* that a response belongs to the request it sent. Copying the request fields into
* the response makes the check more reliable: the request ID is formed from the
* process ID and a local counter, so duplicate IDs are possible in principle if a
* PID is reused.
*/
typedef struct
{
NeonMessageTag tag;
XLogRecPtr lsn;
XLogRecPtr not_modified_since;
} NeonRequest;
typedef NeonMessage NeonRequest;
typedef struct
{
NeonRequest req;
NeonRequest hdr;
NRelFileInfo rinfo;
ForkNumber forknum;
} NeonExistsRequest;
typedef struct
{
NeonRequest req;
NeonRequest hdr;
NRelFileInfo rinfo;
ForkNumber forknum;
} NeonNblocksRequest;
typedef struct
{
NeonRequest req;
NeonRequest hdr;
Oid dbNode;
} NeonDbSizeRequest;
typedef struct
{
NeonRequest req;
NeonRequest hdr;
NRelFileInfo rinfo;
ForkNumber forknum;
BlockNumber blkno;
@@ -125,32 +131,29 @@ typedef struct
typedef struct
{
NeonRequest req;
SlruKind kind;
int segno;
NeonRequest hdr;
SlruKind kind;
int segno;
} NeonGetSlruSegmentRequest;
/* supertype of all the Neon*Response structs below */
typedef struct
{
NeonMessageTag tag;
} NeonResponse;
typedef NeonMessage NeonResponse;
typedef struct
{
NeonMessageTag tag;
NeonExistsRequest req;
bool exists;
} NeonExistsResponse;
typedef struct
{
NeonMessageTag tag;
NeonNblocksRequest req;
uint32 n_blocks;
} NeonNblocksResponse;
typedef struct
{
NeonMessageTag tag;
NeonGetPageRequest req;
char page[FLEXIBLE_ARRAY_MEMBER];
} NeonGetPageResponse;
@@ -158,21 +161,21 @@ typedef struct
typedef struct
{
NeonMessageTag tag;
NeonDbSizeRequest req;
int64 db_size;
} NeonDbSizeResponse;
typedef struct
{
NeonMessageTag tag;
NeonResponse req;
char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error
* message */
} NeonErrorResponse;
typedef struct
{
NeonMessageTag tag;
int n_blocks;
NeonGetSlruSegmentRequest req;
int n_blocks;
char data[BLCKSZ * SLRU_PAGES_PER_SEGMENT];
} NeonGetSlruSegmentResponse;

View File

@@ -120,6 +120,9 @@ static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block
static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum);
static uint32 local_request_counter;
#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter)
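
A sketch of the same scheme in Rust (names illustrative): the backend PID fills the upper 32 bits and a per-process counter the lower 32, so IDs can repeat only if the OS reuses a PID, which is exactly the caveat noted in the protocol header comment.

// Hypothetical Rust equivalent of GENERATE_REQUEST_ID above.
fn generate_request_id(pid: u32, local_counter: &mut u32) -> u64 {
    *local_counter = local_counter.wrapping_add(1);
    ((pid as u64) << 32) | (*local_counter as u64)
}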
/*
* Prefetch implementation:
*
@@ -188,15 +191,11 @@ typedef struct PrefetchRequest
uint8 status; /* see PrefetchStatus for valid values */
uint8 flags; /* see PrefetchRequestFlags */
neon_request_lsns request_lsns;
NeonRequestId reqid;
NeonResponse *response; /* may be null */
uint64 my_ring_index;
} PrefetchRequest;
StaticAssertDecl(sizeof(PrefetchRequest) == 64,
"We prefer to have a power-of-2 size for this struct. Please"
" try to find an alternative solution before reaching to"
" increase the expected size here");
/* prefetch buffer lookup hash table */
typedef struct PrfHashEntry
@@ -365,6 +364,7 @@ compact_prefetch_buffers(void)
target_slot->shard_no = source_slot->shard_no;
target_slot->status = source_slot->status;
target_slot->response = source_slot->response;
target_slot->reqid = source_slot->reqid;
target_slot->request_lsns = source_slot->request_lsns;
target_slot->my_ring_index = empty_ring_index;
@@ -798,7 +798,8 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
uint64 mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index;
NeonGetPageRequest request = {
.req.tag = T_NeonGetPageRequest,
.hdr.tag = T_NeonGetPageRequest,
.hdr.reqid = GENERATE_REQUEST_ID(),
/* lsn and not_modified_since are filled in below */
.rinfo = BufTagGetNRelFileInfo(slot->buftag),
.forknum = slot->buftag.forkNum,
@@ -807,14 +808,16 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
Assert(mySlotNo == MyPState->ring_unused);
slot->reqid = request.hdr.reqid;
if (force_request_lsns)
slot->request_lsns = *force_request_lsns;
else
neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
slot->buftag.forkNum, slot->buftag.blockNum,
&slot->request_lsns, 1, NULL);
request.req.lsn = slot->request_lsns.request_lsn;
request.req.not_modified_since = slot->request_lsns.not_modified_since;
request.hdr.lsn = slot->request_lsns.request_lsn;
request.hdr.not_modified_since = slot->request_lsns.not_modified_since;
Assert(slot->response == NULL);
Assert(slot->my_ring_index == MyPState->ring_unused);
@@ -1102,6 +1105,12 @@ Retry:
return min_ring_index;
}
static bool
equal_requests(NeonRequest* a, NeonRequest* b)
{
return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since;
}
/*
* Note: this function can get canceled and use a long jump to the next catch
@@ -1184,6 +1193,10 @@ nm_pack_request(NeonRequest *msg)
initStringInfo(&s);
pq_sendbyte(&s, msg->tag);
if (neon_protocol_version >= 3)
{
pq_sendint64(&s, msg->reqid);
}
pq_sendint64(&s, msg->lsn);
pq_sendint64(&s, msg->not_modified_since);
@@ -1261,8 +1274,16 @@ NeonResponse *
nm_unpack_response(StringInfo s)
{
NeonMessageTag tag = pq_getmsgbyte(s);
NeonResponse resp_hdr = {0}; /* make valgrind happy */
NeonResponse *resp = NULL;
resp_hdr.tag = tag;
if (neon_protocol_version >= 3)
{
resp_hdr.reqid = pq_getmsgint64(s);
resp_hdr.lsn = pq_getmsgint64(s);
resp_hdr.not_modified_since = pq_getmsgint64(s);
}
switch (tag)
{
/* pagestore -> pagestore_client */
@@ -1270,7 +1291,14 @@ nm_unpack_response(StringInfo s)
{
NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse));
msg_resp->tag = tag;
if (neon_protocol_version >= 3)
{
NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
msg_resp->req.forknum = pq_getmsgbyte(s);
}
msg_resp->req.hdr = resp_hdr;
msg_resp->exists = pq_getmsgbyte(s);
pq_getmsgend(s);
@@ -1282,7 +1310,14 @@ nm_unpack_response(StringInfo s)
{
NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse));
msg_resp->tag = tag;
if (neon_protocol_version >= 3)
{
NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
msg_resp->req.forknum = pq_getmsgbyte(s);
}
msg_resp->req.hdr = resp_hdr;
msg_resp->n_blocks = pq_getmsgint(s, 4);
pq_getmsgend(s);
@@ -1295,12 +1330,20 @@ nm_unpack_response(StringInfo s)
NeonGetPageResponse *msg_resp;
msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE);
msg_resp->tag = tag;
if (neon_protocol_version >= 3)
{
NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
msg_resp->req.forknum = pq_getmsgbyte(s);
msg_resp->req.blkno = pq_getmsgint(s, 4);
}
msg_resp->req.hdr = resp_hdr;
/* XXX: should be varlena */
memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
pq_getmsgend(s);
Assert(msg_resp->tag == T_NeonGetPageResponse);
Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse);
resp = (NeonResponse *) msg_resp;
break;
@@ -1310,7 +1353,11 @@ nm_unpack_response(StringInfo s)
{
NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse));
msg_resp->tag = tag;
if (neon_protocol_version >= 3)
{
msg_resp->req.dbNode = pq_getmsgint(s, 4);
}
msg_resp->req.hdr = resp_hdr;
msg_resp->db_size = pq_getmsgint64(s);
pq_getmsgend(s);
@@ -1328,7 +1375,7 @@ nm_unpack_response(StringInfo s)
msglen = strlen(msgtext);
msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1);
msg_resp->tag = tag;
msg_resp->req = resp_hdr;
memcpy(msg_resp->message, msgtext, msglen + 1);
pq_getmsgend(s);
@@ -1339,9 +1386,17 @@ nm_unpack_response(StringInfo s)
case T_NeonGetSlruSegmentResponse:
{
NeonGetSlruSegmentResponse *msg_resp;
int n_blocks = pq_getmsgint(s, 4);
msg_resp = palloc(sizeof(NeonGetSlruSegmentResponse));
msg_resp->tag = tag;
int n_blocks;
msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse));
if (neon_protocol_version >= 3)
{
msg_resp->req.kind = pq_getmsgbyte(s);
msg_resp->req.segno = pq_getmsgint(s, 4);
}
msg_resp->req.hdr = resp_hdr;
n_blocks = pq_getmsgint(s, 4);
msg_resp->n_blocks = n_blocks;
memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ);
pq_getmsgend(s);
@@ -1386,8 +1441,8 @@ nm_to_string(NeonMessage *msg)
appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\"");
appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
appendStringInfoChar(&s, '}');
break;
}
@@ -1399,8 +1454,8 @@ nm_to_string(NeonMessage *msg)
appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\"");
appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
appendStringInfoChar(&s, '}');
break;
}
@@ -1413,8 +1468,8 @@ nm_to_string(NeonMessage *msg)
appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
appendStringInfoChar(&s, '}');
break;
}
@@ -1424,8 +1479,8 @@ nm_to_string(NeonMessage *msg)
appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\"");
appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
appendStringInfoChar(&s, '}');
break;
}
@@ -1436,8 +1491,8 @@ nm_to_string(NeonMessage *msg)
appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\"");
appendStringInfo(&s, ", \"kind\": %u", msg_req->kind);
appendStringInfo(&s, ", \"segno\": %u", msg_req->segno);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
appendStringInfoChar(&s, '}');
break;
}
@@ -2312,39 +2367,64 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL);
{
NeonExistsRequest request = {
.req.tag = T_NeonExistsRequest,
.req.lsn = request_lsns.request_lsn,
.req.not_modified_since = request_lsns.not_modified_since,
.hdr.tag = T_NeonExistsRequest,
.hdr.reqid = GENERATE_REQUEST_ID(),
.hdr.lsn = request_lsns.request_lsn,
.hdr.not_modified_since = request_lsns.not_modified_since,
.rinfo = InfoFromSMgrRel(reln),
.forknum = forkNum
};
resp = page_server_request(&request);
switch (resp->tag)
{
case T_NeonExistsResponse:
{
NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp;
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr) ||
!RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) ||
exists_resp->req.forknum != request.forknum)
{
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum,
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum);
}
}
exists = exists_resp->exists;
break;
}
case T_NeonErrorResponse:
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr))
{
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
}
}
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
resp->reqid,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
}
pfree(resp);
}
switch (resp->tag)
{
case T_NeonExistsResponse:
exists = ((NeonExistsResponse *) resp)->exists;
break;
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
}
pfree(resp);
return exists;
}
@@ -2952,15 +3032,43 @@ Retry:
switch (resp->tag)
{
case T_NeonGetPageResponse:
memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ);
{
NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp;
if (neon_protocol_version >= 3)
{
if (resp->reqid != slot->reqid ||
resp->lsn != slot->request_lsns.request_lsn ||
resp->not_modified_since != slot->request_lsns.not_modified_since ||
!RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) ||
getpage_resp->req.forknum != forkNum ||
getpage_resp->req.blkno != base_blockno + i)
{
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno,
slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i);
}
}
memcpy(buffer, getpage_resp->page, BLCKSZ);
lfc_write(rinfo, forkNum, blockno, buffer);
break;
}
case T_NeonErrorResponse:
if (neon_protocol_version >= 3)
{
if (resp->reqid != slot->reqid ||
resp->lsn != slot->request_lsns.request_lsn ||
resp->not_modified_since != slot->request_lsns.not_modified_since)
{
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since));
}
}
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
slot->shard_no, blockno, RelFileInfoFmt(rinfo),
errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo),
forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
@@ -3443,47 +3551,72 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
{
NeonNblocksRequest request = {
.req.tag = T_NeonNblocksRequest,
.req.lsn = request_lsns.request_lsn,
.req.not_modified_since = request_lsns.not_modified_since,
.hdr.tag = T_NeonNblocksRequest,
.hdr.reqid = GENERATE_REQUEST_ID(),
.hdr.lsn = request_lsns.request_lsn,
.hdr.not_modified_since = request_lsns.not_modified_since,
.rinfo = InfoFromSMgrRel(reln),
.forknum = forknum,
};
resp = page_server_request(&request);
switch (resp->tag)
{
case T_NeonNblocksResponse:
{
NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp;
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr) ||
!RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) ||
relsize_resp->req.forknum != forknum)
{
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum,
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum);
}
}
n_blocks = relsize_resp->n_blocks;
break;
}
case T_NeonErrorResponse:
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr))
{
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
}
}
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
resp->reqid,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
}
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
n_blocks);
pfree(resp);
}
switch (resp->tag)
{
case T_NeonNblocksResponse:
n_blocks = ((NeonNblocksResponse *) resp)->n_blocks;
break;
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
}
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
n_blocks);
pfree(resp);
return n_blocks;
}
@@ -3503,40 +3636,64 @@ neon_dbsize(Oid dbNode)
{
NeonDbSizeRequest request = {
.req.tag = T_NeonDbSizeRequest,
.req.lsn = request_lsns.request_lsn,
.req.not_modified_since = request_lsns.not_modified_since,
.hdr.tag = T_NeonDbSizeRequest,
.hdr.reqid = GENERATE_REQUEST_ID(),
.hdr.lsn = request_lsns.request_lsn,
.hdr.not_modified_since = request_lsns.not_modified_since,
.dbNode = dbNode,
};
resp = page_server_request(&request);
switch (resp->tag)
{
case T_NeonDbSizeResponse:
{
NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp;
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr) ||
dbsize_resp->req.dbNode != dbNode)
{
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode,
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode);
}
}
db_size = dbsize_resp->db_size;
break;
}
case T_NeonErrorResponse:
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr))
{
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
}
}
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X",
resp->reqid,
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
}
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
pfree(resp);
}
switch (resp->tag)
{
case T_NeonDbSizeResponse:
db_size = ((NeonDbSizeResponse *) resp)->db_size;
break;
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
}
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
pfree(resp);
return db_size;
}
@@ -3868,16 +4025,17 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
return -1;
request = (NeonGetSlruSegmentRequest) {
.req.tag = T_NeonGetSlruSegmentRequest,
.req.lsn = request_lsn,
.req.not_modified_since = not_modified_since,
.hdr.tag = T_NeonGetSlruSegmentRequest,
.hdr.reqid = GENERATE_REQUEST_ID(),
.hdr.lsn = request_lsn,
.hdr.not_modified_since = not_modified_since,
.kind = kind,
.segno = segno
};
do
{
while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no));
while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no));
consume_prefetch_responses();
@@ -3887,14 +4045,38 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
switch (resp->tag)
{
case T_NeonGetSlruSegmentResponse:
n_blocks = ((NeonGetSlruSegmentResponse *) resp)->n_blocks;
memcpy(buffer, ((NeonGetSlruSegmentResponse *) resp)->data, n_blocks*BLCKSZ);
{
NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp;
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr) ||
slru_resp->req.kind != kind ||
slru_resp->req.segno != segno)
{
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno,
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, segno);
}
}
n_blocks = slru_resp->n_blocks;
memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ);
break;
}
case T_NeonErrorResponse:
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr))
{
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
}
}
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read SLRU %d segment %d at lsn %X/%08X",
errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %d at lsn %X/%08X",
resp->reqid,
kind,
segno,
LSN_FORMAT_ARGS(request_lsn)),
@@ -4033,8 +4215,9 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
NeonResponse *response;
NeonNblocksResponse *nbresponse;
NeonNblocksRequest request = {
.req = (NeonRequest) {
.hdr = (NeonRequest) {
.tag = T_NeonNblocksRequest,
.reqid = GENERATE_REQUEST_ID(),
.lsn = end_recptr,
.not_modified_since = end_recptr,
},

8
poetry.lock generated
View File

@@ -2028,13 +2028,13 @@ openapi-schema-validator = ">=0.4.2,<0.5.0"
[[package]]
name = "packaging"
version = "23.0"
version = "24.2"
description = "Core utilities for Python packages"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.8"
files = [
{file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"},
{file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"},
{file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"},
{file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"},
]
[[package]]

View File

@@ -1,3 +1,5 @@
use std::fmt;
use async_trait::async_trait;
use postgres_client::config::SslMode;
use pq_proto::BeMessage as Be;
@@ -5,15 +7,19 @@ use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, info_span};
use super::ComputeCredentialKeys;
use super::{ComputeCredentialKeys, ControlPlaneApi};
use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo};
use crate::auth::IpPattern;
use crate::cache::Cached;
use crate::config::AuthenticationConfig;
use crate::context::RequestContext;
use crate::control_plane::client::cplane_proxy_v1;
use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
use crate::error::{ReportableError, UserFacingError};
use crate::proxy::connect_compute::ComputeConnectBackend;
use crate::proxy::NeonOptions;
use crate::stream::PqStream;
use crate::types::RoleName;
use crate::{auth, compute, waiters};
#[derive(Debug, Error)]
@@ -31,6 +37,13 @@ pub(crate) enum ConsoleRedirectError {
#[derive(Debug)]
pub struct ConsoleRedirectBackend {
console_uri: reqwest::Url,
api: cplane_proxy_v1::NeonControlPlaneClient,
}
impl fmt::Debug for cplane_proxy_v1::NeonControlPlaneClient {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "NeonControlPlaneClient")
}
}
impl UserFacingError for ConsoleRedirectError {
@@ -71,9 +84,24 @@ pub(crate) fn new_psql_session_id() -> String {
hex::encode(rand::random::<[u8; 8]>())
}
#[async_trait]
impl BackendIpAllowlist for ConsoleRedirectBackend {
async fn get_allowed_ips(
&self,
ctx: &RequestContext,
user_info: &ComputeUserInfo,
) -> auth::Result<Vec<auth::IpPattern>> {
self.api
.get_allowed_ips_and_secret(ctx, user_info)
.await
.map(|(ips, _)| ips.as_ref().clone())
.map_err(|e| e.into())
}
}
impl ConsoleRedirectBackend {
pub fn new(console_uri: reqwest::Url) -> Self {
Self { console_uri }
pub fn new(console_uri: reqwest::Url, api: cplane_proxy_v1::NeonControlPlaneClient) -> Self {
Self { console_uri, api }
}
pub(crate) async fn authenticate(
@@ -81,10 +109,16 @@ impl ConsoleRedirectBackend {
ctx: &RequestContext,
auth_config: &'static AuthenticationConfig,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<(ConsoleRedirectNodeInfo, Option<Vec<IpPattern>>)> {
) -> auth::Result<(
ConsoleRedirectNodeInfo,
ComputeUserInfo,
Option<Vec<IpPattern>>,
)> {
authenticate(ctx, auth_config, &self.console_uri, client)
.await
.map(|(node_info, ip_allowlist)| (ConsoleRedirectNodeInfo(node_info), ip_allowlist))
.map(|(node_info, user_info, ip_allowlist)| {
(ConsoleRedirectNodeInfo(node_info), user_info, ip_allowlist)
})
}
}
@@ -109,7 +143,7 @@ async fn authenticate(
auth_config: &'static AuthenticationConfig,
link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<(NodeInfo, Option<Vec<IpPattern>>)> {
) -> auth::Result<(NodeInfo, ComputeUserInfo, Option<Vec<IpPattern>>)> {
ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect);
// registering waiter can fail if we get unlucky with rng.
@@ -164,8 +198,15 @@ async fn authenticate(
let mut config = compute::ConnCfg::new(db_info.host.to_string(), db_info.port);
config.dbname(&db_info.dbname).user(&db_info.user);
let user: RoleName = db_info.user.into();
let user_info = ComputeUserInfo {
endpoint: db_info.aux.endpoint_id.as_str().into(),
user: user.clone(),
options: NeonOptions::default(),
};
ctx.set_dbname(db_info.dbname.into());
ctx.set_user(db_info.user.into());
ctx.set_user(user);
ctx.set_project(db_info.aux.clone());
info!("woken up a compute node");
@@ -188,6 +229,7 @@ async fn authenticate(
config,
aux: db_info.aux,
},
user_info,
db_info.allowed_ips,
))
}

View File

@@ -16,7 +16,9 @@ use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{debug, info, warn};
use crate::auth::credentials::check_peer_addr_is_in_list;
use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint};
use crate::auth::{
self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint, IpPattern,
};
use crate::cache::Cached;
use crate::config::AuthenticationConfig;
use crate::context::RequestContext;
@@ -131,7 +133,7 @@ pub(crate) struct ComputeUserInfoNoEndpoint {
pub(crate) options: NeonOptions,
}
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Default)]
pub(crate) struct ComputeUserInfo {
pub(crate) endpoint: EndpointId,
pub(crate) user: RoleName,
@@ -244,6 +246,15 @@ impl AuthenticationConfig {
}
}
#[async_trait::async_trait]
pub(crate) trait BackendIpAllowlist {
async fn get_allowed_ips(
&self,
ctx: &RequestContext,
user_info: &ComputeUserInfo,
) -> auth::Result<Vec<auth::IpPattern>>;
}
/// True to its name, this function encapsulates our current auth trade-offs.
/// Here, we choose the appropriate auth flow based on circumstances.
///
@@ -256,7 +267,7 @@ async fn auth_quirks(
allow_cleartext: bool,
config: &'static AuthenticationConfig,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> auth::Result<ComputeCredentials> {
) -> auth::Result<(ComputeCredentials, Option<Vec<IpPattern>>)> {
// If there's no project so far, that entails that client doesn't
// support SNI or other means of passing the endpoint (project) name.
// We now expect to see a very specific payload in the place of password.
@@ -315,7 +326,7 @@ async fn auth_quirks(
)
.await
{
Ok(keys) => Ok(keys),
Ok(keys) => Ok((keys, Some(allowed_ips.as_ref().clone()))),
Err(e) => {
if e.is_password_failed() {
// The password could have been changed, so we invalidate the cache.
@@ -385,7 +396,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
allow_cleartext: bool,
config: &'static AuthenticationConfig,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> auth::Result<Backend<'a, ComputeCredentials>> {
) -> auth::Result<(Backend<'a, ComputeCredentials>, Option<Vec<IpPattern>>)> {
let res = match self {
Self::ControlPlane(api, user_info) => {
debug!(
@@ -394,7 +405,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
"performing authentication using the console"
);
let credentials = auth_quirks(
let (credentials, ip_allowlist) = auth_quirks(
ctx,
&*api,
user_info,
@@ -404,7 +415,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
endpoint_rate_limiter,
)
.await?;
Backend::ControlPlane(api, credentials)
Ok((Backend::ControlPlane(api, credentials), ip_allowlist))
}
Self::Local(_) => {
return Err(auth::AuthError::bad_auth_method("invalid for local proxy"))
@@ -413,7 +424,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
// TODO: replace with some metric
info!("user successfully authenticated");
Ok(res)
res
}
}
@@ -441,6 +452,24 @@ impl Backend<'_, ComputeUserInfo> {
}
}
#[async_trait::async_trait]
impl BackendIpAllowlist for Backend<'_, ()> {
async fn get_allowed_ips(
&self,
ctx: &RequestContext,
user_info: &ComputeUserInfo,
) -> auth::Result<Vec<auth::IpPattern>> {
let auth_data = match self {
Self::ControlPlane(api, ()) => api.get_allowed_ips_and_secret(ctx, user_info).await,
Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
};
auth_data
.map(|(ips, _)| ips.as_ref().clone())
.map_err(|e| e.into())
}
}
#[async_trait::async_trait]
impl ComputeConnectBackend for Backend<'_, ComputeCredentials> {
async fn wake_compute(
@@ -786,7 +815,7 @@ mod tests {
.await
.unwrap();
assert_eq!(creds.info.endpoint, "my-endpoint");
assert_eq!(creds.0.info.endpoint, "my-endpoint");
handle.await.unwrap();
}

View File

@@ -744,9 +744,59 @@ fn build_auth_backend(
}
AuthBackendType::ConsoleRedirect => {
let url = args.uri.parse()?;
let backend = ConsoleRedirectBackend::new(url);
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
let project_info_cache_config: ProjectInfoCacheOptions =
args.project_info_cache.parse()?;
let endpoint_cache_config: config::EndpointCacheConfig =
args.endpoint_cache_config.parse()?;
info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
info!(
"Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
);
info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
wake_compute_cache_config,
project_info_cache_config,
endpoint_cache_config,
)));
let config::ConcurrencyLockOptions {
shards,
limiter,
epoch,
timeout,
} = args.wake_compute_lock.parse()?;
info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new(
"wake_compute_lock",
limiter,
shards,
timeout,
epoch,
&Metrics::get().wake_compute_lock,
)?));
let url = args.uri.clone().parse()?;
let ep_url: proxy::url::ApiUrl = args.auth_endpoint.parse()?;
let endpoint = http::Endpoint::new(ep_url, http::new_client());
let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
let wake_compute_endpoint_rate_limiter =
Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
// Since we only use get_allowed_ips_and_secret(), the wake_compute_endpoint_rate_limiter
// and locks are not used in ConsoleRedirectBackend,
// but they are required by the NeonControlPlaneClient
let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new(
endpoint,
args.control_plane_token.clone(),
caches,
locks,
wake_compute_endpoint_rate_limiter,
);
let backend = ConsoleRedirectBackend::new(url, api);
let config = Box::leak(Box::new(backend));
Ok(Either::Right(config))

View File

@@ -12,8 +12,10 @@ use tokio::sync::Mutex;
use tracing::{debug, info};
use uuid::Uuid;
use crate::auth::{check_peer_addr_is_in_list, IpPattern};
use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo};
use crate::auth::{check_peer_addr_is_in_list, AuthError, IpPattern};
use crate::config::ComputeConfig;
use crate::context::RequestContext;
use crate::error::ReportableError;
use crate::ext::LockExt;
use crate::metrics::{CancellationRequest, CancellationSource, Metrics};
@@ -56,6 +58,9 @@ pub(crate) enum CancelError {
#[error("IP is not allowed")]
IpNotAllowed,
#[error("Authentication backend error")]
AuthError(#[from] AuthError),
}
impl ReportableError for CancelError {
@@ -68,6 +73,7 @@ impl ReportableError for CancelError {
CancelError::Postgres(_) => crate::error::ErrorKind::Compute,
CancelError::RateLimit => crate::error::ErrorKind::RateLimit,
CancelError::IpNotAllowed => crate::error::ErrorKind::User,
CancelError::AuthError(_) => crate::error::ErrorKind::ControlPlane,
}
}
}
@@ -102,10 +108,7 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
}
}
/// Try to cancel a running query for the corresponding connection.
/// If the cancellation key is not found, it will be published to Redis.
/// check_allowed - if true, check if the IP is allowed to cancel the query
/// return Result primarily for tests
/// Only used for notification-driven cancellation; will be removed
pub(crate) async fn cancel_session(
&self,
key: CancelKeyData,
@@ -134,7 +137,8 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
}
// NB: we should immediately release the lock after cloning the token.
let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
let cancel_state = self.map.get(&key).and_then(|x| x.clone());
let Some(cancel_closure) = cancel_state else {
tracing::warn!("query cancellation key not found: {key}");
Metrics::get()
.proxy
@@ -185,6 +189,96 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
cancel_closure.try_cancel_query(self.compute_config).await
}
/// Try to cancel a running query for the corresponding connection.
/// If the cancellation key is not found, it will be published to Redis.
/// check_allowed - if true, check if the IP is allowed to cancel the query.
/// Will fetch IP allowlist internally.
///
/// Returns a Result primarily for tests
pub(crate) async fn cancel_session_auth<T: BackendIpAllowlist>(
&self,
key: CancelKeyData,
ctx: RequestContext,
check_allowed: bool,
auth_backend: &T,
) -> Result<(), CancelError> {
// TODO: checking for an unspecified address is only for backward compatibility; should be removed
if !ctx.peer_addr().is_unspecified() {
let subnet_key = match ctx.peer_addr() {
IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use default mask here
IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()),
};
if !self.limiter.lock_propagate_poison().check(subnet_key, 1) {
// log only the subnet part of the IP address to know which subnet is rate limited
tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}");
Metrics::get()
.proxy
.cancellation_requests_total
.inc(CancellationRequest {
source: self.from,
kind: crate::metrics::CancellationOutcome::RateLimitExceeded,
});
return Err(CancelError::RateLimit);
}
}
// NB: we should immediately release the lock after cloning the token.
let cancel_state = self.map.get(&key).and_then(|x| x.clone());
let Some(cancel_closure) = cancel_state else {
tracing::warn!("query cancellation key not found: {key}");
Metrics::get()
.proxy
.cancellation_requests_total
.inc(CancellationRequest {
source: self.from,
kind: crate::metrics::CancellationOutcome::NotFound,
});
if ctx.session_id() == Uuid::nil() {
// was already published, do not publish it again
return Ok(());
}
match self
.client
.try_publish(key, ctx.session_id(), ctx.peer_addr())
.await
{
Ok(()) => {} // do nothing
Err(e) => {
// log it here since cancel_session could be spawned in a task
tracing::error!("failed to publish cancellation key: {key}, error: {e}");
return Err(CancelError::IO(std::io::Error::new(
std::io::ErrorKind::Other,
e.to_string(),
)));
}
}
return Ok(());
};
let ip_allowlist = auth_backend
.get_allowed_ips(&ctx, &cancel_closure.user_info)
.await
.map_err(CancelError::AuthError)?;
if check_allowed && !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) {
// log it here since cancel_session could be spawned in a task
tracing::warn!("IP is not allowed to cancel the query: {key}");
return Err(CancelError::IpNotAllowed);
}
Metrics::get()
.proxy
.cancellation_requests_total
.inc(CancellationRequest {
source: self.from,
kind: crate::metrics::CancellationOutcome::Found,
});
info!("cancelling query per user's request using key {key}");
cancel_closure.try_cancel_query(self.compute_config).await
}
#[cfg(test)]
fn contains(&self, session: &Session<P>) -> bool {
self.map.contains_key(&session.key)
@@ -248,6 +342,7 @@ pub struct CancelClosure {
cancel_token: CancelToken,
ip_allowlist: Vec<IpPattern>,
hostname: String, // for pg_sni router
user_info: ComputeUserInfo,
}
impl CancelClosure {
@@ -256,12 +351,14 @@ impl CancelClosure {
cancel_token: CancelToken,
ip_allowlist: Vec<IpPattern>,
hostname: String,
user_info: ComputeUserInfo,
) -> Self {
Self {
socket_addr,
cancel_token,
ip_allowlist,
hostname,
user_info,
}
}
/// Cancels the query running on user's compute node.
@@ -288,6 +385,8 @@ impl CancelClosure {
debug!("query was cancelled");
Ok(())
}
/// Obsolete (will be removed after moving CancelMap to Redis), only for notifications
pub(crate) fn set_ip_allowlist(&mut self, ip_allowlist: Vec<IpPattern>) {
self.ip_allowlist = ip_allowlist;
}

View File

@@ -13,6 +13,7 @@ use thiserror::Error;
use tokio::net::TcpStream;
use tracing::{debug, error, info, warn};
use crate::auth::backend::ComputeUserInfo;
use crate::auth::parse_endpoint_param;
use crate::cancellation::CancelClosure;
use crate::config::ComputeConfig;
@@ -250,6 +251,7 @@ impl ConnCfg {
ctx: &RequestContext,
aux: MetricsAuxInfo,
config: &ComputeConfig,
user_info: ComputeUserInfo,
) -> Result<PostgresConnection, ConnectionError> {
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let (socket_addr, stream, host) = self.connect_raw(config.timeout).await?;
@@ -294,8 +296,9 @@ impl ConnCfg {
process_id,
secret_key,
},
vec![],
vec![], // TODO: deprecated, will be removed
host.to_string(),
user_info,
);
let connection = PostgresConnection {

View File

@@ -159,6 +159,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let request_gauge = metrics.connection_requests.guard(proto);
let tls = config.tls_config.as_ref();
let record_handshake_error = !ctx.has_private_peer_addr();
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
let do_handshake = handshake(ctx, stream, tls, record_handshake_error);
@@ -171,23 +172,20 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
// spawn a task to cancel the session, but don't wait for it
cancellations.spawn({
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
let session_id = ctx.session_id();
let peer_ip = ctx.peer_addr();
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id);
let ctx = ctx.clone();
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id());
cancel_span.follows_from(tracing::Span::current());
async move {
drop(
cancellation_handler_clone
.cancel_session(
cancel_key_data,
session_id,
peer_ip,
config.authentication_config.ip_allowlist_check_enabled,
)
.instrument(cancel_span)
.await,
);
}
cancellation_handler_clone
.cancel_session_auth(
cancel_key_data,
ctx,
config.authentication_config.ip_allowlist_check_enabled,
backend,
)
.await
.inspect_err(|e| debug!(error = ?e, "cancel_session failed")).ok();
}.instrument(cancel_span)
});
return Ok(None);
@@ -197,7 +195,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
ctx.set_db_options(params.clone());
let (user_info, ip_allowlist) = match backend
let (node_info, user_info, ip_allowlist) = match backend
.authenticate(ctx, &config.authentication_config, &mut stream)
.await
{
@@ -210,11 +208,12 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let mut node = connect_to_compute(
ctx,
&TcpMechanism {
user_info,
params_compat: true,
params: &params,
locks: &config.connect_compute_locks,
},
&user_info,
&node_info,
config.wake_compute_retry_config,
&config.connect_to_compute,
)

View File

@@ -29,7 +29,7 @@ use crate::rate_limiter::WakeComputeRateLimiter;
use crate::types::{EndpointCacheKey, EndpointId};
use crate::{compute, http, scram};
const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
pub(crate) const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
#[derive(Clone)]
pub struct NeonControlPlaneClient {
@@ -78,15 +78,30 @@ impl NeonControlPlaneClient {
info!("endpoint is not valid, skipping the request");
return Ok(AuthInfo::default());
}
let request_id = ctx.session_id().to_string();
let application_name = ctx.console_application_name();
self.do_get_auth_req(user_info, &ctx.session_id(), Some(ctx))
.await
}
async fn do_get_auth_req(
&self,
user_info: &ComputeUserInfo,
session_id: &uuid::Uuid,
ctx: Option<&RequestContext>,
) -> Result<AuthInfo, GetAuthInfoError> {
let request_id: String = session_id.to_string();
let application_name = if let Some(ctx) = ctx {
ctx.console_application_name()
} else {
"auth_cancellation".to_string()
};
async {
let request = self
.endpoint
.get_path("get_endpoint_access_control")
.header(X_REQUEST_ID, &request_id)
.header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
.query(&[("session_id", ctx.session_id())])
.query(&[("session_id", session_id)])
.query(&[
("application_name", application_name.as_str()),
("endpointish", user_info.endpoint.as_str()),
@@ -96,9 +111,16 @@ impl NeonControlPlaneClient {
debug!(url = request.url().as_str(), "sending http request");
let start = Instant::now();
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
let response = self.endpoint.execute(request).await?;
drop(pause);
let response = match ctx {
Some(ctx) => {
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
let rsp = self.endpoint.execute(request).await;
drop(pause);
rsp?
}
None => self.endpoint.execute(request).await?,
};
info!(duration = ?start.elapsed(), "received http response");
let body = match parse_body::<GetEndpointAccessControl>(response).await {
Ok(body) => body,

View File

@@ -74,8 +74,11 @@ impl NodeInfo {
&self,
ctx: &RequestContext,
config: &ComputeConfig,
user_info: ComputeUserInfo,
) -> Result<compute::PostgresConnection, compute::ConnectionError> {
self.config.connect(ctx, self.aux.clone(), config).await
self.config
.connect(ctx, self.aux.clone(), config, user_info)
.await
}
pub(crate) fn reuse_settings(&mut self, other: Self) {

View File

@@ -4,7 +4,7 @@ use tokio::time;
use tracing::{debug, info, warn};
use super::retry::ShouldRetryWakeCompute;
use crate::auth::backend::ComputeCredentialKeys;
use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo};
use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT};
use crate::config::{ComputeConfig, RetryConfig};
use crate::context::RequestContext;
@@ -71,6 +71,8 @@ pub(crate) struct TcpMechanism<'a> {
/// connect_to_compute concurrency lock
pub(crate) locks: &'static ApiLocks<Host>,
pub(crate) user_info: ComputeUserInfo,
}
#[async_trait]
@@ -88,7 +90,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
) -> Result<PostgresConnection, Self::Error> {
let host = node_info.config.get_host();
let permit = self.locks.get_permit(&host).await?;
permit.release_result(node_info.connect(ctx, config).await)
permit.release_result(node_info.connect(ctx, config, self.user_info.clone()).await)
}
fn update_connect_config(&self, config: &mut compute::ConnCfg) {

View File

@@ -273,23 +273,20 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
// spawn a task to cancel the session, but don't wait for it
cancellations.spawn({
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
let session_id = ctx.session_id();
let peer_ip = ctx.peer_addr();
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id);
let ctx = ctx.clone();
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id());
cancel_span.follows_from(tracing::Span::current());
async move {
drop(
cancellation_handler_clone
.cancel_session(
cancel_key_data,
session_id,
peer_ip,
config.authentication_config.ip_allowlist_check_enabled,
)
.instrument(cancel_span)
.await,
);
}
cancellation_handler_clone
.cancel_session_auth(
cancel_key_data,
ctx,
config.authentication_config.ip_allowlist_check_enabled,
auth_backend,
)
.await
.inspect_err(|e| debug!(error = ?e, "cancel_session failed")).ok();
}.instrument(cancel_span)
});
return Ok(None);
@@ -315,7 +312,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
};
let user = user_info.get_user().to_owned();
let user_info = match user_info
let (user_info, ip_allowlist) = match user_info
.authenticate(
ctx,
&mut stream,
@@ -335,16 +332,19 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
}
};
let params_compat = match &user_info {
auth::Backend::ControlPlane(_, info) => {
info.info.options.get(NeonOptions::PARAMS_COMPAT).is_some()
}
auth::Backend::Local(_) => false,
let compute_user_info = match &user_info {
auth::Backend::ControlPlane(_, info) => &info.info,
auth::Backend::Local(_) => unreachable!("local proxy does not run tcp proxy service"),
};
let params_compat = compute_user_info
.options
.get(NeonOptions::PARAMS_COMPAT)
.is_some();
let mut node = connect_to_compute(
ctx,
&TcpMechanism {
user_info: compute_user_info.clone(),
params_compat,
params: &params,
locks: &config.connect_compute_locks,
@@ -356,6 +356,8 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
.or_else(|e| stream.throw_error(e))
.await?;
node.cancel_closure
.set_ip_allowlist(ip_allowlist.unwrap_or_default());
let session = cancellation_handler.get_session();
prepare_client_connection(&node, &session, &mut stream).await?;

View File

@@ -37,7 +37,6 @@ struct NotificationHeader<'a> {
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
#[serde(tag = "topic", content = "data")]
// Message to contributors: Make sure to align these topic names with the list below.
pub(crate) enum Notification {
#[serde(
rename = "/allowed_ips_updated",
@@ -74,21 +73,13 @@ pub(crate) enum Notification {
PasswordUpdate { password_update: PasswordUpdate },
#[serde(rename = "/cancel_session")]
Cancel(CancelSession),
}
/// Returns true if the topic name given is a known topic that we can deserialize and action on.
/// Returns false otherwise.
fn known_topic(s: &str) -> bool {
// Message to contributors: Make sure to align these topic names with the enum above.
matches!(
s,
"/allowed_ips_updated"
| "/block_public_or_vpc_access_updated"
| "/allowed_vpc_endpoints_updated_for_org"
| "/allowed_vpc_endpoints_updated_for_projects"
| "/password_updated"
| "/cancel_session"
)
#[serde(
other,
deserialize_with = "deserialize_unknown_topic",
skip_serializing
)]
UnknownTopic,
}
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
@@ -136,6 +127,15 @@ where
serde_json::from_str(&s).map_err(<D::Error as serde::de::Error>::custom)
}
// https://github.com/serde-rs/serde/issues/1714
fn deserialize_unknown_topic<'de, D>(deserializer: D) -> Result<(), D::Error>
where
D: serde::Deserializer<'de>,
{
deserializer.deserialize_any(serde::de::IgnoredAny)?;
Ok(())
}
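For context on the workaround above: `#[serde(other)]` by itself only matches the bare unknown tag and fails when the unknown message still carries a `data` payload, so the custom deserializer drains that payload with `IgnoredAny` (the pattern from the linked serde issue). Below is a minimal self-contained sketch of the same pattern; the `Msg` enum and topic names are hypothetical, assuming serde and serde_json as dependencies:

use serde::Deserialize;

#[derive(Debug, Deserialize, PartialEq)]
#[serde(tag = "topic", content = "data")]
enum Msg {
    #[serde(rename = "/known")]
    Known { payload: String },
    // Catch-all for unknown topics; the custom deserializer consumes any
    // `data` content instead of erroring out (serde-rs/serde#1714).
    #[serde(other, deserialize_with = "ignore_content")]
    Unknown,
}

fn ignore_content<'de, D: serde::Deserializer<'de>>(d: D) -> Result<(), D::Error> {
    d.deserialize_any(serde::de::IgnoredAny)?;
    Ok(())
}

fn main() {
    let m: Msg = serde_json::from_str(r#"{"topic":"/nope","data":{"x":1}}"#).unwrap();
    assert_eq!(m, Msg::Unknown); // unknown topic accepted, payload ignored
}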
struct MessageHandler<C: ProjectInfoCache + Send + Sync + 'static> {
cache: Arc<C>,
cancellation_handler: Arc<CancellationHandler<()>>,
@@ -178,32 +178,29 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
let payload: String = msg.get_payload()?;
tracing::debug!(?payload, "received a message payload");
// For better error handling, we first parse the payload to extract the topic.
// If there's a topic we don't support, we can handle that error more gracefully.
let header: NotificationHeader = match serde_json::from_str(&payload) {
Ok(msg) => msg,
Err(e) => {
Metrics::get().proxy.redis_errors_total.inc(RedisErrors {
channel: msg.get_channel_name(),
});
tracing::error!("broken message: {e}");
let msg: Notification = match serde_json::from_str(&payload) {
Ok(Notification::UnknownTopic) => {
match serde_json::from_str::<NotificationHeader>(&payload) {
// don't update the metric for redis errors if it's just a topic we don't know about.
Ok(header) => tracing::warn!(topic = header.topic, "unknown topic"),
Err(e) => {
Metrics::get().proxy.redis_errors_total.inc(RedisErrors {
channel: msg.get_channel_name(),
});
tracing::error!("broken message: {e}");
}
};
return Ok(());
}
};
if !known_topic(header.topic) {
// don't update the metric for redis errors if it's just a topic we don't know about.
tracing::warn!(topic = header.topic, "unknown topic");
return Ok(());
}
let msg: Notification = match serde_json::from_str(&payload) {
Ok(msg) => msg,
Err(e) => {
Metrics::get().proxy.redis_errors_total.inc(RedisErrors {
channel: msg.get_channel_name(),
});
tracing::error!(topic = header.topic, "broken message: {e}");
match serde_json::from_str::<NotificationHeader>(&payload) {
Ok(header) => tracing::error!(topic = header.topic, "broken message: {e}"),
Err(_) => tracing::error!("broken message: {e}"),
};
return Ok(());
}
};
@@ -278,6 +275,8 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
invalidate_cache(cache, msg);
});
}
Notification::UnknownTopic => unreachable!(),
}
Ok(())
@@ -304,6 +303,7 @@ fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
Notification::AllowedVpcEndpointsUpdatedForProjects { .. } => {
// https://github.com/neondatabase/neon/pull/10073
}
Notification::UnknownTopic => unreachable!(),
}
}
@@ -471,4 +471,30 @@ mod tests {
Ok(())
}
#[test]
fn parse_unknown_topic() -> anyhow::Result<()> {
let with_data = json!({
"type": "message",
"topic": "/doesnotexist",
"data": {
"payload": "ignored"
},
"extra_fields": "something"
})
.to_string();
let result: Notification = serde_json::from_str(&with_data)?;
assert_eq!(result, Notification::UnknownTopic);
let without_data = json!({
"type": "message",
"topic": "/doesnotexist",
"extra_fields": "something"
})
.to_string();
let result: Notification = serde_json::from_str(&without_data)?;
assert_eq!(result, Notification::UnknownTopic);
Ok(())
}
}

View File

@@ -1,5 +1,5 @@
[toolchain]
channel = "1.83.0"
channel = "1.84.0"
profile = "default"
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
# https://rust-lang.github.io/rustup/concepts/profiles.html

View File

@@ -0,0 +1,41 @@
---- MODULE MCProposerAcceptorReconfig ----
EXTENDS TLC, ProposerAcceptorReconfig
\* Augments the spec with model checking constraints.
\* It slightly duplicates MCProposerAcceptorStatic, but we can't EXTENDS it
\* because it EXTENDS ProposerAcceptorStatic in turn. The duplication isn't big
\* anyway.
\* For model checking.
CONSTANTS
max_entries, \* model constraint: max log entries acceptor/proposer can hold
max_term, \* model constraint: max allowed term
max_generation \* model constraint: max config generation
ASSUME max_entries \in Nat /\ max_term \in Nat /\ max_generation \in Nat
\* Model space constraint.
StateConstraint == /\ \A p \in proposers:
/\ prop_state[p].term <= max_term
/\ Len(prop_state[p].wal) <= max_entries
/\ conf_store.generation <= max_generation
\* Sets of proposers and acceptors are symmetric because we don't take any
\* actions depending on some concrete proposer/acceptor (like IF p = p1 THEN
\* ...)
ProposerAcceptorSymmetry == Permutations(proposers) \union Permutations(acceptors)
\* enforce order of the vars in the error trace with ALIAS
\* Note that ALIAS is supported only since version 1.8.0, which is a pre-release
\* as of this writing.
Alias == [
prop_state |-> prop_state,
prop_conf |-> prop_conf,
acc_state |-> acc_state,
acc_conf |-> acc_conf,
committed |-> committed,
conf_store |-> conf_store
]
====

View File

@@ -3,6 +3,9 @@ EXTENDS TLC, ProposerAcceptorStatic
\* Augments the spec with model checking constraints.
\* Note that MCProposerAcceptorReconfig duplicates it and might need to
\* be updated as well.
\* For model checking.
CONSTANTS
max_entries, \* model constraint: max log entries acceptor/proposer can hold

View File

@@ -0,0 +1,350 @@
---- MODULE ProposerAcceptorReconfig ----
(*
Spec for https://github.com/neondatabase/neon/blob/538e2312a617c65d489d391892c70b2e4d7407b5/docs/rfcs/035-safekeeper-dynamic-membership-change.md
Simplifications:
- The ones inherited from ProposerAcceptorStatic.
- We don't model transient state of the configuration change driver process
(storage controller in the implementation). Its actions StartChange and FinishChange
are taken based on the persistent state of safekeepers and conf store. The
justification for that is the following: once a new configuration n is
created (e.g. with StartChange or FinishChange), any old configuration
change driver working on an older conf < n will never be able to commit
it to the conf store because the store is protected by CAS. The
propagation of these older confs is still possible though, and the
spec allows it to happen through acceptors.
Plus the model is already pretty huge.
- The previous point also means that the FinishChange action is
based only on the current state of safekeepers, not on past
state. That's ok because while an individual
acceptor's <last_log_term, flush_lsn> may go down,
the quorum maximum never does. So the FinishChange
condition, which collects the max over a quorum, can
only get stricter over time.
The invariants break, as expected, if any of FinishChange's
required conditions are removed.
*)
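To make the CAS argument above concrete, here is a rough in-memory sketch of a generation-guarded conf store (hypothetical Rust types, not the storage controller's real API): a driver may only install a configuration if the store still holds the generation it started from, so a stale driver's write is rejected.

#[derive(Clone, Debug, PartialEq)]
struct Config {
    generation: u64,
    members: Vec<&'static str>,
    new_members: Option<Vec<&'static str>>, // Some(_) while a change is in flight
}

struct ConfStore {
    conf: Config,
}

impl ConfStore {
    // Install `new` only if the stored generation still equals `expected_gen`.
    fn cas(&mut self, expected_gen: u64, new: Config) -> bool {
        if self.conf.generation == expected_gen {
            self.conf = new;
            true
        } else {
            false // a newer configuration won the race; this driver must abandon
        }
    }
}

fn main() {
    let mut store = ConfStore {
        conf: Config { generation: 1, members: vec!["a1"], new_members: None },
    };
    // A driver starts a change based on generation 1 (StartChange).
    assert!(store.cas(1, Config { generation: 2, members: vec!["a1"], new_members: Some(vec!["a2"]) }));
    // A stale driver still working from generation 1 is rejected by the CAS.
    assert!(!store.cas(1, Config { generation: 2, members: vec!["a1"], new_members: Some(vec!["a3"]) }));
}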
EXTENDS Integers, Sequences, FiniteSets, TLC
VARIABLES
\* state which is the same in the static spec
prop_state,
acc_state,
committed,
elected_history,
\* reconfiguration only state
prop_conf, \* prop_conf[p] is current configuration of proposer p
acc_conf, \* acc_conf[a] is current configuration of acceptor a
conf_store \* configuration in the configuration store.
CONSTANT
acceptors,
proposers
CONSTANT NULL
\* Import ProposerAcceptorStatic under PAS.
\*
\* Note that all vars and consts are named the same and thus substituted
\* implicitly.
PAS == INSTANCE ProposerAcceptorStatic
\********************************************************************************
\* Helpers
\********************************************************************************
\********************************************************************************
\* Type assertion
\********************************************************************************
\* Is c a valid config?
IsConfig(c) ==
/\ DOMAIN c = {"generation", "members", "newMembers"}
\* Unique id of the configuration.
/\ c.generation \in Nat
/\ c.members \in SUBSET acceptors
\* newMembers is NULL when it is not a joint conf.
/\ \/ c.newMembers = NULL
\/ c.newMembers \in SUBSET acceptors
TypeOk ==
/\ PAS!TypeOk
/\ \A p \in proposers: IsConfig(prop_conf[p])
/\ \A a \in acceptors: IsConfig(acc_conf[a])
/\ IsConfig(conf_store)
\********************************************************************************
\* Initial
\********************************************************************************
Init ==
/\ PAS!Init
/\ \E init_members \in SUBSET acceptors:
LET init_conf == [generation |-> 1, members |-> init_members, newMembers |-> NULL] IN
\* refer to RestartProposer for why it is not NULL
/\ prop_conf = [p \in proposers |-> init_conf]
/\ acc_conf = [a \in acceptors |-> init_conf]
/\ conf_store = init_conf
\* We could start with anything, but to reduce the state space start with
\* the most reasonable conf size of (total acceptors - 1), which e.g.
\* makes the basic {a1} -> {a2} change possible with acceptors {a1, a2} and
\* {a1, a2, a3} -> {a2, a3, a4} with acceptors {a1, a2, a3, a4}, even in
\* the smallest models with a single change.
/\ Cardinality(init_members) = Cardinality(acceptors) - 1
\********************************************************************************
\* Actions
\********************************************************************************
\* Proposer p loses all state, restarting. In the static spec we bump the restarted
\* proposer's term to the max of some quorum + 1, which is the minimal term that can
\* win an election. With reconfigurations it's harder to calculate such a term, so
\* keep it simple and take a random acceptor's term + 1.
\*
\* Also make the proposer adopt the configuration of a random acceptor. In the
\* impl the proposer starts with a NULL configuration until the handshake with the
\* first acceptor. Removing this NULL special case makes the spec a bit simpler.
RestartProposer(p) ==
/\ \E a \in acceptors: PAS!RestartProposerWithTerm(p, acc_state[a].term + 1)
/\ \E a \in acceptors: prop_conf' = [prop_conf EXCEPT ![p] = acc_conf[a]]
/\ UNCHANGED <<acc_conf, conf_store>>
\* Acceptor a immediately votes for proposer p.
Vote(p, a) ==
\* Configuration must be the same.
/\ prop_conf[p].generation = acc_conf[a].generation
\* And a is expected to be a member of it. This is likely redundant as long as
\* becoming leader checks membership (though vote also contributes to max
\* <term, lsn> calculation).
/\ \/ a \in prop_conf[p].members
\/ (prop_conf[p].newMembers /= NULL) /\ (a \in prop_conf[p].newMembers)
/\ PAS!Vote(p, a)
/\ UNCHANGED <<prop_conf, acc_conf, conf_store>>
\* Proposer p gets elected.
BecomeLeader(p) ==
/\ prop_state[p].state = "campaign"
\* Votes must form quorum in both sets (if the newMembers exists).
/\ PAS!FormsQuorum(DOMAIN prop_state[p].votes, prop_conf[p].members)
/\ \/ prop_conf[p].newMembers = NULL
\* TLA+ disjunction evaluation doesn't short-circuit for a good reason:
\* https://groups.google.com/g/tlaplus/c/U6tOJ4dsjVM/m/UdOznPCVBwAJ
\* so repeat the null check.
\/ (prop_conf[p].newMembers /= NULL) /\ (PAS!FormsQuorum(DOMAIN prop_state[p].votes, prop_conf[p].newMembers))
\* DoBecomeLeader will copy WAL of the highest voter to proposer's WAL, so
\* ensure its conf is still the same. In the impl WAL fetching also has to
\* check the configuration.
/\ prop_conf[p].generation = acc_conf[PAS!MaxVoteAcc(p)].generation
/\ \A a \in DOMAIN prop_state[p].votes: prop_conf[p].generation = acc_conf[a].generation
/\ PAS!DoBecomeLeader(p)
/\ UNCHANGED <<prop_conf, acc_conf, conf_store>>
UpdateTerm(p, a) ==
/\ PAS!UpdateTerm(p, a)
/\ UNCHANGED <<prop_conf, acc_conf, conf_store>>
TruncateWal(p, a) ==
/\ prop_state[p].state = "leader"
\* Configuration must be the same.
/\ prop_conf[p].generation = acc_conf[a].generation
/\ PAS!TruncateWal(p, a)
/\ UNCHANGED <<prop_conf, acc_conf, conf_store>>
NewEntry(p) ==
/\ PAS!NewEntry(p)
/\ UNCHANGED <<prop_conf, acc_conf, conf_store>>
AppendEntry(p, a) ==
/\ prop_state[p].state = "leader"
\* Configuration must be the same.
/\ prop_conf[p].generation = acc_conf[a].generation
\* And a is member of it. Ignoring this likely wouldn't hurt, but not useful
\* either.
/\ \/ a \in prop_conf[p].members
\/ (prop_conf[p].newMembers /= NULL) /\ (a \in prop_conf[p].newMembers)
/\ PAS!AppendEntry(p, a)
/\ UNCHANGED <<prop_conf, acc_conf, conf_store>>
\* see PAS!CommitEntries for comments.
CommitEntries(p) ==
/\ prop_state[p].state = "leader"
/\ \E q1 \in PAS!AllMinQuorums(prop_conf[p].members):
LET q1_commit_lsn == PAS!QuorumCommitLsn(p, q1) IN
\* Configuration must be the same.
/\ \A a \in q1: prop_conf[p].generation = acc_conf[a].generation
/\ q1_commit_lsn /= NULL
\* We must collect acks from both quorums, if newMembers is present.
/\ IF prop_conf[p].newMembers = NULL THEN
PAS!DoCommitEntries(p, q1_commit_lsn)
ELSE
\E q2 \in PAS!AllMinQuorums(prop_conf[p].newMembers):
LET q2_commit_lsn == PAS!QuorumCommitLsn(p, q2) IN
\* Configuration must be the same.
/\ \A a \in q2: prop_conf[p].generation = acc_conf[a].generation
/\ q2_commit_lsn /= NULL
/\ PAS!DoCommitEntries(p, PAS!Min(q1_commit_lsn, q2_commit_lsn))
/\ UNCHANGED <<prop_conf, acc_conf, conf_store>>
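Under a joint configuration an entry commits only once both member sets acknowledge it, so the commit LSN is the minimum of the two per-set quorum LSNs. A rough sketch of that rule over a snapshot of acked flush LSNs (hypothetical helper names, not the walproposer's real code):

use std::collections::HashMap;

// Highest LSN acknowledged by a majority of `members`, if a majority acked at all.
fn quorum_commit_lsn(acked: &HashMap<&str, u64>, members: &[&str]) -> Option<u64> {
    let mut lsns: Vec<u64> = members.iter().filter_map(|a| acked.get(*a).copied()).collect();
    let need = members.len() / 2 + 1;
    if lsns.len() < need {
        return None;
    }
    lsns.sort_unstable_by(|a, b| b.cmp(a)); // descending
    Some(lsns[need - 1]) // the need-th highest LSN is acked by a majority
}

fn commit_lsn(acked: &HashMap<&str, u64>, members: &[&str], new_members: Option<&[&str]>) -> Option<u64> {
    let old = quorum_commit_lsn(acked, members)?;
    match new_members {
        None => Some(old),
        // Joint conf: both sets must ack, take the stricter (smaller) LSN.
        Some(nm) => Some(old.min(quorum_commit_lsn(acked, nm)?)),
    }
}

fn main() {
    let acked = HashMap::from([("a1", 5), ("a2", 7), ("a3", 3), ("a4", 2)]);
    // {a1,a2,a3} alone commits up to 5; joint with {a2,a3,a4} drops it to 3.
    assert_eq!(commit_lsn(&acked, &["a1", "a2", "a3"], None), Some(5));
    assert_eq!(commit_lsn(&acked, &["a1", "a2", "a3"], Some(&["a2", "a3", "a4"])), Some(3));
}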
\* Proposer p adopts higher conf c from conf store or from some acceptor.
ProposerSwitchConf(p) ==
/\ \E c \in ({conf_store} \union {acc_conf[a]: a \in acceptors}):
\* p's conf is lower than c.
/\ (c.generation > prop_conf[p].generation)
\* We allow bumping the conf without restart only when the wp is already elected.
\* If it isn't, the votes it has already collected are from the previous
\* configuration and can't be used.
\*
\* So if the proposer is in 'campaign', in the impl we would restart, preserving
\* the conf and increasing the term. In the spec this transition is already covered
\* by the more generic RestartProposer, so we don't specify it here.
/\ prop_state[p].state = "leader"
/\ prop_conf' = [prop_conf EXCEPT ![p] = c]
/\ UNCHANGED <<prop_state, acc_state, committed, elected_history, acc_conf, conf_store>>
\* Do CAS on the conf store, starting change into the new_members conf.
StartChange(new_members) ==
\* Possible only if we don't already have the change in progress.
/\ conf_store.newMembers = NULL
\* Not necessary, but reduces space a bit.
/\ new_members /= conf_store.members
/\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.members, newMembers |-> new_members]
/\ UNCHANGED <<prop_state, acc_state, committed, elected_history, prop_conf, acc_conf>>
\* Acceptor's last_log_term.
AccLastLogTerm(acc) ==
PAS!LastLogTerm(PAS!AcceptorTermHistory(acc))
\* Do CAS on the conf store, transferring joint conf into the newMembers only.
FinishChange ==
\* have joint conf
/\ conf_store.newMembers /= NULL
\* The conditions for finishing the change are:
/\ \E qo \in PAS!AllMinQuorums(conf_store.members):
\* 1) Old majority must be aware of the joint conf.
\* Note: generally the driver can't know current acceptor
\* generation, it can only know that it once had been the
\* expected one, but it might have advanced since then.
\* But as explained at the top of the file if acceptor gen
\* advanced, FinishChange will never be able to complete
\* due to CAS anyway. We use strict equality here because
\* that's what makes sense conceptually (old driver should
\* abandon its attempt if it observes that conf has advanced).
/\ \A a \in qo: conf_store.generation = acc_conf[a].generation
\* 2) The new member set must have the log synced, i.e. some majority of it
\* needs to have <last_log_term, lsn> at least as high as the max of some
\* old majority.
\* 3) The term must be synced, i.e. some majority of the new set must
\* have term >= the max term of some old majority.
\* This ensures that two leaders are never elected with the same
\* term even after a config change (which would be bad unless we treat
\* the generation as part of the term, which we don't).
\* 4) A majority of the new set must be aware of the joint conf.
\* This allows safely destroying acceptor state if it is not a
\* member of its current conf (which is useful for cleanup after
\* migration as well as for aborts).
/\ LET sync_pos == PAS!MaxTermLsn({[term |-> AccLastLogTerm(a), lsn |-> PAS!FlushLsn(a)]: a \in qo})
sync_term == PAS!Maximum({acc_state[a].term: a \in qo})
IN
\E qn \in PAS!AllMinQuorums(conf_store.newMembers):
\A a \in qn:
/\ PAS!TermLsnGE([term |-> AccLastLogTerm(a), lsn |-> PAS!FlushLsn(a)], sync_pos)
/\ acc_state[a].term >= sync_term
\* The same note as above about strict equality applies here.
/\ conf_store.generation = acc_conf[a].generation
/\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.newMembers, newMembers |-> NULL]
/\ UNCHANGED <<prop_state, acc_state, committed, elected_history, prop_conf, acc_conf>>
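The four FinishChange conditions boil down to: pick one minimal quorum of the old set that is on the joint generation, take its max position and term as the sync targets, and require some minimal quorum of the new set to meet both targets while also being on the joint generation. A condensed sketch for one fixed pair of quorums (the spec quantifies existentially over all minimal quorums; the Rust types here are hypothetical):

#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Pos { term: u64, lsn: u64 } // derived Ord is lexicographic, i.e. TermLsnGE

struct Acc { generation: u64, term: u64, last_log_term: u64, flush_lsn: u64 }

// Conditions 1-4 of FinishChange, checked for one old quorum `old_q`
// and one new quorum `new_q` against the store's joint generation.
fn can_finish(store_gen: u64, old_q: &[Acc], new_q: &[Acc]) -> bool {
    // 1) the old majority is aware of the joint conf
    if !old_q.iter().all(|a| a.generation == store_gen) {
        return false;
    }
    let sync_pos = old_q.iter()
        .map(|a| Pos { term: a.last_log_term, lsn: a.flush_lsn })
        .max()
        .expect("non-empty quorum");
    let sync_term = old_q.iter().map(|a| a.term).max().expect("non-empty quorum");
    // 2) log synced, 3) term synced, 4) the new majority is aware of the joint conf
    new_q.iter().all(|a| {
        Pos { term: a.last_log_term, lsn: a.flush_lsn } >= sync_pos
            && a.term >= sync_term
            && a.generation == store_gen
    })
}

fn main() {
    let old_q = [Acc { generation: 5, term: 3, last_log_term: 3, flush_lsn: 10 }];
    let new_q = [Acc { generation: 5, term: 3, last_log_term: 3, flush_lsn: 10 }];
    assert!(can_finish(5, &old_q, &new_q));
    // A lagging new-set quorum (lower flush_lsn) blocks the switch.
    let lagging = [Acc { generation: 5, term: 3, last_log_term: 3, flush_lsn: 7 }];
    assert!(!can_finish(5, &old_q, &lagging));
}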
\* Do CAS on the conf store, aborting the change in progress.
AbortChange ==
\* have joint conf
/\ conf_store.newMembers /= NULL
/\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.members, newMembers |-> NULL]
/\ UNCHANGED <<prop_state, acc_state, committed, elected_history, prop_conf, acc_conf>>
\* Acceptor a switches to higher configuration from the conf store
\* or from some proposer.
AccSwitchConf(a) ==
/\ \E c \in ({conf_store} \union {prop_conf[p]: p \in proposers}):
/\ acc_conf[a].generation < c.generation
/\ acc_conf' = [acc_conf EXCEPT ![a] = c]
/\ UNCHANGED <<prop_state, acc_state, committed, elected_history, prop_conf, conf_store>>
\* Nuke all acceptor state if it is not a member of its current conf. Models
\* cleanup after migration/abort.
AccReset(a) ==
/\ \/ (acc_conf[a].newMembers = NULL) /\ (a \notin acc_conf[a].members)
\/ (acc_conf[a].newMembers /= NULL) /\ (a \notin (acc_conf[a].members \union acc_conf[a].newMembers))
/\ acc_state' = [acc_state EXCEPT ![a] = PAS!InitAcc]
\* Set nextSendLsn for `a` to NULL everywhere. nextSendLsn serves as a mark
\* that the elected proposer performed TruncateWal on the acceptor, which isn't
\* true anymore after the state reset. In the impl local deletion is expected to
\* terminate all existing connections.
/\ prop_state' = [p \in proposers |-> [prop_state[p] EXCEPT !.nextSendLsn[a] = NULL]]
/\ UNCHANGED <<committed, elected_history, prop_conf, acc_conf, conf_store>>
\*******************************************************************************
\* Final spec
\*******************************************************************************
Next ==
\/ \E p \in proposers: RestartProposer(p)
\/ \E p \in proposers: \E a \in acceptors: Vote(p, a)
\/ \E p \in proposers: BecomeLeader(p)
\/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a)
\/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a)
\/ \E p \in proposers: NewEntry(p)
\/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a)
\/ \E p \in proposers: CommitEntries(p)
\/ \E new_members \in SUBSET acceptors: StartChange(new_members)
\/ FinishChange
\/ AbortChange
\/ \E p \in proposers: ProposerSwitchConf(p)
\/ \E a \in acceptors: AccSwitchConf(a)
\/ \E a \in acceptors: AccReset(a)
Spec == Init /\ [][Next]_<<prop_state, acc_state, committed, elected_history, prop_conf, acc_conf, conf_store>>
\********************************************************************************
\* Invariants
\********************************************************************************
AllConfs ==
{conf_store} \union {prop_conf[p]: p \in proposers} \union {acc_conf[a]: a \in acceptors}
\* Fairly trivial (given the conf store) invariant that different configurations
\* with the same generation are never issued.
ConfigSafety ==
\A c1, c2 \in AllConfs:
(c1.generation = c2.generation) => (c1 = c2)
ElectionSafety == PAS!ElectionSafety
ElectionSafetyFull == PAS!ElectionSafetyFull
LogIsMonotonic == PAS!LogIsMonotonic
LogSafety == PAS!LogSafety
\********************************************************************************
\* Invariants which don't need to hold, but useful for playing/debugging.
\********************************************************************************
\* Check that we ever switch into a non-joint conf.
MaxAccConf == ~ \E a \in acceptors:
/\ acc_conf[a].generation = 3
/\ acc_conf[a].newMembers /= NULL
CommittedNotTruncated == PAS!CommittedNotTruncated
MaxTerm == PAS!MaxTerm
MaxStoreConf == conf_store.generation <= 1
MaxAccWalLen == PAS!MaxAccWalLen
MaxCommitLsn == PAS!MaxCommitLsn
====

View File

@@ -18,7 +18,7 @@
\* - old WAL is immediately copied to proposer on its election, without on-demand fetch later.
\* Some ideas how to break it to play around to get a feeling:
\* - replace Quorums with BadQuorums.
\* - replace Quorum with BadQuorum.
\* - remove 'don't commit entries from previous terms separately' rule in
\* CommitEntries and observe figure 8 from the raft paper.
\* With p2a3t4l4, a 32-step error was found in 1h on 80 cores.
@@ -69,16 +69,26 @@ Upsert(f, k, v, l(_)) ==
\*****************
NumAccs == Cardinality(acceptors)
\* Does the set of acceptors `acc_set` form a quorum in the member set `members`?
\* Acceptors not from `members` are excluded (matters only for reconfig).
FormsQuorum(acc_set, members) ==
Cardinality(acc_set \intersect members) >= (Cardinality(members) \div 2 + 1)
\* does acc_set form the quorum?
Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1)
\* all quorums of acceptors
Quorums == {subset \in SUBSET acceptors: Quorum(subset)}
\* Like FormsQuorum, but for minimal quorum.
FormsMinQuorum(acc_set, members) ==
Cardinality(acc_set \intersect members) = (Cardinality(members) \div 2 + 1)
\* For substituting Quorums and seeing what happens.
BadQuorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2)
BadQuorums == {subset \in SUBSET acceptors: BadQuorum(subset)}
\* All sets of acceptors forming quorums (resp. minimal quorums) in the member set `members`.
AllQuorums(members) == {subset \in SUBSET members: FormsQuorum(subset, members)}
AllMinQuorums(members) == {subset \in SUBSET acceptors: FormsMinQuorum(subset, members)}
\* For substituting Quorum and seeing what happens.
FormsBadQuorum(acc_set, members) ==
Cardinality(acc_set \intersect members) >= (Cardinality(members) \div 2)
FormsMinBadQuorum(acc_set, members) ==
Cardinality(acc_set \intersect members) = (Cardinality(members) \div 2)
AllBadQuorums(members) == {subset \in SUBSET acceptors: FormsBadQuorum(subset, members)}
AllMinBadQuorums(members) == {subset \in SUBSET acceptors: FormsMinBadQuorum(subset, members)}
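FormsQuorum deliberately counts only acceptors inside `members`, so votes from nodes outside the current member set can never help reach a majority. A small Rust sketch of the same counting rule (hypothetical set types, mirroring the operator above):

use std::collections::HashSet;

// |acc_set ∩ members| >= |members| / 2 + 1, as in FormsQuorum.
fn forms_quorum<'a>(acc_set: &HashSet<&'a str>, members: &HashSet<&'a str>) -> bool {
    let votes = acc_set.intersection(members).count();
    votes >= members.len() / 2 + 1
}

fn main() {
    let members: HashSet<_> = ["a1", "a2", "a3"].into();
    // Two members form a quorum out of three...
    assert!(forms_quorum(&["a1", "a2"].into(), &members));
    // ...but an outsider's vote does not count toward it.
    assert!(!forms_quorum(&["a1", "a4"].into(), &members));
}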
\* flushLsn (end of WAL, i.e. index of next entry) of acceptor a.
FlushLsn(a) == Len(acc_state[a].wal) + 1
@@ -135,10 +145,11 @@ TypeOk ==
/\ IsWal(prop_state[p].wal)
\* Map of acceptor -> next lsn to send. It is set when truncate_wal is
\* done so sending entries is allowed only after that. In the impl TCP
\* ensures this ordering.
\* ensures this ordering. We use NULL instead of a missing value so we can
\* use EXCEPT in AccReset.
/\ \A a \in DOMAIN prop_state[p].nextSendLsn:
/\ a \in acceptors
/\ prop_state[p].nextSendLsn[a] \in Lsns
/\ prop_state[p].nextSendLsn[a] \in Lsns \union {NULL}
/\ \A a \in acceptors:
/\ DOMAIN acc_state[a] = {"term", "termHistory", "wal"}
/\ acc_state[a].term \in Terms
@@ -167,6 +178,19 @@ TypeOk ==
\* Initial
\********************************************************************************
InitAcc ==
[
\* There will be no leader in term zero; 1 is the first
\* real one.
term |-> 0,
\* Again, a leader in term 0 doesn't exist, but we initialize
\* term histories with it to always have a common point in
\* them. Lsn is 1 because TLA+ sequences are indexed from 1
\* (we don't want to truncate WAL out of range).
termHistory |-> << [term |-> 0, lsn |-> 1] >>,
wal |-> << >>
]
Init ==
/\ prop_state = [p \in proposers |-> [
state |-> "campaign",
@@ -174,19 +198,9 @@ Init ==
votes |-> EmptyF,
termHistory |-> << >>,
wal |-> << >>,
nextSendLsn |-> EmptyF
nextSendLsn |-> [a \in acceptors |-> NULL]
]]
/\ acc_state = [a \in acceptors |-> [
\* There will be no leader in zero term, 1 is the first
\* real.
term |-> 0,
\* Again, leader in term 0 doesn't exist, but we initialize
\* term histories with it to always have common point in
\* them. Lsn is 1 because TLA+ sequences are indexed from 1
\* (we don't want to truncate WAL out of range).
termHistory |-> << [term |-> 0, lsn |-> 1] >>,
wal |-> << >>
]]
/\ acc_state = [a \in acceptors |-> InitAcc]
/\ committed = {}
/\ elected_history = EmptyF
@@ -195,23 +209,35 @@ Init ==
\* Actions
\********************************************************************************
\* Proposer loses all state.
RestartProposerWithTerm(p, new_term) ==
/\ prop_state' = [prop_state EXCEPT ![p].state = "campaign",
![p].term = new_term,
![p].votes = EmptyF,
![p].termHistory = << >>,
![p].wal = << >>,
![p].nextSendLsn = [a \in acceptors |-> NULL]]
/\ UNCHANGED <<acc_state, committed, elected_history>>
\* Proposer p loses all state, restarting.
\* For simplicity (and to reduce the state space), we assume it immediately gets
\* the current state from a quorum q of acceptors, determining the term it will
\* request votes for.
RestartProposer(p, q) ==
/\ Quorum(q)
/\ LET new_term == Maximum({acc_state[a].term : a \in q}) + 1 IN
/\ prop_state' = [prop_state EXCEPT ![p].state = "campaign",
![p].term = new_term,
![p].votes = EmptyF,
![p].termHistory = << >>,
![p].wal = << >>,
![p].nextSendLsn = EmptyF]
/\ UNCHANGED <<acc_state, committed, elected_history>>
RestartProposer(p) ==
\E q \in AllQuorums(acceptors):
LET new_term == Maximum({acc_state[a].term : a \in q}) + 1 IN
RestartProposerWithTerm(p, new_term)
\* Term history of acceptor a's WAL: the one saved truncated to contain only <=
\* local FlushLsn entries.
\* local FlushLsn entries. Note that FlushLsn is the end LSN of the last entry
\* (and the begin LSN of the next). The mental model for the non-strict comparison
\* is that once a proposer is elected it immediately writes a log record of zero
\* length. This allows the leader to commit the existing log without writing any new
\* entries. For example, assume acceptor has WAL
\* 1.1, 1.2
\* written by prop with term 1; its current <last_log_term, flush_lsn>
\* is <1, 3>. Now prop with term 2 and max vote from this acc is elected.
\* Once TruncateWAL is done, <last_log_term, flush_lsn> becomes <2, 3>
\* without any new records explicitly written.
AcceptorTermHistory(a) ==
SelectSeq(acc_state[a].termHistory, LAMBDA th_entry: th_entry.lsn <= FlushLsn(a))
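The zero-length-record mental model above can be checked mechanically: the acceptor's visible term history is its stored history truncated to entries whose start LSN is <= FlushLsn, so a new term entry at exactly FlushLsn becomes visible without any new WAL being written. A tiny sketch using illustrative (term, lsn) pairs only:

// Keep only term-history entries whose start LSN is <= flush_lsn,
// mirroring AcceptorTermHistory's SelectSeq with a non-strict comparison.
fn acceptor_term_history(th: &[(u64, u64)], flush_lsn: u64) -> Vec<(u64, u64)> {
    th.iter().copied().filter(|&(_term, lsn)| lsn <= flush_lsn).collect()
}

fn main() {
    // WAL holds entries 1.1 and 1.2, so FlushLsn = 3.
    let flush_lsn = 3;
    // After a term-2 proposer's TruncateWal, the stored history is <(0,1),(1,1),(2,3)>.
    let th = [(0u64, 1u64), (1, 1), (2, 3)];
    let visible = acceptor_term_history(&th, flush_lsn);
    // <last_log_term, flush_lsn> is now <2, 3> without any new record written.
    assert_eq!(visible.last(), Some(&(2, 3)));
}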
@@ -230,35 +256,52 @@ Vote(p, a) ==
\* Get lastLogTerm from term history th.
LastLogTerm(th) == th[Len(th)].term
\* Compares <term, lsn> pairs: returns true if tl1 >= tl2.
TermLsnGE(tl1, tl2) ==
/\ tl1.term >= tl2.term
/\ (tl1.term = tl2.term => tl1.lsn >= tl2.lsn)
\* Choose max <term, lsn> pair in the non empty set of them.
MaxTermLsn(term_lsn_set) ==
CHOOSE max_tl \in term_lsn_set: \A tl \in term_lsn_set: TermLsnGE(max_tl, tl)
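TermLsnGE is just a lexicographic comparison of <term, lsn> pairs, which matches how Rust orders tuples, and MaxTermLsn is then an ordinary maximum; a quick sanity check with illustrative values only:

fn main() {
    // (term, lsn) pairs order lexicographically, exactly like TermLsnGE.
    assert!((2u64, 1u64) >= (1, 9)); // a higher term wins regardless of lsn
    assert!((2u64, 5u64) >= (2, 3)); // equal terms fall back to the lsn
    // MaxTermLsn is then just the maximum over the set of pairs.
    let max = [(1u64, 9u64), (2, 3), (2, 5)].into_iter().max().unwrap();
    assert_eq!(max, (2, 5));
}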
\* Find acceptor with the highest <last_log_term, lsn> vote in proposer p's votes.
MaxVoteAcc(p) ==
CHOOSE a \in DOMAIN prop_state[p].votes:
LET a_vote == prop_state[p].votes[a]
a_vote_term_lsn == [term |-> LastLogTerm(a_vote.termHistory), lsn |-> a_vote.flushLsn]
vote_term_lsns == {[term |-> LastLogTerm(v.termHistory), lsn |-> v.flushLsn]: v \in Range(prop_state[p].votes)}
IN
a_vote_term_lsn = MaxTermLsn(vote_term_lsns)
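The winner selection above is just a lexicographic maximum; a hedged Rust sketch of the same ordering (field names are made up for illustration):

// A vote carries the acceptor's last_log_term and flush_lsn; the comparison
// is lexicographic, exactly as TermLsnGE: term first, then LSN.
#[derive(Clone, Copy)]
struct Vote {
    acceptor: char,
    last_log_term: u64,
    flush_lsn: u64,
}

// Mirror of MaxVoteAcc: pick a vote maximal under (last_log_term, flush_lsn).
fn max_vote(votes: &[Vote]) -> Vote {
    *votes
        .iter()
        .max_by_key(|v| (v.last_log_term, v.flush_lsn))
        .expect("the proposer has a quorum of votes")
}

fn main() {
    let votes = [
        Vote { acceptor: 'a', last_log_term: 1, flush_lsn: 5 },
        Vote { acceptor: 'b', last_log_term: 2, flush_lsn: 3 },
    ];
    // Term wins over LSN: 'b' is the donor despite its smaller flush LSN.
    assert_eq!(max_vote(&votes).acceptor, 'b');
}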
\* Workhorse for BecomeLeader.
\* Assumes the check that prop_state[p].votes forms a quorum has been done *outside*.
DoBecomeLeader(p) ==
LET
\* Find acceptor with the highest <last_log_term, lsn> vote.
max_vote_acc == MaxVoteAcc(p)
max_vote == prop_state[p].votes[max_vote_acc]
prop_th == Append(max_vote.termHistory, [term |-> prop_state[p].term, lsn |-> max_vote.flushLsn])
IN
\* We copy all of the log preceding the proposer's term from the max vote
\* node, so we make sure it is still on the same term as us. This is a model
\* simplification which can be removed; in the impl we fetch WAL on demand,
\* later, from a safekeeper which has it. Note though that in the case of
\* on-demand fetch we must check on the donor not only that the term matches,
\* but also that truncate_wal has already been done (if it is not max_vote_acc).
/\ acc_state[max_vote_acc].term = prop_state[p].term
/\ prop_state' = [prop_state EXCEPT ![p].state = "leader",
![p].termHistory = prop_th,
![p].wal = acc_state[max_vote_acc].wal
]
/\ elected_history' = Upsert(elected_history, prop_state[p].term, 1, LAMBDA c: c + 1)
/\ UNCHANGED <<acc_state, committed>>
\* Proposer p gets elected.
BecomeLeader(p) ==
/\ prop_state[p].state = "campaign"
/\ Quorum(DOMAIN prop_state[p].votes)
/\ LET
\* Find acceptor with the highest <last_log_term, lsn> vote.
max_vote_acc ==
CHOOSE a \in DOMAIN prop_state[p].votes:
LET v == prop_state[p].votes[a]
IN \A v2 \in Range(prop_state[p].votes):
/\ LastLogTerm(v.termHistory) >= LastLogTerm(v2.termHistory)
/\ (LastLogTerm(v.termHistory) = LastLogTerm(v2.termHistory) => v.flushLsn >= v2.flushLsn)
max_vote == prop_state[p].votes[max_vote_acc]
prop_th == Append(max_vote.termHistory, [term |-> prop_state[p].term, lsn |-> max_vote.flushLsn])
IN
\* We copy all of the log preceding the proposer's term from the max vote
\* node, so we make sure it is still on the same term as us. This is a model
\* simplification which can be removed; in the impl we fetch WAL on demand,
\* later, from a safekeeper which has it. Note though that in the case of
\* on-demand fetch we must check on the donor not only that the term matches,
\* but also that truncate_wal has already been done (if it is not max_vote_acc).
/\ acc_state[max_vote_acc].term = prop_state[p].term
/\ prop_state' = [prop_state EXCEPT ![p].state = "leader",
![p].termHistory = prop_th,
![p].wal = acc_state[max_vote_acc].wal
]
/\ elected_history' = Upsert(elected_history, prop_state[p].term, 1, LAMBDA c: c + 1)
/\ UNCHANGED <<acc_state, committed>>
/\ FormsQuorum(DOMAIN prop_state[p].votes, acceptors)
/\ DoBecomeLeader(p)
\* Acceptor a learns about elected proposer p's term. In the impl this
\* corresponds to the VoteRequest/VoteResponse exchange when a leader is
\* already elected and is not
@@ -287,10 +330,11 @@ FindHighestCommonPoint(prop_th, acc_th, acc_flush_lsn) ==
IN
[term |-> last_common_term, lsn |-> Min(acc_common_term_end, prop_common_term_end)]
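Only the tail of FindHighestCommonPoint is visible in this hunk; as a rough illustration of the divergence-point computation it appears to implement, here is a hypothetical Rust sketch over (term, start_lsn) histories that both begin with the (0, 1) sentinel (the proposer-side end bound is simplified to u64::MAX, an assumption):

// Walk the two term histories while their entries agree; the common point's
// term is the last agreeing entry's term, and its LSN is the minimum of where
// that term ends on each side (bounded by the acceptor's flush LSN).
fn find_highest_common_point(
    prop_th: &[(u64, u64)],
    acc_th: &[(u64, u64)],
    acc_flush_lsn: u64,
) -> (u64, u64) {
    let mut i = 0;
    while i + 1 < prop_th.len() && i + 1 < acc_th.len() && prop_th[i + 1] == acc_th[i + 1] {
        i += 1;
    }
    let last_common_term = prop_th[i].0;
    // End of the common term on each side: the start of the next entry, or
    // the flush LSN (acceptor) / an unbounded end (proposer) if it is last.
    let acc_end = acc_th.get(i + 1).map_or(acc_flush_lsn, |e| e.1);
    let prop_end = prop_th.get(i + 1).map_or(u64::MAX, |e| e.1);
    (last_common_term, acc_end.min(prop_end))
}

fn main() {
    // The acceptor flushed through LSN 4 in term 1, but the proposer's
    // history switches to term 2 at LSN 3: truncate back to <1, 3>.
    let prop = [(0, 1), (1, 1), (2, 3)];
    let acc = [(0, 1), (1, 1)];
    assert_eq!(find_highest_common_point(&prop, &acc, 4), (1, 3));
}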
\* Elected proposer p immediately truncates WAL (and term history) of acceptor a
\* before starting streaming. Establishes nextSendLsn for a.
\* Elected proposer p immediately truncates WAL (and sets term history) of
\* acceptor a before starting streaming. Establishes nextSendLsn for a.
\*
\* In the impl this happens at each reconnection; here we also allow doing it multiple times.
\* In the impl this happens at each reconnection; here we also allow doing it
\* multiple times.
TruncateWal(p, a) ==
/\ prop_state[p].state = "leader"
/\ acc_state[a].term = prop_state[p].term
@@ -321,8 +365,8 @@ NewEntry(p) ==
AppendEntry(p, a) ==
/\ prop_state[p].state = "leader"
/\ acc_state[a].term = prop_state[p].term
/\ a \in DOMAIN prop_state[p].nextSendLsn \* did TruncateWal
/\ prop_state[p].nextSendLsn[a] <= Len(prop_state[p].wal) \* have something to send
/\ prop_state[p].nextSendLsn[a] /= NULL \* did TruncateWal
/\ prop_state[p].nextSendLsn[a] <= Len(prop_state[p].wal) \* have something to send
/\ LET
send_lsn == prop_state[p].nextSendLsn[a]
entry == prop_state[p].wal[send_lsn]
@@ -337,41 +381,65 @@ AppendEntry(p, a) ==
PropStartLsn(p) ==
IF prop_state[p].state = "leader" THEN prop_state[p].termHistory[Len(prop_state[p].termHistory)].lsn ELSE NULL
\* Proposer p commits all entries it can using quorum q. Note that unlike
\* will62794/logless-reconfig this allows committing entries from previous terms
\* (when conditions for that are met).
CommitEntries(p, q) ==
/\ prop_state[p].state = "leader"
/\ \A a \in q:
\* The LSN which can be committed by proposer p using min quorum q (the check
\* that q forms a quorum must have been done outside). NULL if there is none.
QuorumCommitLsn(p, q) ==
IF
/\ prop_state[p].state = "leader"
/\ \A a \in q:
\* Without explicit responses to appends this ensures that append
\* up to FlushLsn has been accepted.
/\ acc_state[a].term = prop_state[p].term
\* nextSendLsn being set means TruncateWal has happened; it ensures the
\* acceptor's WAL (and FlushLsn) are from the proper proposer's history.
\* Alternatively we could compare LastLogTerm here, but this check is closer
\* to what we do in the impl (we check flushLsn in AppendResponse, but
\* AppendRequest is processed only if HandleElected handling was good).
/\ a \in DOMAIN prop_state[p].nextSendLsn
\* Now find the LSN present on the whole quorum.
/\ LET quorum_lsn == Minimum({FlushLsn(a): a \in q}) IN
\* This is the basic Raft rule of not committing entries from previous
\* terms except along with current term entry (commit them only when
\* quorum recovers, i.e. last_log_term on it reaches leader's term).
/\ quorum_lsn >= PropStartLsn(p)
/\ committed' = committed \cup {[term |-> prop_state[p].wal[lsn], lsn |-> lsn]: lsn \in 1..(quorum_lsn - 1)}
/\ UNCHANGED <<prop_state, acc_state, elected_history>>
/\ prop_state[p].nextSendLsn[a] /= NULL
THEN
\* Now find the LSN present on the whole quorum.
LET quorum_lsn == Minimum({FlushLsn(a): a \in q}) IN
\* This is the basic Raft rule of not committing entries from previous
\* terms except along with current term entry (commit them only when
\* quorum recovers, i.e. last_log_term on it reaches leader's term).
IF quorum_lsn >= PropStartLsn(p) THEN
quorum_lsn
ELSE
NULL
ELSE
NULL
\* Commit all entries on proposer p with record lsn < commit_lsn.
DoCommitEntries(p, commit_lsn) ==
/\ committed' = committed \cup {[term |-> prop_state[p].wal[lsn], lsn |-> lsn]: lsn \in 1..(commit_lsn - 1)}
/\ UNCHANGED <<prop_state, acc_state, elected_history>>
\* Proposer p commits all entries it can using some quorum. Note that unlike
\* will62794/logless-reconfig this allows committing entries from previous terms
\* (when conditions for that are met).
CommitEntries(p) ==
/\ prop_state[p].state = "leader"
\* Using min quorums here is better because 1) QuorumCommitLsn for
\* simplicity checks the min across all acceptors in q, and 2) it probably
\* makes evaluation faster.
/\ \E q \in AllMinQuorums(acceptors):
LET commit_lsn == QuorumCommitLsn(p, q) IN
/\ commit_lsn /= NULL
/\ DoCommitEntries(p, commit_lsn)
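The commit arithmetic reduces to a minimum plus the Raft prior-term gate; a minimal Rust sketch, assuming per-acceptor flush LSNs for one min quorum that already passed the term and TruncateWal checks (names do not come from the spec's state):

// Mirror of QuorumCommitLsn's core: the commit candidate is the LSN present
// on every quorum member, and entries from previous terms are committed only
// once the quorum has caught up to the leader's own term start.
fn quorum_commit_lsn(quorum_flush_lsns: &[u64], prop_start_lsn: u64) -> Option<u64> {
    let quorum_lsn = *quorum_flush_lsns.iter().min()?;
    (quorum_lsn >= prop_start_lsn).then_some(quorum_lsn)
}

fn main() {
    // Quorum flushed up to 3 and 5; the leader's term starts at 3: commit to 3.
    assert_eq!(quorum_commit_lsn(&[3, 5], 3), Some(3));
    // Quorum min 2 is still below the term start: nothing commits yet.
    assert_eq!(quorum_commit_lsn(&[2, 5], 3), None);
}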
\*******************************************************************************
\* Final spec
\*******************************************************************************
Next ==
\/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q)
\/ \E p \in proposers: RestartProposer(p)
\/ \E p \in proposers: \E a \in acceptors: Vote(p, a)
\/ \E p \in proposers: BecomeLeader(p)
\/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a)
\/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a)
\/ \E p \in proposers: NewEntry(p)
\/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a)
\/ \E q \in Quorums: \E p \in proposers: CommitEntries(p, q)
\/ \E p \in proposers: CommitEntries(p)
Spec == Init /\ [][Next]_<<prop_state, acc_state, committed, elected_history>>

View File

@@ -2,6 +2,7 @@
# Usage: ./modelcheck.sh <config_file> <spec_file>, e.g.
# ./modelcheck.sh models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg MCProposerAcceptorStatic.tla
# ./modelcheck.sh models/MCProposerAcceptorReconfig_p2_a3_t3_l3_c3.cfg MCProposerAcceptorReconfig.tla
CONFIG=$1
SPEC=$2
@@ -12,6 +13,7 @@ mkdir -p "tlc-results"
CONFIG_FILE=$(basename -- "$CONFIG")
outfilename="$SPEC-${CONFIG_FILE}-$(date --utc +%Y-%m-%d--%H-%M-%S)".log
outfile="tlc-results/$outfilename"
echo "saving results to $outfile"
touch $outfile
# Save some info about the run.
@@ -45,5 +47,6 @@ echo "" >> $outfile
# https://docs.tlapl.us/codebase:architecture#fingerprint_sets_fpsets
#
# Add -simulate to run in infinite simulation mode.
# -coverage 1 is useful for profiling (check how many times actions are taken).
java -Xmx$MEM -XX:MaxDirectMemorySize=$MEM -XX:+UseParallelGC -Dtlc2.tool.fp.FPSet.impl=tlc2.tool.fp.OffHeapDiskFPSet \
-cp "${TOOLSPATH}" tlc2.TLC $SPEC -config $CONFIG -workers auto -gzip | tee -a $outfile

View File

@@ -0,0 +1,21 @@
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2}
max_term = 2
max_entries = 2
max_generation = 3
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ConfigSafety
ElectionSafetyFull
LogIsMonotonic
LogSafety
\* As its comment explains, generally it is not expected to hold, but
\* in such a small model it is true.
CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias

View File

@@ -0,0 +1,19 @@
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2}
max_term = 2
max_entries = 2
max_generation = 5
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ConfigSafety
ElectionSafetyFull
LogIsMonotonic
LogSafety
CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias

View File

@@ -0,0 +1,20 @@
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2, a3}
max_term = 2
max_entries = 2
max_generation = 3
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ConfigSafety
ElectionSafetyFull
LogIsMonotonic
LogSafety
CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias

View File

@@ -0,0 +1,19 @@
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2, a3, a4}
max_term = 2
max_entries = 2
max_generation = 3
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafetyFull
LogIsMonotonic
LogSafety
CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias

View File

@@ -0,0 +1,25 @@
# Print all lines, but thin out lines starting with Progress:
# keep only the first 5 and the last 5, plus 1 of every 1440
# of the others (once a day).
# Also remove checkpointing logs.
{
lines[NR] = $0
}
$0 ~ /^Progress/ {
++pcount
}
END {
progress_idx = 0
for (i = 1; i <= NR; i++) {
if (lines[i] ~ /^Progress/) {
if (progress_idx < 5 || progress_idx >= pcount - 5 || progress_idx % 1440 == 0) {
print lines[i]
}
progress_idx++
}
else if (lines[i] ~ /^Checkpointing/) {}
else {
print lines[i]
}
}
}

View File

@@ -0,0 +1,3 @@
#!/bin/bash
awk -f remove_interm_progress.awk $1 > $1.thin

View File

@@ -0,0 +1,65 @@
git revision: 9e386917a
Platform: Linux neon-dev-arm64-1 6.8.0-49-generic #49-Ubuntu SMP PREEMPT_DYNAMIC Sun Nov 3 21:21:58 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux
CPU Info Linux: Neoverse-N1
CPU Cores Linux: 80
CPU Info Mac:
CPU Cores Mac:
Spec: MCProposerAcceptorReconfig.tla
Config: models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg
----
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2}
max_term = 2
max_entries = 2
max_generation = 3
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafetyFull
LogIsMonotonic
LogSafety
\* CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias
----
TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71)
Running breadth-first search Model-Checking with fp 99 and seed -9189733667206762985 with 35 workers on 80 cores with 27307MB heap and 30720MB offheap memory [pid: 391272] (Linux 6.8.0-49-generic aarch64, Ubuntu 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue).
Parsing file /home/arseny/neon2/safekeeper/spec/MCProposerAcceptorReconfig.tla
Parsing file /tmp/tlc-3211535543066978921/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla)
Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorReconfig.tla
Parsing file /tmp/tlc-3211535543066978921/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla)
Parsing file /tmp/tlc-3211535543066978921/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla)
Parsing file /tmp/tlc-3211535543066978921/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla)
Parsing file /tmp/tlc-3211535543066978921/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla)
Parsing file /tmp/tlc-3211535543066978921/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla)
Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorStatic.tla
Parsing file /tmp/tlc-3211535543066978921/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla)
Semantic processing of module Naturals
Semantic processing of module Sequences
Semantic processing of module FiniteSets
Semantic processing of module TLC
Semantic processing of module Integers
Semantic processing of module ProposerAcceptorStatic
Semantic processing of module ProposerAcceptorReconfig
Semantic processing of module TLCExt
Semantic processing of module _TLCTrace
Semantic processing of module MCProposerAcceptorReconfig
Starting... (2024-12-11 04:24:13)
Computing initial states...
Finished computing initial states: 2 states generated, with 1 of them distinct at 2024-12-11 04:24:15.
Progress(16) at 2024-12-11 04:24:18: 1,427,589 states generated (1,427,589 s/min), 142,472 distinct states found (142,472 ds/min), 47,162 states left on queue.
Model checking completed. No error has been found.
Estimates of the probability that TLC did not check all reachable states
because two distinct states had the same fingerprint:
calculated (optimistic): val = 1.0E-6
based on the actual fingerprints: val = 4.2E-8
17746857 states generated, 1121659 distinct states found, 0 states left on queue.
The depth of the complete state graph search is 37.
The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 9 and the 95th percentile is 3).
Finished in 33s at (2024-12-11 04:24:46)

View File

@@ -0,0 +1,64 @@
git revision: 9e386917a
Platform: Linux neon-dev-arm64-1 6.8.0-49-generic #49-Ubuntu SMP PREEMPT_DYNAMIC Sun Nov 3 21:21:58 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux
CPU Info Linux: Neoverse-N1
CPU Cores Linux: 80
CPU Info Mac:
CPU Cores Mac:
Spec: MCProposerAcceptorReconfig.tla
Config: models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg
----
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2}
max_term = 2
max_entries = 2
max_generation = 5
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafetyFull
LogIsMonotonic
LogSafety
\* CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias
----
TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71)
Running breadth-first search Model-Checking with fp 114 and seed -8099467489737745861 with 35 workers on 80 cores with 27307MB heap and 30720MB offheap memory [pid: 392020] (Linux 6.8.0-49-generic aarch64, Ubuntu 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue).
Parsing file /home/arseny/neon2/safekeeper/spec/MCProposerAcceptorReconfig.tla
Parsing file /tmp/tlc-11757875725969857497/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla)
Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorReconfig.tla
Parsing file /tmp/tlc-11757875725969857497/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla)
Parsing file /tmp/tlc-11757875725969857497/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla)
Parsing file /tmp/tlc-11757875725969857497/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla)
Parsing file /tmp/tlc-11757875725969857497/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla)
Parsing file /tmp/tlc-11757875725969857497/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla)
Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorStatic.tla
Parsing file /tmp/tlc-11757875725969857497/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla)
Semantic processing of module Naturals
Semantic processing of module Sequences
Semantic processing of module FiniteSets
Semantic processing of module TLC
Semantic processing of module Integers
Semantic processing of module ProposerAcceptorStatic
Semantic processing of module ProposerAcceptorReconfig
Semantic processing of module TLCExt
Semantic processing of module _TLCTrace
Semantic processing of module MCProposerAcceptorReconfig
Starting... (2024-12-11 04:26:12)
Computing initial states...
Finished computing initial states: 2 states generated, with 1 of them distinct at 2024-12-11 04:26:14.
Progress(14) at 2024-12-11 04:26:17: 1,519,385 states generated (1,519,385 s/min), 231,263 distinct states found (231,263 ds/min), 121,410 states left on queue.
Progress(20) at 2024-12-11 04:27:17: 42,757,204 states generated (41,237,819 s/min), 4,198,386 distinct states found (3,967,123 ds/min), 1,308,109 states left on queue.
Progress(22) at 2024-12-11 04:28:17: 83,613,929 states generated (40,856,725 s/min), 7,499,873 distinct states found (3,301,487 ds/min), 1,929,464 states left on queue.
Progress(23) at 2024-12-11 04:29:17: 124,086,758 states generated (40,472,829 s/min), 10,569,712 distinct states found (3,069,839 ds/min), 2,386,988 states left on queue.
Progress(24) at 2024-12-11 04:30:17: 163,412,538 states generated (39,325,780 s/min), 13,314,303 distinct states found (2,744,591 ds/min), 2,610,637 states left on queue.
Progress(25) at 2024-12-11 04:31:17: 202,643,708 states generated (39,231,170 s/min), 15,960,583 distinct states found (2,646,280 ds/min), 2,759,681 states left on queue.
Progress(26) at 2024-12-11 04:32:17: 240,681,633 states generated (38,037,925 s/min), 18,443,440 distinct states found (2,482,857 ds/min), 2,852,177 states left on queue.
Progress(27) at 2024-12-11 04:33:17: 278,559,134 states generated (37,877,501 s/min), 20,878,067 distinct states found (2,434,627 ds/min), 2,904,400 states left on queue.
Progress(28) at 2024-12-11 04:34:17: 316,699,911 states generated (38,140,777 s/min), 23,212,229 distinct states found (2,334,162 ds/min), 2,864,969 states left on queue.

View File

@@ -51,12 +51,10 @@ use utils::{
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
// TODO: disabled because concurrent CPU profiles cause seg faults. See:
// https://github.com/neondatabase/neon/issues/10225.
//#[allow(non_upper_case_globals)]
//#[export_name = "malloc_conf"]
//pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
#[allow(non_upper_case_globals)]
#[export_name = "malloc_conf"]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
const PID_FILE_NAME: &str = "safekeeper.pid";
const ID_FILE_NAME: &str = "safekeeper.id";

View File

@@ -1,7 +1,6 @@
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
use reqwest::{Method, Url};
use serde::{de::DeserializeOwned, Serialize};
use std::str::FromStr;
pub struct Client {
base_url: Url,
@@ -31,16 +30,11 @@ impl Client {
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let url = Url::from_str(&format!(
"http://{}:{}/{path}",
self.base_url.host_str().unwrap(),
self.base_url.port().unwrap()
))
.unwrap();
let mut builder = self.client.request(method, url);
let request_path = self
.base_url
.join(&path)
.expect("Failed to build request path");
let mut builder = self.client.request(method, request_path);
if let Some(body) = body {
builder = builder.json(&body)
}
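The replacement relies on Url::join's RFC 3986 resolution, where a trailing slash on the base decides whether the last segment is extended or replaced; a small sketch, assuming the url crate (the base address here is made up):

use url::Url;

fn main() {
    // With a trailing slash the base path is extended...
    let base = Url::parse("http://127.0.0.1:1234/upcall/v1/").unwrap();
    assert_eq!(
        base.join("timeline").unwrap().as_str(),
        "http://127.0.0.1:1234/upcall/v1/timeline"
    );
    // ...without one, join replaces the last segment, silently dropping v1.
    let no_slash = Url::parse("http://127.0.0.1:1234/upcall/v1").unwrap();
    assert_eq!(
        no_slash.join("timeline").unwrap().as_str(),
        "http://127.0.0.1:1234/upcall/timeline"
    );
}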

View File

@@ -0,0 +1 @@
ALTER TABLE safekeepers DROP scheduling_policy;

View File

@@ -0,0 +1 @@
ALTER TABLE safekeepers ADD scheduling_policy VARCHAR NOT NULL DEFAULT 'disabled';

View File

@@ -3,7 +3,7 @@ use crate::metrics::{
HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
METRICS_REGISTRY,
};
use crate::persistence::SafekeeperPersistence;
use crate::persistence::SafekeeperUpsert;
use crate::reconciler::ReconcileError;
use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT};
use anyhow::Context;
@@ -1249,7 +1249,7 @@ async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, Api
async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Infra)?;
let body = json_request::<SafekeeperPersistence>(&mut req).await?;
let body = json_request::<SafekeeperUpsert>(&mut req).await?;
let id = parse_request_param::<i64>(&req, "id")?;
if id != body.id {

View File

@@ -112,6 +112,14 @@ where
}
}
pub(crate) fn try_exclusive(&self, key: T, operation: I) -> Option<TracingExclusiveGuard<I>> {
let mut locked = self.entities.lock().unwrap();
let entry = locked.entry(key).or_default().clone();
let mut guard = TracingExclusiveGuard::new(entry.try_write_owned().ok()?);
*guard.guard = Some(operation);
Some(guard)
}
/// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do
/// periodic housekeeping to avoid the map growing indefinitely
pub(crate) fn housekeeping(&self) {
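try_exclusive builds on tokio's non-blocking try_write_owned; a minimal sketch of that pattern, assuming tokio with the sync feature (types are illustrative, not the storage controller's):

use std::sync::Arc;
use tokio::sync::{OwnedRwLockWriteGuard, RwLock};

fn try_exclusive(
    entry: Arc<RwLock<Option<String>>>,
    op: String,
) -> Option<OwnedRwLockWriteGuard<Option<String>>> {
    // Acquire the owned write guard only if the lock is free; otherwise
    // return None instead of waiting, like try_write_owned().ok()? above.
    let mut guard = entry.try_write_owned().ok()?;
    *guard = Some(op);
    Some(guard)
}

fn main() {
    let entry = Arc::new(RwLock::new(None));
    let held = try_exclusive(entry.clone(), "create".into());
    assert!(held.is_some());
    // A second taker observes the held lock and backs off without blocking.
    assert!(try_exclusive(entry.clone(), "delete".into()).is_none());
}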

Some files were not shown because too many files have changed in this diff.