Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-07 20:50:38 +00:00)

Compare commits: release-co...arpad/log_ (61 commits)
| SHA1 |
|---|
| 7d58647f5d |
| 783c66ea0b |
| ffeede085e |
| bdca5b500b |
| f4b03ddd7b |
| 08b19f001c |
| 1a45b2ec90 |
| 13e38a58a1 |
| 2edd59aefb |
| 0b639ba608 |
| 28f604d628 |
| fe0ddb7169 |
| 4bbabc092a |
| 12c26243fc |
| 2f71eda00f |
| 5ec82105cc |
| 78a6daa874 |
| 5c0de4ee8c |
| bc6a756f1c |
| 8f3351fa91 |
| e7d18bc188 |
| 4ee0da0a20 |
| 7049003cf7 |
| 3915995530 |
| 5ea0bb2d4f |
| aac1f8efb1 |
| 43dbded8c8 |
| c848b995b2 |
| 4dee2bfd82 |
| 09ff22a4d4 |
| 8223c1ba9d |
| 3dad4698ec |
| 81e7218c27 |
| a06c560ad0 |
| 477ab12b69 |
| f9b05a42d7 |
| 29d73e1404 |
| 8a042fb8ed |
| f72115d0a9 |
| 7458d031b1 |
| 38384c37ac |
| 2b2a547671 |
| 59e393aef3 |
| f51ed4a2c4 |
| 4f16ab3f56 |
| 18796fd1dd |
| 2f3fc7cb57 |
| e65d5f7369 |
| 55aef2993d |
| 1eef961f09 |
| fc10bb9438 |
| 4b5c75b52f |
| ca9d8761ff |
| b568189f7b |
| b94a5ce119 |
| 7ed4530618 |
| 3a44774227 |
| b2705cfee6 |
| 225267b3ae |
| d378726e38 |
| 436a117c15 |
@@ -33,6 +33,7 @@ workspace-members = [
"compute_api",
"consumption_metrics",
"desim",
"json",
"metrics",
"pageserver_api",
"postgres_backend",
.github/actionlint.yml (vendored): 1 line changed

@@ -7,6 +7,7 @@ self-hosted-runner:
- small-metal
- small-arm64
- unit-perf
- unit-perf-aws-arm
- us-east-2
config-variables:
- AWS_ECR_REGION
.github/workflows/build-macos.yml (vendored): 249 lines changed

@@ -32,162 +32,14 @@ permissions:
contents: read

jobs:
build-pgxn:
if: |
inputs.pg_versions != '[]' || inputs.rebuild_everything ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
timeout-minutes: 30
runs-on: macos-15
strategy:
matrix:
postgres-version: ${{ inputs.rebuild_everything && fromJSON('["v14", "v15", "v16", "v17"]') || fromJSON(inputs.pg_versions) }}
env:
# Use release build only, to have less debug info around
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
steps:
- name: Harden the runner (Audit all outbound calls)
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
with:
egress-policy: audit

- name: Checkout main repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Set pg ${{ matrix.postgres-version }} for caching
id: pg_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-${{ matrix.postgres-version }}) | tee -a "${GITHUB_OUTPUT}"

- name: Cache postgres ${{ matrix.postgres-version }} build
id: cache_pg
uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
with:
path: pg_install/${{ matrix.postgres-version }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ matrix.postgres-version }}-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

- name: Checkout submodule vendor/postgres-${{ matrix.postgres-version }}
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
git submodule init vendor/postgres-${{ matrix.postgres-version }}
git submodule update --depth 1 --recursive

- name: Install build dependencies
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
brew install flex bison openssl protobuf icu4c

- name: Set extra env for macOS
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV

- name: Build Postgres ${{ matrix.postgres-version }}
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
make postgres-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu)

- name: Build Neon Pg Ext ${{ matrix.postgres-version }}
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
make "neon-pg-ext-${{ matrix.postgres-version }}" -j$(sysctl -n hw.ncpu)

- name: Upload "pg_install/${{ matrix.postgres-version }}" artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: pg_install--${{ matrix.postgres-version }}
path: pg_install/${{ matrix.postgres-version }}
# The artifact is supposed to be used by the next job in the same workflow,
# so there’s no need to store it for too long.
retention-days: 1

build-walproposer-lib:
if: |
contains(inputs.pg_versions, 'v17') || inputs.rebuild_everything ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
timeout-minutes: 30
runs-on: macos-15
needs: [build-pgxn]
env:
# Use release build only, to have less debug info around
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
steps:
- name: Harden the runner (Audit all outbound calls)
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
with:
egress-policy: audit

- name: Checkout main repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Set pg v17 for caching
id: pg_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}"

- name: Download "pg_install/v17" artifact
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: pg_install--v17
path: pg_install/v17

# `actions/download-artifact` doesn't preserve permissions:
# https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss
- name: Make pg_install/v*/bin/* executable
run: |
chmod +x pg_install/v*/bin/*

- name: Cache walproposer-lib
id: cache_walproposer_lib
uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
with:
path: build/walproposer-lib
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

- name: Checkout submodule vendor/postgres-v17
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
run: |
git submodule init vendor/postgres-v17
git submodule update --depth 1 --recursive

- name: Install build dependencies
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
run: |
brew install flex bison openssl protobuf icu4c

- name: Set extra env for macOS
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV

- name: Build walproposer-lib (only for v17)
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
run:
make walproposer-lib -j$(sysctl -n hw.ncpu) PG_INSTALL_CACHED=1

- name: Upload "build/walproposer-lib" artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: build--walproposer-lib
path: build/walproposer-lib
# The artifact is supposed to be used by the next job in the same workflow,
# so there’s no need to store it for too long.
retention-days: 1

cargo-build:
make-all:
if: |
inputs.pg_versions != '[]' || inputs.rebuild_rust_code || inputs.rebuild_everything ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
timeout-minutes: 30
timeout-minutes: 60
runs-on: macos-15
needs: [build-pgxn, build-walproposer-lib]
env:
# Use release build only, to have less debug info around
# Hence keeping target/ (and general cache size) smaller

@@ -203,41 +55,53 @@ jobs:
with:
submodules: true

- name: Download "pg_install/v14" artifact
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: pg_install--v14
path: pg_install/v14

- name: Download "pg_install/v15" artifact
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: pg_install--v15
path: pg_install/v15

- name: Download "pg_install/v16" artifact
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: pg_install--v16
path: pg_install/v16

- name: Download "pg_install/v17" artifact
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: pg_install--v17
path: pg_install/v17

- name: Download "build/walproposer-lib" artifact
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: build--walproposer-lib
path: build/walproposer-lib

# `actions/download-artifact` doesn't preserve permissions:
# https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss
- name: Make pg_install/v*/bin/* executable
- name: Install build dependencies
run: |
chmod +x pg_install/v*/bin/*
brew install flex bison openssl protobuf icu4c

- name: Set extra env for macOS
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV

- name: Restore "pg_install/" cache
id: cache_pg
uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
with:
path: pg_install
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-install-v14-${{ hashFiles('Makefile', 'postgres.mk', 'vendor/revisions.json') }}

- name: Checkout vendor/postgres submodules
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
git submodule init
git submodule update --depth 1 --recursive

- name: Build Postgres
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
make postgres -j$(sysctl -n hw.ncpu)

# This isn't strictly necessary, but it makes the cached and non-cached builds more similar,
# When pg_install is restored from cache, there is no 'build/' directory. By removing it
# in a non-cached build too, we enforce that the rest of the steps don't depend on it,
# so that we notice any build caching bugs earlier.
- name: Remove build artifacts
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
rm -rf build

# Explicitly update the rust toolchain before running 'make'. The parallel make build can
# invoke 'cargo build' more than once in parallel, for different crates. That's OK, 'cargo'
# does its own locking to prevent concurrent builds from stepping on each other's
# toes. However, it will first try to update the toolchain, and that step is not locked the
# same way. To avoid two toolchain updates running in parallel and stepping on each other's
# toes, ensure that the toolchain is up-to-date beforehand.
- name: Update rust toolchain
run: |
rustup --version &&
rustup update &&
rustup show

- name: Cache cargo deps
uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3

@@ -249,17 +113,12 @@ jobs:
target
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust

- name: Install build dependencies
run: |
brew install flex bison openssl protobuf icu4c

- name: Set extra env for macOS
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV

- name: Run cargo build
run: cargo build --all --release -j$(sysctl -n hw.ncpu)
# Build the neon-specific postgres extensions, and all the Rust bits.
#
# Pass PG_INSTALL_CACHED=1 because PostgreSQL was already built and cached
# separately.
- name: Build all
run: PG_INSTALL_CACHED=1 BUILD_TYPE=release make -j$(sysctl -n hw.ncpu) all

- name: Check that no warnings are produced
run: ./run_clippy.sh
.github/workflows/build_and_test.yml (vendored): 23 lines changed

@@ -87,6 +87,24 @@ jobs:
uses: ./.github/workflows/build-build-tools-image.yml
secrets: inherit

lint-openapi-spec:
runs-on: ubuntu-22.04
needs: [ meta, check-permissions ]
# We do need to run this in `.*-rc-pr` because of hotfixes.
if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
steps:
- name: Harden the runner (Audit all outbound calls)
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
with:
egress-policy: audit
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- run: make lint-openapi-spec

check-codestyle-python:
needs: [ meta, check-permissions, build-build-tools-image ]
# No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.

@@ -306,14 +324,14 @@ jobs:
statuses: write
contents: write
pull-requests: write
runs-on: [ self-hosted, unit-perf ]
runs-on: [ self-hosted, unit-perf-aws-arm ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
credentials:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 --ulimit nofile=65536:65536 --security-opt seccomp=unconfined
strategy:
fail-fast: false
matrix:

@@ -986,6 +1004,7 @@ jobs:
- name: Verify docker-compose example and test extensions
timeout-minutes: 60
env:
PARALLEL_COMPUTES: 3
TAG: >-
${{
needs.meta.outputs.run-kind == 'compute-rc-pr'
.github/workflows/periodic_pagebench.yml (vendored): 4 lines changed

@@ -1,4 +1,4 @@
name: Periodic pagebench performance test on unit-perf hetzner runner
name: Periodic pagebench performance test on unit-perf-aws-arm runners

on:
schedule:

@@ -40,7 +40,7 @@ jobs:
statuses: write
contents: write
pull-requests: write
runs-on: [ self-hosted, unit-perf ]
runs-on: [ self-hosted, unit-perf-aws-arm ]
container:
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
credentials:
.github/workflows/proxy-benchmark.yml (vendored): 4 lines changed

@@ -1,4 +1,4 @@
name: Periodic proxy performance test on unit-perf hetzner runner
name: Periodic proxy performance test on unit-perf-aws-arm runners

on:
push: # TODO: remove after testing

@@ -32,7 +32,7 @@ jobs:
statuses: write
contents: write
pull-requests: write
runs-on: [self-hosted, unit-perf]
runs-on: [self-hosted, unit-perf-aws-arm]
timeout-minutes: 60 # 1h timeout
container:
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
.gitignore (vendored): 1 line changed

@@ -15,6 +15,7 @@ neon.iml
/.neon
/integration_tests/.neon
compaction-suite-results.*
docker-compose/docker-compose-parallel.yml

# Coverage
*.profraw
Cargo.lock (generated): 63 lines changed

@@ -1083,6 +1083,25 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"

[[package]]
name = "cbindgen"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "975982cdb7ad6a142be15bdf84aea7ec6a9e5d4d797c004d43185b24cfe4e684"
dependencies = [
"clap",
"heck",
"indexmap 2.9.0",
"log",
"proc-macro2",
"quote",
"serde",
"serde_json",
"syn 2.0.100",
"tempfile",
"toml",
]

[[package]]
name = "cc"
version = "1.2.16"

@@ -1267,6 +1286,15 @@ dependencies = [
"unicode-width",
]

[[package]]
name = "communicator"
version = "0.1.0"
dependencies = [
"cbindgen",
"neon-shmem",
"workspace_hack",
]

[[package]]
name = "compute_api"
version = "0.1.0"

@@ -1320,6 +1348,7 @@ dependencies = [
"p256 0.13.2",
"pageserver_page_api",
"postgres",
"postgres-types",
"postgres_initdb",
"postgres_versioninfo",
"regex",

@@ -3461,6 +3490,15 @@ dependencies = [
"wasm-bindgen",
]

[[package]]
name = "json"
version = "0.1.0"
dependencies = [
"futures",
"itoa",
"ryu",
]

[[package]]
name = "json-structural-diff"
version = "0.2.0"

@@ -4302,6 +4340,7 @@ dependencies = [
"arc-swap",
"async-compression",
"async-stream",
"base64 0.22.1",
"bincode",
"bit_field",
"byteorder",

@@ -4455,6 +4494,25 @@ dependencies = [
"workspace_hack",
]

[[package]]
name = "pageserver_client_grpc"
version = "0.1.0"
dependencies = [
"anyhow",
"bytes",
"compute_api",
"futures",
"pageserver_api",
"pageserver_page_api",
"tokio",
"tokio-stream",
"tokio-util",
"tonic 0.13.1",
"tracing",
"utils",
"workspace_hack",
]

[[package]]
name = "pageserver_compaction"
version = "0.1.0"

@@ -5647,6 +5705,8 @@ dependencies = [
"azure_identity",
"azure_storage",
"azure_storage_blobs",
"base64 0.22.1",
"byteorder",
"bytes",
"camino",
"camino-tempfile",

@@ -8665,8 +8725,10 @@ dependencies = [
"fail",
"form_urlencoded",
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-util",
"generic-array",
"getrandom 0.2.11",

@@ -8693,6 +8755,7 @@ dependencies = [
"num-iter",
"num-rational",
"num-traits",
"once_cell",
"p256 0.13.2",
"parquet",
"prettyplease",
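The lockfile above only records that the new `json` crate (under libs/proxy/json) depends on `futures`, `itoa`, and `ryu`. As an illustration of what `itoa`/`ryu` are typically pulled in for, allocation-free number formatting in a hand-rolled JSON writer, here is a minimal sketch; the function and field names are hypothetical, not the crate's actual API:

```rust
// Minimal sketch of why a hand-rolled JSON serializer uses `itoa` and `ryu`:
// both format numbers into a stack buffer without allocating.
fn write_number_fields(out: &mut String, n_databases: u64, ratio: f64) {
    let mut int_buf = itoa::Buffer::new();
    let mut float_buf = ryu::Buffer::new();

    out.push_str("{\"n_databases\":");
    out.push_str(int_buf.format(n_databases)); // e.g. "3"
    out.push_str(",\"ratio\":");
    out.push_str(float_buf.format(ratio)); // e.g. "0.5"
    out.push('}');
}

fn main() {
    let mut s = String::new();
    write_number_fields(&mut s, 3, 0.5);
    assert_eq!(s, "{\"n_databases\":3,\"ratio\":0.5}");
    println!("{s}");
}
```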
@@ -8,6 +8,7 @@ members = [
"pageserver/compaction",
"pageserver/ctl",
"pageserver/client",
"pageserver/client_grpc",
"pageserver/pagebench",
"pageserver/page_api",
"proxy",

@@ -42,10 +43,12 @@ members = [
"libs/walproposer",
"libs/wal_decoder",
"libs/postgres_initdb",
"libs/proxy/json",
"libs/proxy/postgres-protocol2",
"libs/proxy/postgres-types2",
"libs/proxy/tokio-postgres2",
"endpoint_storage",
"pgxn/neon/communicator",
]

[workspace.package]

@@ -255,6 +258,7 @@ desim = { version = "0.1", path = "./libs/desim" }
endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
http-utils = { version = "0.1", path = "./libs/http-utils/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
pageserver = { path = "./pageserver" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }

@@ -284,6 +288,7 @@ walproposer = { version = "0.1", path = "./libs/walproposer/" }
workspace_hack = { version = "0.1", path = "./workspace_hack/" }

## Build dependencies
cbindgen = "0.29.0"
criterion = "0.5.1"
rcgen = "0.13"
rstest = "0.18"
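The workspace manifest above adds `pgxn/neon/communicator` as a member and pins `cbindgen = "0.29.0"` among the build dependencies. As a hedged sketch of the usual `build.rs` wiring for a crate whose C header is generated with cbindgen, the output file name and configuration below are assumptions and not necessarily the repository's actual build script:

```rust
// build.rs: typical cbindgen wiring for a crate that is linked into a C
// extension (here, hypothetically, pgxn/neon/communicator).
use std::env;

fn main() {
    let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();

    cbindgen::Builder::new()
        .with_crate(&crate_dir)
        .with_language(cbindgen::Language::C)
        .generate()
        .expect("unable to generate C bindings")
        .write_to_file("communicator_bindings.h"); // assumed output name

    // Rebuild only when the public surface can change.
    println!("cargo:rerun-if-changed=src/lib.rs");
}
```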
Dockerfile: 53 lines changed

@@ -30,7 +30,18 @@ ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA}

# Build Postgres
# Naive way:
#
# 1. COPY . .
# 1. make neon-pg-ext
# 2. cargo build <storage binaries>
#
# But to enable docker to cache intermediate layers, we perform a few preparatory steps:
#
# - Build all postgres versions, depending on just the contents of vendor/
# - Use cargo chef to build all rust dependencies

# 1. Build all postgres versions
FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
WORKDIR /home/nonroot

@@ -38,17 +49,15 @@ COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17
COPY --chown=nonroot pgxn pgxn
COPY --chown=nonroot Makefile Makefile
COPY --chown=nonroot postgres.mk postgres.mk
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh

ENV BUILD_TYPE=release
RUN set -e \
&& mold -run make -j $(nproc) -s neon-pg-ext \
&& tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz .
&& mold -run make -j $(nproc) -s postgres

# Prepare cargo-chef recipe
# 2. Prepare cargo-chef recipe
FROM $REPOSITORY/$IMAGE:$TAG AS plan
WORKDIR /home/nonroot

@@ -56,23 +65,22 @@ COPY --chown=nonroot . .

RUN cargo chef prepare --recipe-path recipe.json

# Build neon binaries
# Main build image
FROM $REPOSITORY/$IMAGE:$TAG AS build
WORKDIR /home/nonroot
ARG GIT_VERSION=local
ARG BUILD_TAG

COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server
COPY --from=plan /home/nonroot/recipe.json recipe.json

ARG ADDITIONAL_RUSTFLAGS=""

# 3. Build cargo dependencies. Note that this step doesn't depend on anything else than
# `recipe.json`, so the layer can be reused as long as none of the dependencies change.
COPY --from=plan /home/nonroot/recipe.json recipe.json
RUN set -e \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo chef cook --locked --release --recipe-path recipe.json

# Perform the main build. We reuse the Postgres build artifacts from the intermediate 'pg-build'
# layer, and the cargo dependencies built in the previous step.
COPY --chown=nonroot --from=pg-build /home/nonroot/pg_install/ pg_install
COPY --chown=nonroot . .

RUN set -e \

@@ -87,10 +95,10 @@ RUN set -e \
--bin endpoint_storage \
--bin neon_local \
--bin storage_scrubber \
--locked --release
--locked --release \
&& mold -run make -j $(nproc) -s neon-pg-ext

# Build final image
#
# Assemble the final image
FROM $BASE_IMAGE_SHA
WORKDIR /data

@@ -130,12 +138,15 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy
COPY --from=build --chown=neon:neon /home/nonroot/target/release/endpoint_storage /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin
COPY --from=build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=build /home/nonroot/pg_install/v15 /usr/local/v15/
COPY --from=build /home/nonroot/pg_install/v16 /usr/local/v16/
COPY --from=build /home/nonroot/pg_install/v17 /usr/local/v17/

COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
COPY --from=pg-build /home/nonroot/pg_install/v17 /usr/local/v17/
COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
# Deprecated: Old deployment scripts use this tarball which contains all the Postgres binaries.
# That's obsolete, since all the same files are also present under /usr/local/v*. But to keep the
# old scripts working for now, create the tarball.
RUN tar -C /usr/local -cvzf /data/postgres_install.tar.gz v14 v15 v16 v17

# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
Makefile: 25 lines changed

@@ -30,11 +30,18 @@ ifeq ($(BUILD_TYPE),release)
PG_CFLAGS += -O2 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
CARGO_PROFILE ?= --profile=release
# NEON_CARGO_ARTIFACT_TARGET_DIR is the directory where `cargo build` places
# the final build artifacts. There is unfortunately no easy way of changing
# it to a fully predictable path, nor to extract the path with a simple
# command. See https://github.com/rust-lang/cargo/issues/9661 and
# https://github.com/rust-lang/cargo/issues/6790.
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
else ifeq ($(BUILD_TYPE),debug)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
PG_CFLAGS += -O0 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
CARGO_PROFILE ?= --profile=dev
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
else
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
endif

@@ -102,7 +109,7 @@ all: neon postgres-install neon-pg-ext

### Neon Rust bits
#
# The 'postgres_ffi' depends on the Postgres headers.
# The 'postgres_ffi' crate depends on the Postgres headers.
.PHONY: neon
neon: postgres-headers-install walproposer-lib cargo-target-dir
+@echo "Compiling Neon"

@@ -115,10 +122,13 @@ cargo-target-dir:
test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG

.PHONY: neon-pg-ext-%
neon-pg-ext-%: postgres-install-%
neon-pg-ext-%: postgres-install-% cargo-target-dir
+@echo "Compiling neon-specific Postgres extensions for $*"
mkdir -p $(BUILD_DIR)/pgxn-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
$(MAKE) PG_CONFIG="$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config" COPT='$(COPT)' \
NEON_CARGO_ARTIFACT_TARGET_DIR="$(NEON_CARGO_ARTIFACT_TARGET_DIR)" \
CARGO_BUILD_FLAGS="$(CARGO_BUILD_FLAGS)" \
CARGO_PROFILE="$(CARGO_PROFILE)" \
-C $(BUILD_DIR)/pgxn-$*\
-f $(ROOT_PROJECT_DIR)/pgxn/Makefile install

@@ -210,6 +220,15 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
setup-pre-commit-hook:
ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit

.PHONY: lint-openapi-spec
lint-openapi-spec:
# operation-2xx-response: pageserver timeline delete returns 404 on success
find . -iname "openapi_spec.y*ml" -exec\
docker run --rm -v ${PWD}:/spec ghcr.io/redocly/cli:1.34.4\
--skip-rule=operation-operationId --skip-rule=operation-summary --extends=minimal\
--skip-rule=no-server-example.com --skip-rule=operation-2xx-response\
lint {} \+

# Targets for building PostgreSQL are defined in postgres.mk.
#
# But if the caller has indicated that PostgreSQL is already
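The Makefile comment above notes that cargo offers no simple way to predict or query the final artifact directory, which is why NEON_CARGO_ARTIFACT_TARGET_DIR is hard-coded per profile. For illustration only, here is a sketch of one workaround that shells out to `cargo metadata`; it assumes a `serde_json` dependency and is not what the Makefile itself does:

```rust
// Sketch: discover cargo's target directory programmatically via `cargo metadata`.
// Error handling is kept minimal on purpose.
use std::process::Command;

fn cargo_target_directory() -> Option<String> {
    let out = Command::new("cargo")
        .args(["metadata", "--format-version", "1", "--no-deps"])
        .output()
        .ok()?;
    let json: serde_json::Value = serde_json::from_slice(&out.stdout).ok()?;
    Some(json["target_directory"].as_str()?.to_string())
}

fn main() {
    // With BUILD_TYPE=release the Makefile then appends "/release" itself.
    println!("{:?}", cargo_target_directory());
}
```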
@@ -1,9 +1,12 @@
disallowed-methods = [
"tokio::task::block_in_place",

# Allow this for now, to deny it later once we stop using Handle::block_on completely
# "tokio::runtime::Handle::block_on",
# use tokio_epoll_uring_ext instead
"tokio_epoll_uring::thread_local_system",

# tokio-epoll-uring:
# - allow-invalid because the method doesn't exist on macOS
{ path = "tokio_epoll_uring::thread_local_system", replacement = "tokio_epoll_uring_ext module inside pageserver crate", allow-invalid = true }
]

disallowed-macros = [
@@ -1636,11 +1636,14 @@ RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN)
# compile neon extensions
#
#########################################################################################
FROM pg-build AS neon-ext-build
FROM pg-build-with-cargo AS neon-ext-build
ARG PG_VERSION

COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) -C pgxn -s install-compute
USER root
COPY . .

RUN make -j $(getconf _NPROCESSORS_ONLN) -C pgxn -s install-compute \
BUILD_TYPE=release CARGO_BUILD_FLAGS="--locked --release" NEON_CARGO_ARTIFACT_TARGET_DIR="$(pwd)/target/release"

#########################################################################################
#

@@ -1912,10 +1915,10 @@ RUN cd /ext-src/pg_repack-src && patch -p1 </ext-src/pg_repack.patch && rm -f /e

COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
RUN echo /usr/local/pgsql/lib > /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig
RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq \
RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq parallel \
&& apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/*
ENV PATH=/usr/local/pgsql/bin:$PATH
ENV PGHOST=compute
ENV PGHOST=compute1
ENV PGPORT=55433
ENV PGUSER=cloud_admin
ENV PGDATABASE=postgres
@@ -66,7 +66,7 @@ url.workspace = true
uuid.workspace = true
walkdir.workspace = true
x509-cert.workspace = true

postgres-types.workspace = true
postgres_versioninfo.workspace = true
postgres_initdb.workspace = true
compute_api.workspace = true
@@ -3,7 +3,7 @@ use chrono::{DateTime, Utc};
use compute_api::privilege::Privilege;
use compute_api::responses::{
ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
LfcPrewarmState, TlsConfig,
LfcPrewarmState, PromoteState, TlsConfig,
};
use compute_api::spec::{
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,

@@ -29,7 +29,7 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
use std::sync::{Arc, Condvar, Mutex, RwLock};
use std::time::{Duration, Instant};
use std::{env, fs};
use tokio::spawn;
use tokio::{spawn, sync::watch, task::JoinHandle, time};
use tracing::{Instrument, debug, error, info, instrument, warn};
use url::Url;
use utils::id::{TenantId, TimelineId};

@@ -107,6 +107,8 @@ pub struct ComputeNodeParams {
pub installed_extensions_collection_interval: Arc<AtomicU64>,
}

type TaskHandle = Mutex<Option<JoinHandle<()>>>;

/// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode {
pub params: ComputeNodeParams,

@@ -129,7 +131,8 @@ pub struct ComputeNode {
pub compute_ctl_config: ComputeCtlConfig,

/// Handle to the extension stats collection task
extension_stats_task: Mutex<Option<tokio::task::JoinHandle<()>>>,
extension_stats_task: TaskHandle,
lfc_offload_task: TaskHandle,
}

// store some metrics about download size that might impact startup time

@@ -171,6 +174,7 @@ pub struct ComputeState {
/// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
/// mode == ComputeMode::Primary. None otherwise
pub terminate_flush_lsn: Option<Lsn>,
pub promote_state: Option<watch::Receiver<PromoteState>>,

pub metrics: ComputeMetrics,
}

@@ -188,6 +192,7 @@ impl ComputeState {
lfc_prewarm_state: LfcPrewarmState::default(),
lfc_offload_state: LfcOffloadState::default(),
terminate_flush_lsn: None,
promote_state: None,
}
}

@@ -368,7 +373,7 @@ fn maybe_cgexec(cmd: &str) -> Command {

struct PostgresHandle {
postgres: std::process::Child,
log_collector: tokio::task::JoinHandle<Result<()>>,
log_collector: JoinHandle<Result<()>>,
}

impl PostgresHandle {

@@ -382,7 +387,7 @@ struct StartVmMonitorResult {
#[cfg(target_os = "linux")]
token: tokio_util::sync::CancellationToken,
#[cfg(target_os = "linux")]
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
vm_monitor: Option<JoinHandle<Result<()>>>,
}

impl ComputeNode {

@@ -433,6 +438,7 @@ impl ComputeNode {
ext_download_progress: RwLock::new(HashMap::new()),
compute_ctl_config: config.compute_ctl_config,
extension_stats_task: Mutex::new(None),
lfc_offload_task: Mutex::new(None),
})
}

@@ -520,8 +526,8 @@ impl ComputeNode {
None
};

// Terminate the extension stats collection task
this.terminate_extension_stats_task();
this.terminate_lfc_offload_task();

// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.

@@ -851,12 +857,15 @@ impl ComputeNode {
// Log metrics so that we can search for slow operations in logs
info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished");

// Spawn the extension stats background task
self.spawn_extension_stats_task();

if pspec.spec.autoprewarm {
info!("autoprewarming on startup as requested");
self.prewarm_lfc(None);
}
if let Some(seconds) = pspec.spec.offload_lfc_interval_seconds {
self.spawn_lfc_offload_task(Duration::from_secs(seconds.into()));
};
Ok(())
}

@@ -1049,7 +1058,7 @@ impl ComputeNode {
};

let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
let mut client = page_api::Client::new(
let mut client = page_api::Client::connect(
shard0_connstr,
spec.tenant_id,
spec.timeline_id,

@@ -2357,10 +2366,7 @@ LIMIT 100",
}

pub fn spawn_extension_stats_task(&self) {
// Cancel any existing task
if let Some(handle) = self.extension_stats_task.lock().unwrap().take() {
handle.abort();
}
self.terminate_extension_stats_task();

let conf = self.tokio_conn_conf.clone();
let atomic_interval = self.params.installed_extensions_collection_interval.clone();

@@ -2396,8 +2402,30 @@ LIMIT 100",
}

fn terminate_extension_stats_task(&self) {
if let Some(handle) = self.extension_stats_task.lock().unwrap().take() {
handle.abort();
if let Some(h) = self.extension_stats_task.lock().unwrap().take() {
h.abort()
}
}

pub fn spawn_lfc_offload_task(self: &Arc<Self>, interval: Duration) {
self.terminate_lfc_offload_task();
let secs = interval.as_secs();
info!("spawning lfc offload worker with {secs}s interval");
let this = self.clone();
let handle = spawn(async move {
let mut interval = time::interval(interval);
interval.tick().await; // returns immediately
loop {
interval.tick().await;
this.offload_lfc_async().await;
}
});
*self.lfc_offload_task.lock().unwrap() = Some(handle);
}

fn terminate_lfc_offload_task(&self) {
if let Some(h) = self.lfc_offload_task.lock().unwrap().take() {
h.abort()
}
}

@@ -2406,19 +2434,11 @@ LIMIT 100",
// If the value is -1, we never suspend so set the value to default collection.
// If the value is 0, it means default, we will just continue to use the default.
if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 {
info!(
"[NEON_EXT_INT_UPD] Spec Timeout: {}, New Timeout: {}",
spec.suspend_timeout_seconds, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL
);
self.params.installed_extensions_collection_interval.store(
DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL,
std::sync::atomic::Ordering::SeqCst,
);
} else {
info!(
"[NEON_EXT_INT_UPD] Spec Timeout: {}",
spec.suspend_timeout_seconds
);
self.params.installed_extensions_collection_interval.store(
spec.suspend_timeout_seconds as u64,
std::sync::atomic::Ordering::SeqCst,
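The `interval.tick().await; // returns immediately` line in spawn_lfc_offload_task above relies on tokio's interval semantics: the first tick completes at once, and only later ticks wait a full period, so the extra await prevents an offload right at startup. A standalone sketch of that behaviour (not compute_ctl code):

```rust
use std::time::{Duration, Instant};

#[tokio::main]
async fn main() {
    let start = Instant::now();
    let mut interval = tokio::time::interval(Duration::from_secs(1));

    interval.tick().await; // first tick completes immediately
    println!("first tick after {:?}", start.elapsed()); // ~0ms

    interval.tick().await; // second tick completes one period later
    println!("second tick after {:?}", start.elapsed()); // ~1s
}
```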
@@ -5,6 +5,7 @@ use compute_api::responses::LfcOffloadState;
use compute_api::responses::LfcPrewarmState;
use http::StatusCode;
use reqwest::Client;
use std::mem::replace;
use std::sync::Arc;
use tokio::{io::AsyncReadExt, spawn};
use tracing::{error, info};

@@ -88,17 +89,15 @@ impl ComputeNode {
self.state.lock().unwrap().lfc_offload_state.clone()
}

/// Returns false if there is a prewarm request ongoing, true otherwise
/// If there is a prewarm request ongoing, return false, true otherwise
pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
crate::metrics::LFC_PREWARM_REQUESTS.inc();
{
let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
if let LfcPrewarmState::Prewarming =
std::mem::replace(state, LfcPrewarmState::Prewarming)
{
if let LfcPrewarmState::Prewarming = replace(state, LfcPrewarmState::Prewarming) {
return false;
}
}
crate::metrics::LFC_PREWARMS.inc();

let cloned = self.clone();
spawn(async move {

@@ -106,7 +105,8 @@ impl ComputeNode {
cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed;
return;
};
error!(%err);
crate::metrics::LFC_PREWARM_ERRORS.inc();
error!(%err, "prewarming lfc");
cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed {
error: err.to_string(),
};

@@ -152,32 +152,42 @@ impl ComputeNode {
.map(|_| ())
}

/// Returns false if there is an offload request ongoing, true otherwise
/// If offload request is ongoing, return false, true otherwise
pub fn offload_lfc(self: &Arc<Self>) -> bool {
crate::metrics::LFC_OFFLOAD_REQUESTS.inc();
{
let state = &mut self.state.lock().unwrap().lfc_offload_state;
if let LfcOffloadState::Offloading =
std::mem::replace(state, LfcOffloadState::Offloading)
{
if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading {
return false;
}
}

let cloned = self.clone();
spawn(async move {
let Err(err) = cloned.offload_lfc_impl().await else {
cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
return;
};
error!(%err);
cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
error: err.to_string(),
};
});
spawn(async move { cloned.offload_lfc_with_state_update().await });
true
}

pub async fn offload_lfc_async(self: &Arc<Self>) {
{
let state = &mut self.state.lock().unwrap().lfc_offload_state;
if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading {
return;
}
}
self.offload_lfc_with_state_update().await
}

async fn offload_lfc_with_state_update(&self) {
crate::metrics::LFC_OFFLOADS.inc();
let Err(err) = self.offload_lfc_impl().await else {
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
return;
};
crate::metrics::LFC_OFFLOAD_ERRORS.inc();
error!(%err, "offloading lfc");
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
error: err.to_string(),
};
}

async fn offload_lfc_impl(&self) -> Result<()> {
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
info!(%url, "requesting LFC state from postgres");
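Both prewarm_lfc and the offload paths above guard against concurrent runs with the same idiom: swap the state to the in-progress variant while the lock is held, and bail out if it was already in progress, so only one caller wins. A condensed, self-contained sketch with simplified stand-in types for LfcPrewarmState / LfcOffloadState:

```rust
use std::mem::replace;
use std::sync::Mutex;

#[derive(PartialEq)]
enum State {
    Idle,
    Running,
}

fn try_start(state: &Mutex<State>) -> bool {
    let mut guard = state.lock().unwrap();
    // Swap in Running; if the old value was already Running, someone else owns the job.
    if replace(&mut *guard, State::Running) == State::Running {
        return false;
    }
    true // we transitioned Idle -> Running and own the job
}

fn main() {
    let state = Mutex::new(State::Idle);
    assert!(try_start(&state));
    assert!(!try_start(&state)); // second attempt is rejected
}
```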
compute_tools/src/compute_promote.rs (new file): 132 lines

@@ -0,0 +1,132 @@
use crate::compute::ComputeNode;
use anyhow::{Context, Result, bail};
use compute_api::{
responses::{LfcPrewarmState, PromoteState, SafekeepersLsn},
spec::ComputeMode,
};
use std::{sync::Arc, time::Duration};
use tokio::time::sleep;
use utils::lsn::Lsn;

impl ComputeNode {
/// Returns only when promote fails or succeeds. If a network error occurs
/// and http client disconnects, this does not stop promotion, and subsequent
/// calls block until promote finishes.
/// Called by control plane on secondary after primary endpoint is terminated
pub async fn promote(self: &Arc<Self>, safekeepers_lsn: SafekeepersLsn) -> PromoteState {
let cloned = self.clone();
let start_promotion = || {
let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted);
tokio::spawn(async move {
tx.send(match cloned.promote_impl(safekeepers_lsn).await {
Ok(_) => PromoteState::Completed,
Err(err) => {
tracing::error!(%err, "promoting");
PromoteState::Failed {
error: err.to_string(),
}
}
})
});
rx
};

let mut task;
// self.state is unlocked after block ends so we lock it in promote_impl
// and task.changed() is reached
{
task = self
.state
.lock()
.unwrap()
.promote_state
.get_or_insert_with(start_promotion)
.clone()
}
task.changed().await.expect("promote sender dropped");
task.borrow().clone()
}

// Why do we have to supply safekeepers?
// For secondary we use primary_connection_conninfo so safekeepers field is empty
async fn promote_impl(&self, safekeepers_lsn: SafekeepersLsn) -> Result<()> {
{
let state = self.state.lock().unwrap();
let mode = &state.pspec.as_ref().unwrap().spec.mode;
if *mode != ComputeMode::Replica {
bail!("{} is not replica", mode.to_type_str());
}

// we don't need to query Postgres so not self.lfc_prewarm_state()
match &state.lfc_prewarm_state {
LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => {
bail!("prewarm not requested or pending")
}
LfcPrewarmState::Failed { error } => {
tracing::warn!(%error, "replica prewarm failed")
}
_ => {}
}
}

let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await
.context("connecting to postgres")?;

let primary_lsn = safekeepers_lsn.wal_flush_lsn;
let mut last_wal_replay_lsn: Lsn = Lsn::INVALID;
const RETRIES: i32 = 20;
for i in 0..=RETRIES {
let row = client
.query_one("SELECT pg_last_wal_replay_lsn()", &[])
.await
.context("getting last replay lsn")?;
let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
last_wal_replay_lsn = lsn.into();
if last_wal_replay_lsn >= primary_lsn {
break;
}
tracing::info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}");
sleep(Duration::from_secs(1)).await;
}
if last_wal_replay_lsn < primary_lsn {
bail!("didn't catch up with primary in {RETRIES} retries");
}

// using $1 doesn't work with ALTER SYSTEM SET
let safekeepers_sql = format!(
"ALTER SYSTEM SET neon.safekeepers='{}'",
safekeepers_lsn.safekeepers
);
client
.query(&safekeepers_sql, &[])
.await
.context("setting safekeepers")?;
client
.query("SELECT pg_reload_conf()", &[])
.await
.context("reloading postgres config")?;
let row = client
.query_one("SELECT * FROM pg_promote()", &[])
.await
.context("pg_promote")?;
if !row.get::<usize, bool>(0) {
bail!("pg_promote() returned false");
}

let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await
.context("connecting to postgres")?;
let row = client
.query_one("SHOW transaction_read_only", &[])
.await
.context("getting transaction_read_only")?;
if row.get::<usize, &str>(0) == "on" {
bail!("replica in read only mode after promotion");
}

let mut state = self.state.lock().unwrap();
state.pspec.as_mut().unwrap().spec.mode = ComputeMode::Primary;
Ok(())
}
}
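The promote() entry point above is a single-flight pattern: the first caller spawns promote_impl and stores a watch receiver in ComputeState, and every later caller clones that receiver and awaits the same outcome. A condensed sketch of the same pattern with hypothetical types (JobState stands in for PromoteState):

```rust
use tokio::sync::watch;

#[derive(Clone, Debug, PartialEq)]
enum JobState {
    NotStarted,
    Done,
}

struct Node {
    job: std::sync::Mutex<Option<watch::Receiver<JobState>>>,
}

impl Node {
    async fn run_once(&self) -> JobState {
        let mut rx = {
            let mut slot = self.job.lock().unwrap();
            slot.get_or_insert_with(|| {
                // Only the first caller reaches this closure and spawns the work.
                let (tx, rx) = watch::channel(JobState::NotStarted);
                tokio::spawn(async move {
                    // ... the actual long-running work would go here ...
                    let _ = tx.send(JobState::Done);
                });
                rx
            })
            .clone()
        }; // mutex guard is dropped here, before awaiting
        rx.changed().await.expect("sender dropped");
        rx.borrow().clone()
    }
}

#[tokio::main]
async fn main() {
    let node = Node { job: std::sync::Mutex::new(None) };
    assert_eq!(node.run_once().await, JobState::Done);
    assert_eq!(node.run_once().await, JobState::Done); // reuses the stored result
}
```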
@@ -83,6 +83,87 @@ paths:
schema:
$ref: "#/components/schemas/DbsAndRoles"

/promote:
post:
tags:
- Promotion
summary: Promote secondary replica to primary
description: ""
operationId: promoteReplica
requestBody:
description: Promote requests data
required: true
content:
application/json:
schema:
$ref: "#/components/schemas/SafekeepersLsn"
responses:
200:
description: Promote succeeded or wasn't started
content:
application/json:
schema:
$ref: "#/components/schemas/PromoteState"
500:
description: Promote failed
content:
application/json:
schema:
$ref: "#/components/schemas/PromoteState"

/lfc/prewarm:
post:
summary: Request LFC Prewarm
parameters:
- name: from_endpoint
in: query
schema:
type: string
description: ""
operationId: lfcPrewarm
responses:
202:
description: LFC prewarm started
429:
description: LFC prewarm ongoing
get:
tags:
- Prewarm
summary: Get LFC prewarm state
description: ""
operationId: getLfcPrewarmState
responses:
200:
description: Prewarm state
content:
application/json:
schema:
$ref: "#/components/schemas/LfcPrewarmState"

/lfc/offload:
post:
summary: Request LFC offload
description: ""
operationId: lfcOffload
responses:
202:
description: LFC offload started
429:
description: LFC offload ongoing
get:
tags:
- Prewarm
summary: Get LFC offloading state
description: ""
operationId: getLfcOffloadState
responses:
200:
description: Offload state
content:
application/json:
schema:
$ref: "#/components/schemas/LfcOffloadState"

/database_schema:
get:
tags:

@@ -335,15 +416,6 @@ components:
total_startup_ms:
type: integer

Info:
type: object
description: Information about VM/Pod.
required:
- num_cpus
properties:
num_cpus:
type: integer

DbsAndRoles:
type: object
description: Databases and Roles

@@ -497,25 +569,69 @@ components:
type: string
example: "1.0.0"

InstalledExtensions:
SafekeepersLsn:
type: object
required:
- safekeepers
- wal_flush_lsn
properties:
extensions:
description: Contains list of installed extensions.
type: array
items:
type: object
properties:
extname:
type: string
version:
type: string
items:
type: string
n_databases:
type: integer
owned_by_superuser:
type: integer
safekeepers:
description: Primary replica safekeepers
type: string
wal_flush_lsn:
description: Primary last WAL flush LSN
type: string

LfcPrewarmState:
type: object
required:
- status
- total
- prewarmed
- skipped
properties:
status:
description: Lfc prewarm status
enum: [not_prewarmed, prewarming, completed, failed]
type: string
error:
description: Lfc prewarm error, if any
type: string
total:
description: Total pages processed
type: integer
prewarmed:
description: Total pages prewarmed
type: integer
skipped:
description: Pages processed but not prewarmed
type: integer

LfcOffloadState:
type: object
required:
- status
properties:
status:
description: Lfc offload status
enum: [not_offloaded, offloading, completed, failed]
type: string
error:
description: Lfc offload error, if any
type: string

PromoteState:
type: object
required:
- status
properties:
status:
description: Promote result
enum: [not_promoted, completed, failed]
type: string
error:
description: Promote error, if any
type: string

SetRoleGrantsRequest:
type: object
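A hedged usage sketch for the /promote endpoint specified above. Note that the axum handler (shown later in this diff) extracts the body with `Form`, so the sketch sends a form-encoded SafekeepersLsn even though the spec advertises application/json; the response is a PromoteState JSON object (200 on success, 500 on failure). The address, port, and field values below are assumptions for illustration; compute_ctl's real listen address and authentication are not shown in this diff:

```rust
use std::collections::HashMap;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let mut form = HashMap::new();
    form.insert("safekeepers", "sk-1:5454,sk-2:5454,sk-3:5454"); // hypothetical hosts
    form.insert("wal_flush_lsn", "0/2000060"); // primary's last WAL flush LSN

    let resp = reqwest::Client::new()
        .post("http://localhost:3080/promote") // assumed compute_ctl address
        .form(&form)
        .send()
        .await?;

    println!("status: {}", resp.status());
    println!("body:   {}", resp.text().await?);
    Ok(())
}
```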
@@ -14,6 +14,7 @@ pub(in crate::http) mod insights;
pub(in crate::http) mod lfc;
pub(in crate::http) mod metrics;
pub(in crate::http) mod metrics_json;
pub(in crate::http) mod promote;
pub(in crate::http) mod status;
pub(in crate::http) mod terminate;
compute_tools/src/http/routes/promote.rs (new file): 14 lines

@@ -0,0 +1,14 @@
use crate::http::JsonResponse;
use axum::Form;
use http::StatusCode;

pub(in crate::http) async fn promote(
compute: axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>,
Form(safekeepers_lsn): Form<compute_api::responses::SafekeepersLsn>,
) -> axum::response::Response {
let state = compute.promote(safekeepers_lsn).await;
if let compute_api::responses::PromoteState::Failed { error } = state {
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, error);
}
JsonResponse::success(StatusCode::OK, state)
}
@@ -23,7 +23,7 @@ use super::{
middleware::authorize::Authorize,
routes::{
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
grants, insights, lfc, metrics, metrics_json, status, terminate,
grants, insights, lfc, metrics, metrics_json, promote, status, terminate,
},
};
use crate::compute::ComputeNode;

@@ -87,6 +87,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
.route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
.route("/promote", post(promote::promote))
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump))
@@ -12,6 +12,7 @@ pub mod logger;
pub mod catalog;
pub mod compute;
pub mod compute_prewarm;
pub mod compute_promote;
pub mod disk_quota;
pub mod extension_server;
pub mod installed_extensions;
@@ -192,7 +192,7 @@ fn acquire_lsn_lease_grpc(
lsn: Lsn,
) -> Result<Option<SystemTime>> {
tokio::runtime::Handle::current().block_on(async move {
let mut client = page_api::Client::new(
let mut client = page_api::Client::connect(
connstring.to_string(),
tenant_shard_id.tenant_id,
timeline_id,
@@ -97,20 +97,34 @@ pub(crate) static PG_TOTAL_DOWNTIME_MS: Lazy<GenericCounter<AtomicU64>> = Lazy::
.expect("failed to define a metric")
});

/// Needed as neon.file_cache_prewarm_batch == 0 doesn't mean we never tried to prewarm.
/// On the other hand, LFC_PREWARMED_PAGES is excessive as we can GET /lfc/prewarm
pub(crate) static LFC_PREWARM_REQUESTS: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static LFC_PREWARMS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"compute_ctl_lfc_prewarm_requests_total",
"Total number of LFC prewarm requests made by compute_ctl",
"compute_ctl_lfc_prewarms_total",
"Total number of LFC prewarms requested by compute_ctl or autoprewarm option",
)
.expect("failed to define a metric")
});

pub(crate) static LFC_OFFLOAD_REQUESTS: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static LFC_PREWARM_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"compute_ctl_lfc_offload_requests_total",
"Total number of LFC offload requests made by compute_ctl",
"compute_ctl_lfc_prewarm_errors_total",
"Total number of LFC prewarm errors",
)
.expect("failed to define a metric")
});

pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"compute_ctl_lfc_offloads_total",
"Total number of LFC offloads requested by compute_ctl or lfc_offload_period_seconds option",
)
.expect("failed to define a metric")
});

pub(crate) static LFC_OFFLOAD_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"compute_ctl_lfc_offload_errors_total",
"Total number of LFC offload errors",
)
.expect("failed to define a metric")
});

@@ -124,7 +138,9 @@ pub fn collect() -> Vec<MetricFamily> {
metrics.extend(AUDIT_LOG_DIR_SIZE.collect());
metrics.extend(PG_CURR_DOWNTIME_MS.collect());
metrics.extend(PG_TOTAL_DOWNTIME_MS.collect());
metrics.extend(LFC_PREWARM_REQUESTS.collect());
metrics.extend(LFC_OFFLOAD_REQUESTS.collect());
metrics.extend(LFC_PREWARMS.collect());
metrics.extend(LFC_PREWARM_ERRORS.collect());
metrics.extend(LFC_OFFLOADS.collect());
metrics.extend(LFC_OFFLOAD_ERRORS.collect());
metrics
}
@@ -0,0 +1 @@
|
||||
GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION;
|
||||
@@ -7,13 +7,17 @@ BEGIN
|
||||
INTO monitor
|
||||
FROM pg_auth_members
|
||||
WHERE roleid = 'pg_monitor'::regrole
|
||||
AND member = 'pg_monitor'::regrole;
|
||||
AND member = 'neon_superuser'::regrole;
|
||||
|
||||
IF NOT monitor.member THEN
|
||||
IF monitor IS NULL THEN
|
||||
RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';
|
||||
END IF;
|
||||
|
||||
IF monitor.admin IS NULL OR NOT monitor.member THEN
|
||||
RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor';
|
||||
END IF;
|
||||
|
||||
IF NOT monitor.admin THEN
|
||||
IF monitor.admin IS NULL OR NOT monitor.admin THEN
|
||||
RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor';
|
||||
END IF;
|
||||
END $$;
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
DO $$
|
||||
DECLARE
|
||||
signal_backend record;
|
||||
BEGIN
|
||||
SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
|
||||
admin_option AS admin
|
||||
INTO signal_backend
|
||||
FROM pg_auth_members
|
||||
WHERE roleid = 'pg_signal_backend'::regrole
|
||||
AND member = 'neon_superuser'::regrole;
|
||||
|
||||
IF signal_backend IS NULL THEN
|
||||
RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_signal_backend';
|
||||
END IF;
|
||||
|
||||
IF signal_backend.member IS NULL OR NOT signal_backend.member THEN
|
||||
RAISE EXCEPTION 'neon_superuser is not a member of pg_signal_backend';
|
||||
END IF;
|
||||
|
||||
IF signal_backend.admin IS NULL OR NOT signal_backend.admin THEN
|
||||
RAISE EXCEPTION 'neon_superuser cannot grant pg_signal_backend';
|
||||
END IF;
|
||||
END $$;
|
||||
@@ -197,6 +197,7 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
include_str!(
|
||||
"./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
|
||||
),
|
||||
include_str!("./migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql"),
|
||||
];
|
||||
|
||||
MigrationRunner::new(client, &migrations)
|
||||
|
||||
@@ -31,6 +31,7 @@ mod pg_helpers_tests {
|
||||
wal_level = logical
|
||||
hot_standby = on
|
||||
autoprewarm = off
|
||||
offload_lfc_interval_seconds = 20
|
||||
neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
|
||||
wal_log_hints = on
|
||||
log_connections = on
|
||||
|
||||
@@ -675,6 +675,16 @@ struct EndpointStartCmdArgs {
|
||||
#[arg(default_value = "90s")]
|
||||
start_timeout: Duration,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Download LFC cache from endpoint storage on endpoint startup",
|
||||
default_value = "false"
|
||||
)]
|
||||
autoprewarm: bool,
|
||||
|
||||
#[clap(long, help = "Upload LFC cache to endpoint storage periodically")]
|
||||
offload_lfc_interval_seconds: Option<std::num::NonZeroU64>,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Run in development mode, skipping VM-specific operations like process termination",
|
||||
@@ -1585,22 +1595,24 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
let endpoint_storage_token = env.generate_auth_token(&claims)?;
|
||||
let endpoint_storage_addr = env.endpoint_storage.listen_addr.to_string();
|
||||
|
||||
let args = control_plane::endpoint::EndpointStartArgs {
|
||||
auth_token,
|
||||
endpoint_storage_token,
|
||||
endpoint_storage_addr,
|
||||
safekeepers_generation,
|
||||
safekeepers,
|
||||
pageservers,
|
||||
remote_ext_base_url: remote_ext_base_url.clone(),
|
||||
shard_stripe_size: stripe_size.0 as usize,
|
||||
create_test_user: args.create_test_user,
|
||||
start_timeout: args.start_timeout,
|
||||
autoprewarm: args.autoprewarm,
|
||||
offload_lfc_interval_seconds: args.offload_lfc_interval_seconds,
|
||||
dev: args.dev,
|
||||
};
|
||||
|
||||
println!("Starting existing endpoint {endpoint_id}...");
|
||||
endpoint
|
||||
.start(
|
||||
&auth_token,
|
||||
endpoint_storage_token,
|
||||
endpoint_storage_addr,
|
||||
safekeepers_generation,
|
||||
safekeepers,
|
||||
pageservers,
|
||||
remote_ext_base_url.as_ref(),
|
||||
stripe_size.0 as usize,
|
||||
args.create_test_user,
|
||||
args.start_timeout,
|
||||
args.dev,
|
||||
)
|
||||
.await?;
|
||||
endpoint.start(args).await?;
|
||||
}
|
||||
EndpointCmd::Reconfigure(args) => {
|
||||
let endpoint_id = &args.endpoint_id;
|
||||
|
||||
@@ -373,6 +373,22 @@ impl std::fmt::Display for EndpointTerminateMode {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct EndpointStartArgs {
|
||||
pub auth_token: Option<String>,
|
||||
pub endpoint_storage_token: String,
|
||||
pub endpoint_storage_addr: String,
|
||||
pub safekeepers_generation: Option<SafekeeperGeneration>,
|
||||
pub safekeepers: Vec<NodeId>,
|
||||
pub pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
pub remote_ext_base_url: Option<String>,
|
||||
pub shard_stripe_size: usize,
|
||||
pub create_test_user: bool,
|
||||
pub start_timeout: Duration,
|
||||
pub autoprewarm: bool,
|
||||
pub offload_lfc_interval_seconds: Option<std::num::NonZeroU64>,
|
||||
pub dev: bool,
|
||||
}
|
||||
|
||||
impl Endpoint {
|
||||
fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
|
||||
if !entry.file_type()?.is_dir() {
|
||||
@@ -677,21 +693,7 @@ impl Endpoint {
|
||||
})
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
endpoint_storage_token: String,
|
||||
endpoint_storage_addr: String,
|
||||
safekeepers_generation: Option<SafekeeperGeneration>,
|
||||
safekeepers: Vec<NodeId>,
|
||||
pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
remote_ext_base_url: Option<&String>,
|
||||
shard_stripe_size: usize,
|
||||
create_test_user: bool,
|
||||
start_timeout: Duration,
|
||||
dev: bool,
|
||||
) -> Result<()> {
|
||||
pub async fn start(&self, args: EndpointStartArgs) -> Result<()> {
|
||||
if self.status() == EndpointStatus::Running {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
}
|
||||
@@ -704,10 +706,10 @@ impl Endpoint {
|
||||
std::fs::remove_dir_all(self.pgdata())?;
|
||||
}
|
||||
|
||||
let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
|
||||
let pageserver_connstring = Self::build_pageserver_connstr(&args.pageservers);
|
||||
assert!(!pageserver_connstring.is_empty());
|
||||
|
||||
let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
|
||||
let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?;
|
||||
|
||||
// check for file remote_extensions_spec.json
|
||||
// if it is present, read it and pass to compute_ctl
|
||||
@@ -735,7 +737,7 @@ impl Endpoint {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
state: None,
|
||||
roles: if create_test_user {
|
||||
roles: if args.create_test_user {
|
||||
vec![Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
@@ -744,7 +746,7 @@ impl Endpoint {
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
databases: if create_test_user {
|
||||
databases: if args.create_test_user {
|
||||
vec![Database {
|
||||
name: PgIdent::from_str("neondb").unwrap(),
|
||||
owner: PgIdent::from_str("test").unwrap(),
|
||||
@@ -766,20 +768,21 @@ impl Endpoint {
|
||||
endpoint_id: Some(self.endpoint_id.clone()),
|
||||
mode: self.mode,
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
|
||||
safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
storage_auth_token: args.auth_token.clone(),
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
shard_stripe_size: Some(args.shard_stripe_size),
|
||||
local_proxy_config: None,
|
||||
reconfigure_concurrency: self.reconfigure_concurrency,
|
||||
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
|
||||
audit_log_level: ComputeAudit::Disabled,
|
||||
logs_export_host: None::<String>,
|
||||
endpoint_storage_addr: Some(endpoint_storage_addr),
|
||||
endpoint_storage_token: Some(endpoint_storage_token),
|
||||
autoprewarm: false,
|
||||
endpoint_storage_addr: Some(args.endpoint_storage_addr),
|
||||
endpoint_storage_token: Some(args.endpoint_storage_token),
|
||||
autoprewarm: args.autoprewarm,
|
||||
offload_lfc_interval_seconds: args.offload_lfc_interval_seconds,
|
||||
suspend_timeout_seconds: -1, // Only used in neon_local.
|
||||
};
|
||||
|
||||
@@ -791,7 +794,7 @@ impl Endpoint {
|
||||
debug!("spec.cluster {:?}", spec.cluster);
|
||||
|
||||
// fill missing fields again
|
||||
if create_test_user {
|
||||
if args.create_test_user {
|
||||
spec.cluster.roles.push(Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
@@ -826,7 +829,7 @@ impl Endpoint {
|
||||
// Launch compute_ctl
|
||||
let conn_str = self.connstr("cloud_admin", "postgres");
|
||||
println!("Starting postgres node at '{conn_str}'");
|
||||
if create_test_user {
|
||||
if args.create_test_user {
|
||||
let conn_str = self.connstr("test", "neondb");
|
||||
println!("Also at '{conn_str}'");
|
||||
}
|
||||
@@ -858,11 +861,11 @@ impl Endpoint {
|
||||
.stderr(logfile.try_clone()?)
|
||||
.stdout(logfile);
|
||||
|
||||
if let Some(remote_ext_base_url) = remote_ext_base_url {
|
||||
cmd.args(["--remote-ext-base-url", remote_ext_base_url]);
|
||||
if let Some(remote_ext_base_url) = args.remote_ext_base_url {
|
||||
cmd.args(["--remote-ext-base-url", &remote_ext_base_url]);
|
||||
}
|
||||
|
||||
if dev {
|
||||
if args.dev {
|
||||
cmd.arg("--dev");
|
||||
}
|
||||
|
||||
@@ -894,10 +897,11 @@ impl Endpoint {
|
||||
Ok(state) => {
|
||||
match state.status {
|
||||
ComputeStatus::Init => {
|
||||
if Instant::now().duration_since(start_at) > start_timeout {
|
||||
let timeout = args.start_timeout;
|
||||
if Instant::now().duration_since(start_at) > timeout {
|
||||
bail!(
|
||||
"compute startup timed out {:?}; still in Init state",
|
||||
start_timeout
|
||||
timeout
|
||||
);
|
||||
}
|
||||
// keep retrying
|
||||
@@ -925,9 +929,10 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
if Instant::now().duration_since(start_at) > start_timeout {
|
||||
if Instant::now().duration_since(start_at) > args.start_timeout {
|
||||
return Err(e).context(format!(
|
||||
"timed out {start_timeout:?} waiting to connect to compute_ctl HTTP",
|
||||
"timed out {:?} waiting to connect to compute_ctl HTTP",
|
||||
args.start_timeout
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -452,6 +452,12 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
|
||||
// HADRON
|
||||
image_layer_force_creation_period: settings
|
||||
.remove("image_layer_force_creation_period")
|
||||
.map(humantime::parse_duration)
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_layer_force_creation_period' as duration")?,
|
||||
image_layer_creation_check_threshold: settings
|
||||
.remove("image_layer_creation_check_threshold")
|
||||
.map(|x| x.parse::<u8>())
|
||||
|
||||
@@ -65,12 +65,27 @@ enum Command {
|
||||
#[arg(long)]
|
||||
scheduling: Option<NodeSchedulingPolicy>,
|
||||
},
|
||||
// Set a node status as deleted.
|
||||
/// Exists for backup usage and will be removed in future.
|
||||
/// Use [`Command::NodeStartDelete`] instead, if possible.
|
||||
NodeDelete {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Start deletion of the specified pageserver.
|
||||
NodeStartDelete {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Cancel deletion of the specified pageserver and wait for `timeout`
|
||||
/// for the operation to be canceled. May be retried.
|
||||
NodeCancelDelete {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
#[arg(long)]
|
||||
timeout: humantime::Duration,
|
||||
},
|
||||
/// Delete a tombstone of node from the storage controller.
|
||||
/// This is used when we want to allow the node to be re-registered.
|
||||
NodeDeleteTombstone {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
@@ -912,10 +927,43 @@ async fn main() -> anyhow::Result<()> {
|
||||
.await?;
|
||||
}
|
||||
Command::NodeDelete { node_id } => {
|
||||
eprintln!("Warning: This command is obsolete and will be removed in a future version");
|
||||
eprintln!("Use `NodeStartDelete` instead, if possible");
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
|
||||
.await?;
|
||||
}
|
||||
Command::NodeStartDelete { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::PUT,
|
||||
format!("control/v1/node/{node_id}/delete"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
println!("Delete started for {node_id}");
|
||||
}
|
||||
Command::NodeCancelDelete { node_id, timeout } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::DELETE,
|
||||
format!("control/v1/node/{node_id}/delete"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
|
||||
|
||||
let final_policy =
|
||||
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
|
||||
!matches!(sched, NodeSchedulingPolicy::Deleting)
|
||||
})
|
||||
.await?;
|
||||
|
||||
println!(
|
||||
"Delete was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
|
||||
);
|
||||
}
|
||||
Command::NodeDeleteTombstone { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
|
||||
@@ -54,14 +54,16 @@ else
|
||||
printf '%s\n' "${result}" | jq .
|
||||
fi
|
||||
|
||||
echo "Check if a timeline present"
|
||||
PARAMS=(
|
||||
-X GET
|
||||
-H "Content-Type: application/json"
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
|
||||
)
|
||||
timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
|
||||
if [[ -z "${timeline_id}" || "${timeline_id}" = null ]]; then
|
||||
if [[ "${RUN_PARALLEL:-false}" != "true" ]]; then
|
||||
echo "Check if a timeline present"
|
||||
PARAMS=(
|
||||
-X GET
|
||||
-H "Content-Type: application/json"
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
|
||||
)
|
||||
timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
|
||||
fi
|
||||
if [[ -z "${timeline_id:-}" || "${timeline_id:-}" = null ]]; then
|
||||
generate_id timeline_id
|
||||
PARAMS=(
|
||||
-sbf
|
||||
|
||||
@@ -142,7 +142,7 @@ services:
|
||||
- "storage_broker"
|
||||
- "--listen-addr=0.0.0.0:50051"
|
||||
|
||||
compute:
|
||||
compute1:
|
||||
restart: always
|
||||
build:
|
||||
context: ./compute_wrapper/
|
||||
@@ -152,6 +152,7 @@ services:
|
||||
- TAG=${COMPUTE_TAG:-${TAG:-latest}}
|
||||
- http_proxy=${http_proxy:-}
|
||||
- https_proxy=${https_proxy:-}
|
||||
image: built-compute
|
||||
environment:
|
||||
- PG_VERSION=${PG_VERSION:-16}
|
||||
- TENANT_ID=${TENANT_ID:-}
|
||||
@@ -166,6 +167,11 @@ services:
|
||||
- 3080:3080 # http endpoints
|
||||
entrypoint:
|
||||
- "/shell/compute.sh"
|
||||
# Ad an alias for compute1 for compatibility
|
||||
networks:
|
||||
default:
|
||||
aliases:
|
||||
- compute
|
||||
depends_on:
|
||||
- safekeeper1
|
||||
- safekeeper2
|
||||
@@ -174,15 +180,20 @@ services:
|
||||
|
||||
compute_is_ready:
|
||||
image: postgres:latest
|
||||
environment:
|
||||
- PARALLEL_COMPUTES=1
|
||||
entrypoint:
|
||||
- "/bin/bash"
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
command:
|
||||
- "until pg_isready -h compute -p 55433 -U cloud_admin ; do
|
||||
echo 'Waiting to start compute...' && sleep 1;
|
||||
done"
|
||||
- "for i in $(seq 1 $${PARALLEL_COMPUTES}); do
|
||||
until pg_isready -h compute$$i -p 55433 -U cloud_admin ; do
|
||||
sleep 1;
|
||||
done;
|
||||
done;
|
||||
echo All computes are started"
|
||||
depends_on:
|
||||
- compute
|
||||
- compute1
|
||||
|
||||
neon-test-extensions:
|
||||
profiles: ["test-extensions"]
|
||||
@@ -196,4 +207,4 @@ services:
|
||||
command:
|
||||
- sleep 3600
|
||||
depends_on:
|
||||
- compute
|
||||
- compute1
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# A basic test to ensure Docker images are built correctly.
|
||||
# Build a wrapper around the compute, start all services and runs a simple SQL query.
|
||||
@@ -13,9 +13,36 @@
|
||||
#
|
||||
set -eux -o pipefail
|
||||
|
||||
cd "$(dirname "${0}")"
|
||||
export COMPOSE_FILE='docker-compose.yml'
|
||||
export COMPOSE_PROFILES=test-extensions
|
||||
cd "$(dirname "${0}")"
|
||||
export PARALLEL_COMPUTES=${PARALLEL_COMPUTES:-1}
|
||||
READY_MESSAGE="All computes are started"
|
||||
COMPUTES=()
|
||||
for i in $(seq 1 "${PARALLEL_COMPUTES}"); do
|
||||
COMPUTES+=("compute${i}")
|
||||
done
|
||||
CURRENT_TMPDIR=$(mktemp -d)
|
||||
trap 'rm -rf ${CURRENT_TMPDIR} docker-compose-parallel.yml' EXIT
|
||||
if [[ ${PARALLEL_COMPUTES} -gt 1 ]]; then
|
||||
export COMPOSE_FILE=docker-compose-parallel.yml
|
||||
cp docker-compose.yml docker-compose-parallel.yml
|
||||
# Replace the environment variable PARALLEL_COMPUTES with the actual value
|
||||
yq eval -i ".services.compute_is_ready.environment |= map(select(. | test(\"^PARALLEL_COMPUTES=\") | not)) + [\"PARALLEL_COMPUTES=${PARALLEL_COMPUTES}\"]" ${COMPOSE_FILE}
|
||||
for i in $(seq 2 "${PARALLEL_COMPUTES}"); do
|
||||
# Duplicate compute1 as compute${i} for parallel execution
|
||||
yq eval -i ".services.compute${i} = .services.compute1" ${COMPOSE_FILE}
|
||||
# We don't need these sections, so delete them
|
||||
yq eval -i "(del .services.compute${i}.build) | (del .services.compute${i}.ports) | (del .services.compute${i}.networks)" ${COMPOSE_FILE}
|
||||
# Let the compute 1 be the only dependence
|
||||
yq eval -i ".services.compute${i}.depends_on = [\"compute1\"]" ${COMPOSE_FILE}
|
||||
# Set RUN_PARALLEL=true for compute2. They will generate tenant_id and timeline_id to avoid using the same as other computes
|
||||
yq eval -i ".services.compute${i}.environment += [\"RUN_PARALLEL=true\"]" ${COMPOSE_FILE}
|
||||
# Remove TENANT_ID and TIMELINE_ID from the environment variables of the generated computes
|
||||
# They will create new TENANT_ID and TIMELINE_ID anyway.
|
||||
yq eval -i ".services.compute${i}.environment |= map(select(. | (test(\"^TENANT_ID=\") or test(\"^TIMELINE_ID=\")) | not))" ${COMPOSE_FILE}
|
||||
done
|
||||
fi
|
||||
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"
|
||||
|
||||
function cleanup() {
|
||||
@@ -27,11 +54,11 @@ function cleanup() {
|
||||
|
||||
for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
|
||||
pg_version=${pg_version/v/}
|
||||
echo "clean up containers if exists"
|
||||
echo "clean up containers if exist"
|
||||
cleanup
|
||||
PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version))
|
||||
PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull --build -d
|
||||
|
||||
PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose build compute1
|
||||
PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull -d
|
||||
echo "wait until the compute is ready. timeout after 60s. "
|
||||
cnt=0
|
||||
while sleep 3; do
|
||||
@@ -41,45 +68,50 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
|
||||
echo "timeout before the compute is ready."
|
||||
exit 1
|
||||
fi
|
||||
if docker compose logs "compute_is_ready" | grep -q "accepting connections"; then
|
||||
if docker compose logs compute_is_ready | grep -q "${READY_MESSAGE}"; then
|
||||
echo "OK. The compute is ready to connect."
|
||||
echo "execute simple queries."
|
||||
docker compose exec compute /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'"
|
||||
for compute in "${COMPUTES[@]}"; do
|
||||
docker compose exec "${compute}" /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'"
|
||||
done
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [[ ${pg_version} -ge 16 ]]; then
|
||||
# This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
|
||||
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
|
||||
echo Adding dummy config
|
||||
docker compose exec compute touch /var/db/postgres/compute/compute_ctl_temp_override.conf
|
||||
# Prepare for the PostGIS test
|
||||
docker compose exec compute mkdir -p /tmp/pgis_reg/pgis_reg_tmp
|
||||
TMPDIR=$(mktemp -d)
|
||||
docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${TMPDIR}"
|
||||
docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${TMPDIR}"
|
||||
docker compose exec compute mkdir -p /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install
|
||||
docker compose cp "${TMPDIR}/test" compute:/ext-src/postgis-src/raster/test
|
||||
docker compose cp "${TMPDIR}/00-regress-install" compute:/ext-src/postgis-src/regress
|
||||
rm -rf "${TMPDIR}"
|
||||
# The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
|
||||
TMPDIR=$(mktemp -d)
|
||||
docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${TMPDIR}/data"
|
||||
docker compose cp "${TMPDIR}/data" compute:/ext-src/pg_hint_plan-src/
|
||||
rm -rf "${TMPDIR}"
|
||||
# The following block does the same for the contrib/file_fdw test
|
||||
TMPDIR=$(mktemp -d)
|
||||
docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${TMPDIR}/data"
|
||||
docker compose cp "${TMPDIR}/data" compute:/postgres/contrib/file_fdw/data
|
||||
rm -rf "${TMPDIR}"
|
||||
mkdir "${CURRENT_TMPDIR}"/{pg_hint_plan-src,file_fdw,postgis-src}
|
||||
docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${CURRENT_TMPDIR}/postgis-src/test"
|
||||
docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${CURRENT_TMPDIR}/postgis-src/00-regress-install"
|
||||
docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${CURRENT_TMPDIR}/pg_hint_plan-src/data"
|
||||
docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${CURRENT_TMPDIR}/file_fdw/data"
|
||||
|
||||
for compute in "${COMPUTES[@]}"; do
|
||||
# This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
|
||||
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
|
||||
echo Adding dummy config on "${compute}"
|
||||
docker compose exec "${compute}" touch /var/db/postgres/compute/compute_ctl_temp_override.conf
|
||||
# Prepare for the PostGIS test
|
||||
docker compose exec "${compute}" mkdir -p /tmp/pgis_reg/pgis_reg_tmp /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install
|
||||
docker compose cp "${CURRENT_TMPDIR}/postgis-src/test" "${compute}":/ext-src/postgis-src/raster/test
|
||||
docker compose cp "${CURRENT_TMPDIR}/postgis-src/00-regress-install" "${compute}":/ext-src/postgis-src/regress
|
||||
# The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
|
||||
docker compose cp "${CURRENT_TMPDIR}/pg_hint_plan-src/data" "${compute}":/ext-src/pg_hint_plan-src/
|
||||
# The following block does the same for the contrib/file_fdw test
|
||||
docker compose cp "${CURRENT_TMPDIR}/file_fdw/data" "${compute}":/postgres/contrib/file_fdw/data
|
||||
done
|
||||
# Apply patches
|
||||
docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch"
|
||||
# We are running tests now
|
||||
rm -f testout.txt testout_contrib.txt
|
||||
# We want to run the longest tests first to better utilize parallelization and reduce overall test time.
|
||||
# Tests listed in the RUN_FIRST variable will be run before others.
|
||||
# If parallelization is not used, this environment variable will be ignored.
|
||||
|
||||
docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
|
||||
-e RUN_FIRST=hll-src,postgis-src,pgtap-src -e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \
|
||||
neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
|
||||
docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
|
||||
-e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \
|
||||
neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
|
||||
if [[ ${EXT_SUCCESS} -eq 0 || ${CONTRIB_SUCCESS} -eq 0 ]]; then
|
||||
CONTRIB_FAILED=
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
set -x
|
||||
|
||||
if [[ -v BENCHMARK_CONNSTR ]]; then
|
||||
@@ -26,8 +26,9 @@ if [[ -v BENCHMARK_CONNSTR ]]; then
|
||||
fi
|
||||
fi
|
||||
REGULAR_USER=false
|
||||
while getopts r arg; do
|
||||
case $arg in
|
||||
PARALLEL_COMPUTES=${PARALLEL_COMPUTES:-1}
|
||||
while getopts pr arg; do
|
||||
case ${arg} in
|
||||
r)
|
||||
REGULAR_USER=true
|
||||
shift $((OPTIND-1))
|
||||
@@ -41,26 +42,49 @@ extdir=${1}
|
||||
|
||||
cd "${extdir}" || exit 2
|
||||
FAILED=
|
||||
LIST=$( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u)
|
||||
for d in ${LIST}; do
|
||||
[ -d "${d}" ] || continue
|
||||
if ! psql -w -c "select 1" >/dev/null; then
|
||||
FAILED="${d} ${FAILED}"
|
||||
break
|
||||
fi
|
||||
if [[ ${REGULAR_USER} = true ]] && [ -f "${d}"/regular-test.sh ]; then
|
||||
"${d}/regular-test.sh" || FAILED="${d} ${FAILED}"
|
||||
continue
|
||||
fi
|
||||
export FAILED_FILE=/tmp/failed
|
||||
rm -f ${FAILED_FILE}
|
||||
mapfile -t LIST < <( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u)
|
||||
if [[ ${PARALLEL_COMPUTES} -gt 1 ]]; then
|
||||
# Avoid errors if RUN_FIRST is not defined
|
||||
RUN_FIRST=${RUN_FIRST:-}
|
||||
# Move entries listed in the RUN_FIRST variable to the beginning
|
||||
ORDERED_LIST=$(printf "%s\n" "${LIST[@]}" | grep -x -Ff <(echo -e "${RUN_FIRST//,/$'\n'}"); printf "%s\n" "${LIST[@]}" | grep -vx -Ff <(echo -e "${RUN_FIRST//,/$'\n'}"))
|
||||
parallel -j"${PARALLEL_COMPUTES}" "[[ -d {} ]] || exit 0
|
||||
export PGHOST=compute{%}
|
||||
if ! psql -c 'select 1'>/dev/null; then
|
||||
exit 1
|
||||
fi
|
||||
echo Running on \${PGHOST}
|
||||
if [[ -f ${extdir}/{}/neon-test.sh ]]; then
|
||||
echo Running from script
|
||||
${extdir}/{}/neon-test.sh || echo {} >> ${FAILED_FILE};
|
||||
else
|
||||
echo Running using make;
|
||||
USE_PGXS=1 make -C {} installcheck || echo {} >> ${FAILED_FILE};
|
||||
fi" ::: ${ORDERED_LIST}
|
||||
[[ ! -f ${FAILED_FILE} ]] && exit 0
|
||||
else
|
||||
for d in "${LIST[@]}"; do
|
||||
[ -d "${d}" ] || continue
|
||||
if ! psql -w -c "select 1" >/dev/null; then
|
||||
FAILED="${d} ${FAILED}"
|
||||
break
|
||||
fi
|
||||
if [[ ${REGULAR_USER} = true ]] && [ -f "${d}"/regular-test.sh ]; then
|
||||
"${d}/regular-test.sh" || FAILED="${d} ${FAILED}"
|
||||
continue
|
||||
fi
|
||||
|
||||
if [ -f "${d}/neon-test.sh" ]; then
|
||||
"${d}/neon-test.sh" || FAILED="${d} ${FAILED}"
|
||||
else
|
||||
USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
|
||||
fi
|
||||
done
|
||||
[ -z "${FAILED}" ] && exit 0
|
||||
for d in ${FAILED}; do
|
||||
if [ -f "${d}/neon-test.sh" ]; then
|
||||
"${d}/neon-test.sh" || FAILED="${d} ${FAILED}"
|
||||
else
|
||||
USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
|
||||
fi
|
||||
done
|
||||
[[ -z ${FAILED} ]] && exit 0
|
||||
fi
|
||||
for d in ${FAILED} $([[ ! -f ${FAILED_FILE} ]] || cat ${FAILED_FILE}); do
|
||||
cat "$(find $d -name regression.diffs)"
|
||||
done
|
||||
for postgis_diff in /tmp/pgis_reg/*_diff; do
|
||||
@@ -68,4 +92,5 @@ for postgis_diff in /tmp/pgis_reg/*_diff; do
|
||||
cat "${postgis_diff}"
|
||||
done
|
||||
echo "${FAILED}"
|
||||
cat ${FAILED_FILE}
|
||||
exit 1
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
set -eux -o pipefail
|
||||
cd "$(dirname "${0}")"
|
||||
# Takes a variable name as argument. The result is stored in that variable.
|
||||
@@ -60,8 +60,8 @@ function check_timeline() {
|
||||
# Restarts the compute node with the required compute tag and timeline.
|
||||
# Accepts the tag for the compute node and the timeline as parameters.
|
||||
function restart_compute() {
|
||||
docker compose down compute compute_is_ready
|
||||
COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready
|
||||
docker compose down compute1 compute_is_ready
|
||||
COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute1 compute_is_ready
|
||||
wait_for_ready
|
||||
check_timeline ${2}
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@ In our case consensus leader is compute (walproposer), and we don't want to wake
up all computes for the change. Neither we want to fully reimplement the leader
logic second time outside compute. Because of that the proposed algorithm relies
for issuing configurations on the external fault tolerant (distributed) strongly
consisent storage with simple API: CAS (compare-and-swap) on the single key.
consistent storage with simple API: CAS (compare-and-swap) on the single key.
Properly configured postgres suits this.

In the system consensus is implemented at the timeline level, so algorithm below
@@ -34,7 +34,7 @@ A configuration is

```
struct Configuration {
    generation: Generation, // a number uniquely identifying configuration
    generation: SafekeeperGeneration, // a number uniquely identifying configuration
    sk_set: Vec<NodeId>, // current safekeeper set
    new_sk_set: Optional<Vec<NodeId>>,
}
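To make the CAS requirement concrete, here is a minimal sketch of the storage contract the algorithm assumes; the trait, key shape and error type are illustrative, not the actual storage_controller API. A CAS conflict simply means another actor advanced the generation first, so the caller re-reads and either retries or aborts.

```rust
/// Illustrative sketch of the compare-and-swap contract the RFC relies on.
#[derive(Clone, PartialEq)]
struct Configuration {
    generation: u32,
    sk_set: Vec<u64>,
    new_sk_set: Option<Vec<u64>>,
}

enum CasError {
    /// Someone else advanced the generation first; re-read and retry or abort.
    Conflict { current: Configuration },
}

trait ConfigStore {
    /// Returns the current configuration stored under the timeline key.
    fn get(&self, key: &str) -> Option<Configuration>;
    /// Writes `new` only if the stored generation still equals `expected_generation`.
    fn cas(&mut self, key: &str, expected_generation: u32, new: Configuration) -> Result<(), CasError>;
}
```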
@@ -81,11 +81,11 @@ configuration generation in them is less than its current one. Namely, it
refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
response it sends its current configuration generation to let walproposer know.

Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
accepting `Configuration`. Safekeeper switches to the given conf it is higher than its
Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/membership`
accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its
current one and ignores it otherwise. In any case it replies with
```
struct ConfigurationSwitchResponse {
struct TimelineMembershipSwitchResponse {
    conf: Configuration,
    term: Term,
    last_log_term: Term,
@@ -108,7 +108,7 @@ establishes this configuration as its own and moves to voting.
It should stop talking to safekeepers not listed in the configuration at this
point, though it is not unsafe to continue doing so.

To be elected it must receive votes from both majorites if `new_sk_set` is present.
To be elected it must receive votes from both majorities if `new_sk_set` is present.
Similarly, to commit WAL it must receive flush acknowledge from both majorities.

If walproposer hears from safekeeper configuration higher than his own (i.e.
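The "both majorities" rule is the heart of the joint-consensus phase; a small sketch of the check, with illustrative types and names rather than actual walproposer code:

```rust
/// Under a joint configuration the walproposer needs acks from a majority of
/// `sk_set` AND a majority of `new_sk_set` (when the latter is present).
fn have_joint_quorum(acked: &[u64], sk_set: &[u64], new_sk_set: Option<&[u64]>) -> bool {
    fn majority(acked: &[u64], set: &[u64]) -> bool {
        let votes = set.iter().filter(|id| acked.contains(*id)).count();
        votes > set.len() / 2
    }
    majority(acked, sk_set) && new_sk_set.map_or(true, |new| majority(acked, new))
}
```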
@@ -130,7 +130,7 @@ storage are reachable.
1) Fetch current timeline configuration from the configuration storage.
2) If it is already joint one and `new_set` is different from `desired_set`
   refuse to change. However, assign join conf to (in memory) var
   `join_conf` and proceed to step 4 to finish the ongoing change.
   `joint_conf` and proceed to step 4 to finish the ongoing change.
3) Else, create joint `joint_conf: Configuration`: increment current conf number
   `n` and put `desired_set` to `new_sk_set`. Persist it in the configuration
   storage by doing CAS on the current generation: change happens only if
@@ -161,11 +161,11 @@ storage are reachable.
   because `pull_timeline` already includes it and plus additionally would be
   broadcast by compute. More importantly, we may proceed to the next step
   only when `<last_log_term, flush_lsn>` on the majority of the new set reached
   `sync_position`. Similarly, on the happy path no waiting is not needed because
   `sync_position`. Similarly, on the happy path no waiting is needed because
   `pull_timeline` already includes it. However, we should double
   check to be safe. For example, timeline could have been created earlier e.g.
   manually or after try-to-migrate, abort, try-to-migrate-again sequence.
7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new
7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
   safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
   storage under one more CAS.
8) Call `PUT` `configuration` on safekeepers from the new set,
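As a concrete reading of the condition above, a majority of the new set must reach `sync_position` before the procedure may leave the joint configuration; a simplified sketch, with types flattened for illustration:

```rust
/// (last_log_term, flush_lsn) collected from the old majority; tuple ordering
/// compares the term first, then the LSN.
type Position = (u64, u64);

/// True once a majority of the new set has caught up to `sync_position`.
fn new_set_caught_up(new_set_positions: &[Position], sync_position: Position) -> bool {
    let caught_up = new_set_positions
        .iter()
        .filter(|pos| **pos >= sync_position)
        .count();
    caught_up > new_set_positions.len() / 2
}
```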
@@ -178,12 +178,12 @@ spec of it.

Description above focuses on safety. To make the flow practical and live, here a few more
considerations.
1) It makes sense to ping new set to ensure it we are migrating to live node(s) before
1) It makes sense to ping new set to ensure we are migrating to live node(s) before
   step 3.
2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed
   it is safe to rollback to the old conf with one more CAS.
3) On step 4 timeline might be already created on members of the new set for various reasons;
   the simplest is the procedure restart. There are more complicated scenarious like mentioned
   the simplest is the procedure restart. There are more complicated scenarios like mentioned
   in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
   generations, so seems simpler to treat existing timeline as success. However, this also
   has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in
@@ -192,7 +192,7 @@ considerations.
4) In the end timeline should be locally deleted on the safekeeper(s) which are
   in the old set but not in the new one, unless they are unreachable. To be
   safe this also should be done under generation number (deletion proceeds only if
   current configuration is <= than one in request and safekeeper is not memeber of it).
   current configuration is <= than one in request and safekeeper is not member of it).
5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
   jump to step 7, using it as `new_conf`.
@@ -261,14 +261,14 @@ Timeline (branch) creation in cplane should call storage_controller POST
Response should be augmented with `safekeepers_generation` and `safekeepers`
fields like described in `/notify-safekeepers` above. Initially (currently)
these fields may be absent; in this case cplane chooses safekeepers on its own
like it currently does. The call should be retried until succeeds.
like it currently does. The call should be retried until it succeeds.

Timeline deletion and tenant deletion in cplane should call appropriate
storage_controller endpoints like it currently does for sharded tenants. The
calls should be retried until they succeed.

When compute receives safekeepers list from control plane it needs to know the
generation to checked whether it should be updated (note that compute may get
When compute receives safekeeper list from control plane it needs to know the
generation to check whether it should be updated (note that compute may get
safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers`
GUC is just a comma separates list of `host:port`. Let's prefix it with
`g#<generation>:` to this end, so it will look like
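A small sketch of how a compute-side parser could handle the proposed prefix; the helper is hypothetical, not the actual compute_ctl code, and it falls back to the old un-prefixed format:

```rust
/// Parse the proposed `neon.safekeepers` format, e.g.
/// "g#5:sk-1:6401,sk-2:6401,sk-3:6401". Returns the generation (if any)
/// and the host:port list.
fn parse_safekeepers_guc(value: &str) -> (Option<u32>, Vec<String>) {
    if let Some(rest) = value.strip_prefix("g#") {
        if let Some((generation, hosts)) = rest.split_once(':') {
            if let Ok(generation) = generation.parse::<u32>() {
                return (
                    Some(generation),
                    hosts.split(',').map(|s| s.to_string()).collect(),
                );
            }
        }
    }
    // Old format: a plain comma-separated list without a generation prefix.
    (None, value.split(',').map(|s| s.to_string()).collect())
}
```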
@@ -305,8 +305,8 @@ enum MigrationRequest {
```

`FinishPending` requests to run the procedure to ensure state is clean: current
configuration is not joint and majority of safekeepers are aware of it, but do
not attempt to migrate anywhere. If current configuration fetched on step 1 is
configuration is not joint and the majority of safekeepers are aware of it, but do
not attempt to migrate anywhere. If the current configuration fetched on step 1 is
not joint it jumps to step 7. It should be run at startup for all timelines (but
similarly, in the first version it is ok to trigger it manually).
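A sketch of the request shape this paragraph describes; only `FinishPending` is named in the text shown here, so the second variant's name and payload are assumptions:

```rust
/// Illustrative shape of a migration request handled by storage_controller.
enum MigrationRequest {
    /// Ensure the current configuration is not joint and is known to a
    /// majority, without migrating anywhere.
    FinishPending,
    /// Migrate the timeline to the given desired safekeeper set (hypothetical variant).
    MigrateTo { desired_set: Vec<u64> },
}
```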
@@ -315,7 +315,7 @@ similarly, in the first version it is ok to trigger it manually).
`safekeepers` table mirroring current `nodes` should be added, except that for
`scheduling_policy`: it is enough to have at least in the beginning only 3
fields: 1) `active` 2) `paused` (initially means only not assign new tlis there
3) `decomissioned` (node is removed).
3) `decommissioned` (node is removed).

`timelines` table:
```
@@ -326,9 +326,10 @@ table! {
        tenant_id -> Varchar,
        start_lsn -> pg_lsn,
        generation -> Int4,
        sk_set -> Array<Int4>, // list of safekeeper ids
        sk_set -> Array<Int8>, // list of safekeeper ids
        new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf
        cplane_notified_generation -> Int4,
        sk_set_notified_generation -> Int4, // the generation a quorum of sk_set knows about
        deleted_at -> Nullable<Timestamptz>,
    }
}
@@ -338,13 +339,23 @@ table! {
might also want to add ancestor_timeline_id to preserve the hierarchy, but for
this RFC it is not needed.

`cplane_notified_generation` and `sk_set_notified_generation` fields are used to
track the last stage of the algorithm, when we need to notify safekeeper set and cplane
with the final configuration after it's already committed to DB.

The timeline is up-to-date (no migration in progress) if `new_sk_set` is null and
`*_notified_generation` fields are up to date with `generation`.

It's possible to replace `*_notified_generation` with one boolean field `migration_completed`,
but for better observability it's nice to have them separately.
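For clarity, the "no migration in progress" condition above can be read as the following sketch; field names mirror the schema, but the struct and function are illustrative:

```rust
/// Minimal in-memory mirror of the `timelines` row fields used by the check.
struct TimelineRow {
    generation: i32,
    new_sk_set: Option<Vec<i64>>,
    cplane_notified_generation: i32,
    sk_set_notified_generation: i32,
}

/// A migration is still in progress if the conf is joint or either notification
/// generation lags behind the committed generation.
fn migration_in_progress(t: &TimelineRow) -> bool {
    t.new_sk_set.is_some()
        || t.cplane_notified_generation != t.generation
        || t.sk_set_notified_generation != t.generation
}
```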
#### API

Node management is similar to pageserver:
1) POST `/control/v1/safekeepers` inserts safekeeper.
2) GET `/control/v1/safekeepers` lists safekeepers.
3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g.
1) POST `/control/v1/safekeeper` inserts safekeeper.
2) GET `/control/v1/safekeeper` lists safekeepers.
3) GET `/control/v1/safekeeper/:node_id` gets safekeeper.
4) PUT `/control/v1/safekeper/:node_id/scheduling_policy` changes status to e.g.
   `offline` or `decomissioned`. Initially it is simpler not to schedule any
   migrations here.
@@ -368,8 +379,8 @@ Migration API: the first version is the simplest and the most imperative:
   all timelines from one safekeeper to another. It accepts json
```
{
    "src_sk": u32,
    "dst_sk": u32,
    "src_sk": NodeId,
    "dst_sk": NodeId,
    "limit": Optional<u32>,
}
```
@@ -379,12 +390,15 @@ Returns list of scheduled requests.
2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest`
   to move single timeline to given set of safekeepers:
```
{
    "desired_set": Vec<u32>,
struct TimelineSafekeeperMigrateRequest {
    "new_sk_set": Vec<NodeId>,
}
```

Returns scheduled request.
In the first version the handler migrates the timeline to `new_sk_set` synchronously.
Should be retried until success.

In the future we might change it to asynchronous API and return scheduled request.

Similar call should be added for the tenant.
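Since the first version is synchronous and idempotent, a caller can simply retry the PUT until it succeeds; a hedged sketch of such a caller, where the base URL, retry policy and types are assumptions:

```rust
/// Hypothetical caller of the per-timeline migration endpoint described above.
async fn migrate_timeline(
    client: &reqwest::Client,
    base: &str,
    tenant_id: &str,
    timeline_id: &str,
    new_sk_set: &[u64],
) -> anyhow::Result<()> {
    let url =
        format!("{base}/control/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate");
    let body = serde_json::json!({ "new_sk_set": new_sk_set });
    // The handler migrates synchronously and is safe to retry until success.
    loop {
        match client.put(&url).json(&body).send().await {
            Ok(resp) if resp.status().is_success() => return Ok(()),
            Ok(resp) => tracing::warn!("migration returned {}", resp.status()),
            Err(err) => tracing::warn!("migration request failed: {err}"),
        }
        tokio::time::sleep(std::time::Duration::from_secs(1)).await;
    }
}
```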
@@ -434,6 +448,9 @@ table! {
}
```

We load all pending ops from the table on startup into the memory.
The table is needed only to preserve the state between restarts.

`op_type` can be `include` (seed from peers and ensure generation is up to
date), `exclude` (remove locally) and `delete`. Field is actually not strictly
needed as it can be computed from current configuration, but gives more explicit
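The three op kinds read naturally as an enum; the names follow the text above, but the type itself is only an illustrative sketch:

```rust
/// Illustrative shape of `op_type` in `safekeeper_timeline_pending_ops`.
enum PendingOpType {
    /// Seed the timeline from peers and make sure its generation is up to date.
    Include,
    /// Remove the timeline locally from a safekeeper that left the set.
    Exclude,
    /// Delete the timeline.
    Delete,
}
```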
@@ -474,7 +491,7 @@ actions must be idempotent. Now, a tricky point here is timeline start LSN. For
the initial (tenant creation) call cplane doesn't know it. However, setting
start_lsn on safekeepers during creation is a good thing -- it provides a
guarantee that walproposer can always find a common point in WAL histories of
safekeeper and its own, and so absense of it would be a clear sign of
safekeeper and its own, and so absence of it would be a clear sign of
corruption. The following sequence works:
1) Create timeline (or observe that it exists) on pageserver,
   figuring out last_record_lsn in response.
@@ -497,11 +514,9 @@ corruption. The following sequence works:
   retries the call until 200 response.

There is a small question how request handler (timeline creation in this
case) would interact with per sk reconciler. As always I prefer to do the
simplest possible thing and here it seems to be just waking it up so it
re-reads the db for work to do. Passing work in memory is faster, but
that shouldn't matter, and path to scan db for work will exist anyway,
simpler to reuse it.
case) would interact with per sk reconciler. In the current implementation
we first persist the request in the DB, and then send an in-memory request
to each safekeeper reconciler to process it.

For pg version / wal segment size: while we may persist them in `timelines`
table, it is not necessary as initial creation at step 3 can take them from
@@ -509,30 +524,40 @@ pageserver or cplane creation call and later pull_timeline will carry them
around.

Timeline migration.
1) CAS to the db to create joint conf, and in the same transaction create
   `safekeeper_timeline_pending_ops` `include` entries to initialize new members
   as well as deliver this conf to current ones; poke per sk reconcilers to work
   on it. Also any conf change should also poke cplane notifier task(s).
2) Once it becomes possible per alg description above, get out of joint conf
   with another CAS. Task should get wakeups from per sk reconcilers because
   conf switch is required for advancement; however retries should be sleep
   based as well as LSN advancement might be needed, though in happy path
   it isn't. To see whether further transition is possible on wakup migration
   executor polls safekeepers per the algorithm. CAS creating new conf with only
   new members should again insert entries to `safekeeper_timeline_pending_ops`
   to switch them there, as well as `exclude` rows to remove timeline from
   old members.
1) CAS to the db to create joint conf. Since this moment the migration is considered to be
   "in progress". We can detect all "in-progress" migrations looking into the database.
2) Do steps 4-6 from the algorithm, including `pull_timeline` onto `new_sk_set`, update membership
   configuration on all safekeepers, notify cplane, etc. All operations are idempotent,
   so we don't need to persist anything in the database at this stage. If any errors occur,
   it's safe to retry or abort the migration.
3) Once it becomes possible per alg description above, get out of joint conf
   with another CAS. Also should insert `exclude` entries into `safekeeper_timeline_pending_ops`
   in the same DB transaction. Adding `exclude` entries atomically is necessary because after
   CAS we don't have the list of excluded safekeepers in the `timelines` table anymore, but we
   need to have them persisted somewhere in case the migration is interrupted right after the CAS.
4) Finish the migration. The final membership configuration is committed to the DB at this stage.
   So, the migration can not be aborted anymore. But it can still be retried if the migration fails
   past stage 3. To finish the migration we need to send the new membership configuration to
   a new quorum of safekeepers, notify cplane with the new safekeeper list and schedule the `exclude`
   requests to in-memory queue for safekeeper reconciler. If the algorithm is retried, it's
   possible that we have already committed `exclude` requests to DB, but didn't send them to
   the in-memory queue. In this case we need to read them from `safekeeper_timeline_pending_ops`
   because it's the only place where they are persistent. The fields `sk_set_notified_generation`
   and `cplane_notified_generation` are updated after each step. The migration is considered
   fully completed when they match the `generation` field.

In practice, we can report "success" after stage 3 and do the "finish" step in per-timeline
reconciler (if we implement it). But it's wise to at least try to finish them synchronously,
so the timeline is always in a "good state" and doesn't require an old quorum to commit
WAL after the migration reported "success".

Timeline deletion: just set `deleted_at` on the timeline row and insert
`safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by
per sk reconcilers.

When node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops`
When node is removed (set to `decommissioned`), `safekeeper_timeline_pending_ops`
for it must be cleared in the same transaction.

One more task pool should infinitely retry notifying control plane about changed
safekeeper sets (trying making `cplane_notified_generation` equal `generation`).

#### Dealing with multiple instances of storage_controller

Operations described above executed concurrently might create some errors but do
@@ -541,7 +566,7 @@ of storage_controller it is fine to have it temporarily, e.g. during redeploy.

To harden against some controller instance creating some work in
`safekeeper_timeline_pending_ops` and then disappearing without anyone pickup up
the job per sk reconcilers apart from explicit wakups should scan for work
the job per sk reconcilers apart from explicit wakeups should scan for work
periodically. It is possible to remove that though if all db updates are
protected with leadership token/term -- then such scans are needed only after
leadership is acquired.
@@ -563,7 +588,7 @@ There should be following layers of tests:
   safekeeper communication and pull_timeline need to be mocked and main switch
   procedure wrapped to as a node (thread) in simulation tests, using these
   mocks. Test would inject migrations like it currently injects
   safekeeper/walproposer restars. Main assert is the same -- committed WAL must
   safekeeper/walproposer restarts. Main assert is the same -- committed WAL must
   not be lost.

3) Since simulation testing injects at relatively high level points (not
@@ -613,7 +638,7 @@ Let's have the following implementation bits for gradual rollout:
   `notify-safekeepers`.

Then the rollout for a region would be:
- Current situation: safekeepers are choosen by control_plane.
- Current situation: safekeepers are chosen by control_plane.
- We manually migrate some timelines, test moving them around.
- Then we enable `--set-safekeepers` so that all new timelines
  are on storage controller.
@@ -13,6 +13,8 @@ use utils::backoff::retry;
|
||||
pub fn app(state: Arc<Storage>) -> Router<()> {
|
||||
use axum::routing::{delete as _delete, get as _get};
|
||||
let delete_prefix = _delete(delete_prefix);
|
||||
// NB: On any changes do not forget to update the OpenAPI spec
|
||||
// in /endpoint_storage/src/openapi_spec.yml.
|
||||
Router::new()
|
||||
.route(
|
||||
"/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}",
|
||||
|
||||
endpoint_storage/src/openapi_spec.yml (new file, 146 lines)
@@ -0,0 +1,146 @@
|
||||
openapi: "3.0.2"
|
||||
info:
|
||||
title: Endpoint Storage API
|
||||
description: Endpoint Storage API
|
||||
version: "1.0"
|
||||
license:
|
||||
name: "Apache"
|
||||
url: https://github.com/neondatabase/neon/blob/main/LICENSE
|
||||
servers:
|
||||
- url: ""
|
||||
paths:
|
||||
/status:
|
||||
description: Healthcheck endpoint
|
||||
get:
|
||||
description: Healthcheck
|
||||
security: []
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
|
||||
/{tenant_id}/{timeline_id}/{endpoint_id}/{key}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: endpoint_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: key
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
get:
|
||||
description: Get file from blob storage
|
||||
responses:
|
||||
"200":
|
||||
description: "File stream from blob storage"
|
||||
content:
|
||||
application/octet-stream:
|
||||
schema:
|
||||
type: string
|
||||
format: binary
|
||||
"400":
|
||||
description: File was not found
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
put:
|
||||
description: Insert file into blob storage. If file exists, override it
|
||||
requestBody:
|
||||
content:
|
||||
application/octet-stream:
|
||||
schema:
|
||||
type: string
|
||||
format: binary
|
||||
responses:
|
||||
"200":
|
||||
description: File was inserted successfully
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
delete:
|
||||
description: Delete file from blob storage
|
||||
responses:
|
||||
"200":
|
||||
description: File was successfully deleted or not found
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
|
||||
/{tenant_id}/{timeline_id}/{endpoint_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: endpoint_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
delete:
|
||||
description: Delete endpoint data from blob storage
|
||||
responses:
|
||||
"200":
|
||||
description: Endpoint data was deleted
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
|
||||
/{tenant_id}/{timeline_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
delete:
|
||||
description: Delete timeline data from blob storage
|
||||
responses:
|
||||
"200":
|
||||
description: Timeline data was deleted
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
|
||||
/{tenant_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
delete:
|
||||
description: Delete tenant data from blob storage
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant data was deleted
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
type: http
|
||||
scheme: bearer
|
||||
bearerFormat: JWT
|
||||
|
||||
security:
|
||||
- JWT: []
|
||||
@@ -46,7 +46,7 @@ pub struct ExtensionInstallResponse {
|
||||
pub version: ExtVersion,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default, Debug, Clone)]
|
||||
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
pub enum LfcPrewarmState {
|
||||
#[default]
|
||||
@@ -58,7 +58,18 @@ pub enum LfcPrewarmState {
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default, Debug, Clone)]
|
||||
impl Display for LfcPrewarmState {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
|
||||
LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
|
||||
LfcPrewarmState::Completed => f.write_str("Completed"),
|
||||
LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
pub enum LfcOffloadState {
|
||||
#[default]
|
||||
@@ -70,6 +81,23 @@ pub enum LfcOffloadState {
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, Clone, PartialEq)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
/// Response of /promote
|
||||
pub enum PromoteState {
|
||||
NotPromoted,
|
||||
Completed,
|
||||
Failed { error: String },
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Default, Debug, Clone)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
/// Result of /safekeepers_lsn
|
||||
pub struct SafekeepersLsn {
|
||||
pub safekeepers: String,
|
||||
pub wal_flush_lsn: utils::lsn::Lsn,
|
||||
}
|
||||
|
||||
/// Response of the /status API
|
||||
#[derive(Serialize, Debug, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
|
||||
@@ -181,10 +181,14 @@ pub struct ComputeSpec {
|
||||
/// JWT for authorizing requests to endpoint storage service
|
||||
pub endpoint_storage_token: Option<String>,
|
||||
|
||||
/// Download LFC state from endpoint_storage and pass it to Postgres on startup
|
||||
#[serde(default)]
|
||||
/// Download LFC state from endpoint storage and pass it to Postgres on compute startup
|
||||
pub autoprewarm: bool,
|
||||
|
||||
#[serde(default)]
|
||||
/// Upload LFC state to endpoint storage periodically. Default value (None) means "don't upload"
|
||||
pub offload_lfc_interval_seconds: Option<std::num::NonZeroU64>,
|
||||
|
||||
/// Suspend timeout in seconds.
|
||||
///
|
||||
/// We use this value to derive other values, such as the installed extensions metric.
|
||||
@@ -438,7 +442,7 @@ pub struct JwksSettings {
|
||||
}
|
||||
|
||||
/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
|
||||
pub enum PageserverProtocol {
|
||||
/// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
|
||||
#[default]
|
||||
|
||||
@@ -90,6 +90,11 @@
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "offload_lfc_interval_seconds",
|
||||
"value": "20",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "neon.safekeepers",
|
||||
"value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501",
|
||||
|
||||
@@ -7,7 +7,7 @@ use anyhow::{Context, anyhow};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName};
|
||||
use hyper::http::HeaderValue;
|
||||
use hyper::{Body, Method, Request, Response};
|
||||
use hyper::{Body, Request, Response};
|
||||
use jsonwebtoken::TokenData;
|
||||
use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter};
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -18,8 +18,9 @@ use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
|
||||
use tokio::sync::{Mutex, Notify, mpsc};
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::{Instrument, debug, info, info_span, warn};
|
||||
use tracing::{Instrument, info, info_span, warn};
|
||||
use utils::auth::{AuthError, Claims, SwappableJwtAuth};
|
||||
use utils::metrics_collector::{METRICS_COLLECTOR, METRICS_STALE_MILLIS};
|
||||
|
||||
use crate::error::{ApiError, api_error_handler, route_error_handler};
|
||||
use crate::request::{get_query_param, parse_query_param};
|
||||
@@ -80,14 +81,10 @@ where
|
||||
let path = request.uri().path();
|
||||
let request_span = info_span!("request", %method, %path, %request_id);
|
||||
|
||||
let log_quietly = method == Method::GET;
|
||||
async move {
|
||||
let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
|
||||
if log_quietly {
|
||||
debug!("Handling request");
|
||||
} else {
|
||||
info!("Handling request");
|
||||
}
|
||||
|
||||
info!("Handling request");
|
||||
|
||||
// No special handling for panics here. There's a `tracing_panic_hook` from another
|
||||
// module to do that globally.
|
||||
@@ -108,11 +105,7 @@ where
|
||||
match res {
|
||||
Ok(response) => {
|
||||
let response_status = response.status();
|
||||
if log_quietly && response_status.is_success() {
|
||||
debug!("Request handled, status: {response_status}");
|
||||
} else {
|
||||
info!("Request handled, status: {response_status}");
|
||||
}
|
||||
info!("Request handled, status: {response_status}");
|
||||
Ok(response)
|
||||
}
|
||||
Err(err) => Ok(api_error_handler(err)),
|
||||
@@ -250,9 +243,28 @@ impl std::io::Write for ChannelWriter {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
pub async fn prometheus_metrics_handler(
|
||||
req: Request<Body>,
|
||||
force_metric_collection_on_scrape: bool,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
SERVE_METRICS_COUNT.inc();
|
||||
|
||||
// HADRON
|
||||
let requested_use_latest = parse_query_param(&req, "use_latest")?;
|
||||
|
||||
let use_latest = match requested_use_latest {
|
||||
None => force_metric_collection_on_scrape,
|
||||
Some(true) => true,
|
||||
Some(false) => {
|
||||
if force_metric_collection_on_scrape {
|
||||
// We don't cache in this case
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
};
|
||||
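The `use_latest` match above reduces to a simple rule: collect fresh metrics when the scrape asks for it or when the server is configured to always force collection. A standalone sketch of the same decision table (names are illustrative):

```rust
/// Equivalent to the match above: fresh collection happens when either the
/// query parameter requests it or the server forces it on every scrape.
fn resolve_use_latest(requested: Option<bool>, force_on_scrape: bool) -> bool {
    force_on_scrape || requested == Some(true)
}

#[test]
fn use_latest_decision_table() {
    assert!(resolve_use_latest(None, true));
    assert!(!resolve_use_latest(None, false));
    assert!(resolve_use_latest(Some(true), false));
    assert!(resolve_use_latest(Some(false), true)); // forced: results are not cached
    assert!(!resolve_use_latest(Some(false), false));
}
```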
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
let (tx, rx) = mpsc::channel(1);
|
||||
@@ -277,12 +289,18 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
|
||||
|
||||
let _span = span.entered();
|
||||
|
||||
let metrics = metrics::gather();
|
||||
// HADRON
|
||||
let collected = if use_latest {
|
||||
// Skip caching the results if we always force metric collection on scrape.
|
||||
METRICS_COLLECTOR.run_once(!force_metric_collection_on_scrape)
|
||||
} else {
|
||||
METRICS_COLLECTOR.last_collected()
|
||||
};
|
||||
|
||||
let gathered_at = std::time::Instant::now();
|
||||
|
||||
let res = encoder
|
||||
.encode(&metrics, &mut writer)
|
||||
.encode(&collected.metrics, &mut writer)
|
||||
.and_then(|_| writer.flush().map_err(|e| e.into()));
|
||||
|
||||
// this instant is not when we finally got the full response sent, sending is done by hyper
|
||||
@@ -295,6 +313,10 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
|
||||
let encoded_in = encoded_at - gathered_at - writer.wait_time();
|
||||
let total = encoded_at - started_at;
|
||||
|
||||
// HADRON
|
||||
let staleness_ms = (encoded_at - collected.collected_at).as_millis();
|
||||
METRICS_STALE_MILLIS.set(staleness_ms as i64);
|
||||
|
||||
match res {
|
||||
Ok(()) => {
|
||||
tracing::info!(
|
||||
@@ -303,6 +325,7 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
|
||||
spawning_ms = spawned_in.as_millis(),
|
||||
collection_ms = collected_in.as_millis(),
|
||||
encoding_ms = encoded_in.as_millis(),
|
||||
staleness_ms = staleness_ms,
|
||||
"responded /metrics"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -41,17 +41,35 @@ pub fn get_query_param<'a>(
|
||||
Some(q) => q,
|
||||
None => return Ok(None),
|
||||
};
|
||||
let mut values = url::form_urlencoded::parse(query.as_bytes())
|
||||
let values = url::form_urlencoded::parse(query.as_bytes())
|
||||
.filter_map(|(k, v)| if k == param_name { Some(v) } else { None })
|
||||
// we call .next() twice below. If it's None the first time, .fuse() ensures it's None afterwards
|
||||
.fuse();
|
||||
|
||||
let value1 = values.next();
|
||||
if values.next().is_some() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"param {param_name} specified more than once"
|
||||
)));
|
||||
}
|
||||
// Work around an issue with Alloy's pyroscope scrape where the "seconds"
|
||||
// parameter is added several times. https://github.com/grafana/alloy/issues/3026
|
||||
// TODO: revert after Alloy is fixed.
|
||||
let value1 = values
|
||||
.map(Ok)
|
||||
.reduce(|acc, i| {
|
||||
match acc {
|
||||
Err(_) => acc,
|
||||
|
||||
// It's okay to have duplicates as long as they have the same value.
|
||||
Ok(ref a) if a == &i.unwrap() => acc,
|
||||
|
||||
_ => Err(ApiError::BadRequest(anyhow!(
|
||||
"param {param_name} specified more than once"
|
||||
))),
|
||||
}
|
||||
})
|
||||
.transpose()?;
|
||||
// if values.next().is_some() {
|
||||
// return Err(ApiError::BadRequest(anyhow!(
|
||||
// "param {param_name} specified more than once"
|
||||
// )));
|
||||
// }
|
||||
|
||||
Ok(value1)
|
||||
}
|
||||
|
||||
@@ -92,3 +110,39 @@ pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError>
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_get_query_param_duplicate() {
|
||||
let req = Request::builder()
|
||||
.uri("http://localhost:12345/testuri?testparam=1")
|
||||
.body(hyper::Body::empty())
|
||||
.unwrap();
|
||||
let value = get_query_param(&req, "testparam").unwrap();
|
||||
assert_eq!(value.unwrap(), "1");
|
||||
|
||||
let req = Request::builder()
|
||||
.uri("http://localhost:12345/testuri?testparam=1&testparam=1")
|
||||
.body(hyper::Body::empty())
|
||||
.unwrap();
|
||||
let value = get_query_param(&req, "testparam").unwrap();
|
||||
assert_eq!(value.unwrap(), "1");
|
||||
|
||||
let req = Request::builder()
|
||||
.uri("http://localhost:12345/testuri")
|
||||
.body(hyper::Body::empty())
|
||||
.unwrap();
|
||||
let value = get_query_param(&req, "testparam").unwrap();
|
||||
assert!(value.is_none());
|
||||
|
||||
let req = Request::builder()
|
||||
.uri("http://localhost:12345/testuri?testparam=1&testparam=2&testparam=3")
|
||||
.body(hyper::Body::empty())
|
||||
.unwrap();
|
||||
let value = get_query_param(&req, "testparam");
|
||||
assert!(value.is_err());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ mod tests;
|
||||
|
||||
use const_format::formatcp;
|
||||
use posthog_client_lite::PostHogClientConfig;
|
||||
use utils::serde_percent::Percent;
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||
@@ -223,8 +224,9 @@ pub struct ConfigToml {
|
||||
pub metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub synthetic_size_calculation_interval: Duration,
|
||||
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
|
||||
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
|
||||
pub test_remote_failures: u64,
|
||||
pub test_remote_failures_probability: u64,
|
||||
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub background_task_maximum_delay: Duration,
|
||||
@@ -270,9 +272,13 @@ pub struct ConfigToml {
|
||||
pub timeline_import_config: TimelineImportConfig,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub basebackup_cache_config: Option<BasebackupCacheConfig>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_layer_generation_large_timeline_threshold: Option<u64>,
|
||||
pub force_metric_collection_on_scrape: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(default)]
|
||||
pub struct DiskUsageEvictionTaskConfig {
|
||||
pub max_usage_pct: utils::serde_percent::Percent,
|
||||
pub min_avail_bytes: u64,
|
||||
@@ -283,6 +289,21 @@ pub struct DiskUsageEvictionTaskConfig {
|
||||
/// Select sorting for evicted layers
|
||||
#[serde(default)]
|
||||
pub eviction_order: EvictionOrder,
|
||||
pub enabled: bool,
|
||||
}
|
||||
|
||||
impl Default for DiskUsageEvictionTaskConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_usage_pct: Percent::new(80).unwrap(),
|
||||
min_avail_bytes: 2_000_000_000,
|
||||
period: Duration::from_secs(60),
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: EvictionOrder::default(),
|
||||
enabled: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
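Since `disk_usage_based_eviction` is no longer an `Option`, disabling the task now goes through the `enabled` flag rather than omitting the section. A minimal sketch of overriding the defaults in code:

```rust
#[test]
fn disk_usage_eviction_disabled_by_flag() {
    // Illustrative only: keep the defaults shown above but turn the task off.
    let cfg = DiskUsageEvictionTaskConfig {
        enabled: false,
        ..Default::default()
    };
    assert!(!cfg.enabled);
    assert_eq!(cfg.min_avail_bytes, 2_000_000_000);
}
```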
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -543,6 +564,11 @@ pub struct TenantConfigToml {
|
||||
pub gc_period: Duration,
|
||||
// Delta layer churn threshold to create L1 image layers.
|
||||
pub image_creation_threshold: usize,
|
||||
// HADRON
|
||||
// When the timeout is reached, PageServer will (1) force compact any remaining L0 deltas and
|
||||
// (2) create image layers if there are any L1 deltas.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub image_layer_force_creation_period: Option<Duration>,
|
||||
// Determines how much history is retained, to allow
|
||||
// branching and read replicas at an older point in time.
|
||||
// The unit is time.
|
||||
@@ -738,9 +764,10 @@ impl Default for ConfigToml {
|
||||
|
||||
metric_collection_bucket: (None),
|
||||
|
||||
disk_usage_based_eviction: (None),
|
||||
disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(),
|
||||
|
||||
test_remote_failures: (0),
|
||||
test_remote_failures_probability: (100),
|
||||
|
||||
ondemand_download_behavior_treat_error_as_warn: (false),
|
||||
|
||||
@@ -804,6 +831,8 @@ impl Default for ConfigToml {
|
||||
},
|
||||
basebackup_cache_config: None,
|
||||
posthog_config: None,
|
||||
image_layer_generation_large_timeline_threshold: Some(2 * 1024 * 1024 * 1024),
|
||||
force_metric_collection_on_scrape: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -897,6 +926,7 @@ impl Default for TenantConfigToml {
|
||||
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
.expect("cannot parse default gc period"),
|
||||
image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
|
||||
image_layer_force_creation_period: None,
|
||||
pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
|
||||
.expect("cannot parse default PITR interval"),
|
||||
walreceiver_connect_timeout: humantime::parse_duration(
|
||||
|
||||
@@ -386,6 +386,7 @@ pub enum NodeSchedulingPolicy {
|
||||
Pause,
|
||||
PauseForRestart,
|
||||
Draining,
|
||||
Deleting,
|
||||
}
|
||||
|
||||
impl FromStr for NodeSchedulingPolicy {
|
||||
@@ -398,6 +399,7 @@ impl FromStr for NodeSchedulingPolicy {
|
||||
"pause" => Ok(Self::Pause),
|
||||
"pause_for_restart" => Ok(Self::PauseForRestart),
|
||||
"draining" => Ok(Self::Draining),
|
||||
"deleting" => Ok(Self::Deleting),
|
||||
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
|
||||
}
|
||||
}
|
||||
@@ -412,6 +414,7 @@ impl From<NodeSchedulingPolicy> for String {
|
||||
Pause => "pause",
|
||||
PauseForRestart => "pause_for_restart",
|
||||
Draining => "draining",
|
||||
Deleting => "deleting",
|
||||
}
|
||||
.to_string()
|
||||
}
|
||||
|
||||
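A quick round-trip sketch for the new `Deleting` variant, using the `FromStr` and `From<NodeSchedulingPolicy> for String` impls shown above:

```rust
use std::str::FromStr;

#[test]
fn deleting_policy_round_trip() {
    // "deleting" parses to the new variant and serializes back to the same string.
    let policy = NodeSchedulingPolicy::from_str("deleting").unwrap();
    assert!(matches!(policy, NodeSchedulingPolicy::Deleting));
    assert_eq!(String::from(NodeSchedulingPolicy::Deleting), "deleting");
}
```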
@@ -384,7 +384,7 @@ pub struct SafekeepersInfo {
|
||||
pub safekeepers: Vec<SafekeeperInfo>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SafekeeperInfo {
|
||||
pub id: NodeId,
|
||||
pub hostname: String,
|
||||
@@ -597,6 +597,9 @@ pub struct TenantConfigPatch {
|
||||
pub gc_period: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub image_creation_threshold: FieldPatch<usize>,
|
||||
// HADRON
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub image_layer_force_creation_period: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub pitr_interval: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
@@ -700,6 +703,11 @@ pub struct TenantConfig {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
|
||||
// HADRON
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub image_layer_force_creation_period: Option<Duration>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub pitr_interval: Option<Duration>,
|
||||
@@ -798,6 +806,7 @@ impl TenantConfig {
|
||||
mut gc_horizon,
|
||||
mut gc_period,
|
||||
mut image_creation_threshold,
|
||||
mut image_layer_force_creation_period,
|
||||
mut pitr_interval,
|
||||
mut walreceiver_connect_timeout,
|
||||
mut lagging_wal_timeout,
|
||||
@@ -861,6 +870,11 @@ impl TenantConfig {
|
||||
patch
|
||||
.image_creation_threshold
|
||||
.apply(&mut image_creation_threshold);
|
||||
// HADRON
|
||||
patch
|
||||
.image_layer_force_creation_period
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut image_layer_force_creation_period);
|
||||
patch
|
||||
.pitr_interval
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
@@ -942,6 +956,7 @@ impl TenantConfig {
|
||||
gc_horizon,
|
||||
gc_period,
|
||||
image_creation_threshold,
|
||||
image_layer_force_creation_period,
|
||||
pitr_interval,
|
||||
walreceiver_connect_timeout,
|
||||
lagging_wal_timeout,
|
||||
@@ -1016,6 +1031,9 @@ impl TenantConfig {
|
||||
image_creation_threshold: self
|
||||
.image_creation_threshold
|
||||
.unwrap_or(global_conf.image_creation_threshold),
|
||||
image_layer_force_creation_period: self
|
||||
.image_layer_force_creation_period
|
||||
.or(global_conf.image_layer_force_creation_period),
|
||||
pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval),
|
||||
walreceiver_connect_timeout: self
|
||||
.walreceiver_connect_timeout
|
||||
|
||||
@@ -332,7 +332,11 @@ fn hash_combine(mut a: u32, mut b: u32) -> u32 {
|
||||
///
|
||||
/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
|
||||
/// and will be handled at higher levels when shards are split.
|
||||
fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
|
||||
pub fn key_to_shard_number(
|
||||
count: ShardCount,
|
||||
stripe_size: ShardStripeSize,
|
||||
key: &Key,
|
||||
) -> ShardNumber {
|
||||
// Fast path for un-sharded tenants or broadcast keys
|
||||
if count < ShardCount(2) || key_is_shard0(key) {
|
||||
return ShardNumber(0);
|
||||
|
||||
12
libs/proxy/json/Cargo.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "json"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
ryu = "1"
|
||||
itoa = "1"
|
||||
|
||||
[dev-dependencies]
|
||||
futures = "0.3"
|
||||
412
libs/proxy/json/src/lib.rs
Normal file
@@ -0,0 +1,412 @@
|
||||
//! A JSON serialization lib, designed for more flexibility than `serde_json` offers.
|
||||
//!
|
||||
//! Features:
|
||||
//!
|
||||
//! ## Dynamic construction
|
||||
//!
|
||||
//! Sometimes you have dynamic values you want to serialize, that are not already in a serde-aware model like a struct or a Vec etc.
|
||||
//! To achieve this with serde, you need to implement a lot of different traits on a lot of different new-types.
|
||||
//! Because of this, it's often easier to give in and pull all the data into a serde-aware model (`serde_json::Value` or some intermediate struct),
|
||||
//! but that is often not very efficient.
|
||||
//!
|
||||
//! This crate allows full control over the JSON encoding without needing to implement any extra traits. Just call the
|
||||
//! relevant functions, and it will guarantee a correctly encoded JSON value.
|
||||
//!
|
||||
//! ## Async construction
|
||||
//!
|
||||
//! Similar to the above, sometimes the values arrive asynchronously. Often collecting those values in memory
|
||||
//! is more expensive than writing them as JSON, since the overheads of `Vec` and `String` are much higher; however,
|
||||
//! there are exceptions.
|
||||
//!
|
||||
//! Serializing to JSON all in one go is also more CPU intensive and can cause lag spikes,
|
||||
//! whereas serializing values incrementally spreads out the CPU load and reduces lag.
|
||||
//!
|
||||
//! ## Examples
|
||||
//!
|
||||
//! To represent the following JSON as a compact string
|
||||
//!
|
||||
//! ```json
|
||||
//! {
|
||||
//! "results": {
|
||||
//! "rows": [
|
||||
//! {
|
||||
//! "id": 1,
|
||||
//! "value": null
|
||||
//! },
|
||||
//! {
|
||||
//! "id": 2,
|
||||
//! "value": "hello"
|
||||
//! }
|
||||
//! ]
|
||||
//! }
|
||||
//! }
|
||||
//! ```
|
||||
//!
|
||||
//! We can use the following code:
|
||||
//!
|
||||
//! ```
|
||||
//! // create the outer object
|
||||
//! let s = json::value_to_string!(|v| json::value_as_object!(|v| {
|
||||
//! // create an entry with key "results" and start an object value associated with it.
|
||||
//! let results = v.key("results");
|
||||
//! json::value_as_object!(|results| {
|
||||
//! // create an entry with key "rows" and start an list value associated with it.
|
||||
//! let rows = results.key("rows");
|
||||
//! json::value_as_list!(|rows| {
|
||||
//! // create a list entry and start an object value associated with it.
|
||||
//! let row = rows.entry();
|
||||
//! json::value_as_object!(|row| {
|
||||
//! // add entry "id": 1
|
||||
//! row.entry("id", 1);
|
||||
//! // add entry "value": null
|
||||
//! row.entry("value", json::Null);
|
||||
//! });
|
||||
//!
|
||||
//! // create a list entry and start an object value associated with it.
|
||||
//! let row = rows.entry();
|
||||
//! json::value_as_object!(|row| {
|
||||
//! // add entry "id": 2
|
||||
//! row.entry("id", 2);
|
||||
//! // add entry "value": "hello"
|
||||
//! row.entry("value", "hello");
|
||||
//! });
|
||||
//! });
|
||||
//! });
|
||||
//! }));
|
||||
//!
|
||||
//! assert_eq!(s, r#"{"results":{"rows":[{"id":1,"value":null},{"id":2,"value":"hello"}]}}"#);
|
||||
//! ```
|
||||
|
||||
mod macros;
|
||||
mod str;
|
||||
mod value;
|
||||
|
||||
pub use value::{Null, ValueEncoder};
|
||||
|
||||
#[must_use]
|
||||
/// Serialize a single json value.
|
||||
pub struct ValueSer<'buf> {
|
||||
buf: &'buf mut Vec<u8>,
|
||||
start: usize,
|
||||
}
|
||||
|
||||
impl<'buf> ValueSer<'buf> {
|
||||
/// Create a new json value serializer.
|
||||
pub fn new(buf: &'buf mut Vec<u8>) -> Self {
|
||||
Self { buf, start: 0 }
|
||||
}
|
||||
|
||||
/// Borrow the underlying buffer
|
||||
pub fn as_buffer(&self) -> &[u8] {
|
||||
self.buf
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn value(self, e: impl ValueEncoder) {
|
||||
e.encode(self);
|
||||
}
|
||||
|
||||
/// Write raw bytes to the buf. This must be already JSON encoded.
|
||||
#[inline]
|
||||
pub fn write_raw_json(self, data: &[u8]) {
|
||||
self.buf.extend_from_slice(data);
|
||||
self.finish();
|
||||
}
|
||||
|
||||
/// Start a new object serializer.
|
||||
#[inline]
|
||||
pub fn object(self) -> ObjectSer<'buf> {
|
||||
ObjectSer::new(self)
|
||||
}
|
||||
|
||||
/// Start a new list serializer.
|
||||
#[inline]
|
||||
pub fn list(self) -> ListSer<'buf> {
|
||||
ListSer::new(self)
|
||||
}
|
||||
|
||||
/// Finish the value ser.
|
||||
#[inline]
|
||||
fn finish(self) {
|
||||
// don't trigger the drop handler which triggers a rollback.
|
||||
// this won't cause memory leaks because `ValueSer` owns no allocations.
|
||||
std::mem::forget(self);
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ValueSer<'_> {
|
||||
fn drop(&mut self) {
|
||||
self.buf.truncate(self.start);
|
||||
}
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
/// Serialize a json object.
|
||||
pub struct ObjectSer<'buf> {
|
||||
value: ValueSer<'buf>,
|
||||
start: usize,
|
||||
}
|
||||
|
||||
impl<'buf> ObjectSer<'buf> {
|
||||
/// Start a new object serializer.
|
||||
#[inline]
|
||||
pub fn new(value: ValueSer<'buf>) -> Self {
|
||||
value.buf.push(b'{');
|
||||
let start = value.buf.len();
|
||||
Self { value, start }
|
||||
}
|
||||
|
||||
/// Borrow the underlying buffer
|
||||
pub fn as_buffer(&self) -> &[u8] {
|
||||
self.value.as_buffer()
|
||||
}
|
||||
|
||||
/// Start a new object entry with the given string key, returning a [`ValueSer`] for the associated value.
|
||||
#[inline]
|
||||
pub fn key(&mut self, key: impl KeyEncoder) -> ValueSer<'_> {
|
||||
key.write_key(self)
|
||||
}
|
||||
|
||||
/// Write an entry (key-value pair) to the object.
|
||||
#[inline]
|
||||
pub fn entry(&mut self, key: impl KeyEncoder, val: impl ValueEncoder) {
|
||||
self.key(key).value(val);
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn entry_inner(&mut self, f: impl FnOnce(&mut Vec<u8>)) -> ValueSer<'_> {
|
||||
// track the position before the separator so that if the value is rolled back it also removes the separator.
|
||||
let start = self.value.buf.len();
|
||||
|
||||
// push separator if necessary
|
||||
if self.value.buf.len() > self.start {
|
||||
self.value.buf.push(b',');
|
||||
}
|
||||
// push key
|
||||
f(self.value.buf);
|
||||
// push value separator
|
||||
self.value.buf.push(b':');
|
||||
|
||||
// return value writer.
|
||||
ValueSer {
|
||||
buf: self.value.buf,
|
||||
start,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reset the buffer back to before this object was started.
|
||||
#[inline]
|
||||
pub fn rollback(self) -> ValueSer<'buf> {
|
||||
// Do not fully reset the value, only reset it to before the `{`.
|
||||
// This ensures any `,` before this value are not clobbered.
|
||||
self.value.buf.truncate(self.start - 1);
|
||||
self.value
|
||||
}
|
||||
|
||||
/// Finish the object ser.
|
||||
#[inline]
|
||||
pub fn finish(self) {
|
||||
self.value.buf.push(b'}');
|
||||
self.value.finish();
|
||||
}
|
||||
}
|
||||
|
||||
pub trait KeyEncoder {
|
||||
fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a>;
|
||||
}
|
||||
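`KeyEncoder` is the extension point for object keys. External types can participate by delegating to the existing `&str` implementation; a hypothetical newtype, for illustration only:

```rust
// Hypothetical newtype key, not part of the crate: it reuses the `&str`
// KeyEncoder impl (defined in value.rs) to write itself as an object key.
struct ColumnName(String);

impl KeyEncoder for &ColumnName {
    fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> {
        self.0.as_str().write_key(obj)
    }
}
```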
|
||||
#[must_use]
|
||||
/// Serialize a json list.
|
||||
pub struct ListSer<'buf> {
|
||||
value: ValueSer<'buf>,
|
||||
start: usize,
|
||||
}
|
||||
|
||||
impl<'buf> ListSer<'buf> {
|
||||
/// Start a new list serializer.
|
||||
#[inline]
|
||||
pub fn new(value: ValueSer<'buf>) -> Self {
|
||||
value.buf.push(b'[');
|
||||
let start = value.buf.len();
|
||||
Self { value, start }
|
||||
}
|
||||
|
||||
/// Borrow the underlying buffer
|
||||
pub fn as_buffer(&self) -> &[u8] {
|
||||
self.value.as_buffer()
|
||||
}
|
||||
|
||||
/// Write a value to the list.
|
||||
#[inline]
|
||||
pub fn push(&mut self, val: impl ValueEncoder) {
|
||||
self.entry().value(val);
|
||||
}
|
||||
|
||||
/// Start a new value entry in this list.
|
||||
#[inline]
|
||||
pub fn entry(&mut self) -> ValueSer<'_> {
|
||||
// track the position before the separator so that if the value is rolled back it also removes the separator.
|
||||
let start = self.value.buf.len();
|
||||
|
||||
// push separator if necessary
|
||||
if self.value.buf.len() > self.start {
|
||||
self.value.buf.push(b',');
|
||||
}
|
||||
|
||||
// return value writer.
|
||||
ValueSer {
|
||||
buf: self.value.buf,
|
||||
start,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reset the buffer back to before this list was started.
|
||||
#[inline]
|
||||
pub fn rollback(self) -> ValueSer<'buf> {
|
||||
// Do not fully reset the value, only reset it to before the `[`.
|
||||
// This ensures any `,` before this value are not clobbered.
|
||||
self.value.buf.truncate(self.start - 1);
|
||||
self.value
|
||||
}
|
||||
|
||||
/// Finish the list ser.
|
||||
#[inline]
|
||||
pub fn finish(self) {
|
||||
self.value.buf.push(b']');
|
||||
self.value.finish();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{Null, ValueSer};
|
||||
|
||||
#[test]
|
||||
fn object() {
|
||||
let mut buf = vec![];
|
||||
let mut object = ValueSer::new(&mut buf).object();
|
||||
object.entry("foo", "bar");
|
||||
object.entry("baz", Null);
|
||||
object.finish();
|
||||
|
||||
assert_eq!(buf, br#"{"foo":"bar","baz":null}"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn list() {
|
||||
let mut buf = vec![];
|
||||
let mut list = ValueSer::new(&mut buf).list();
|
||||
list.entry().value("bar");
|
||||
list.entry().value(Null);
|
||||
list.finish();
|
||||
|
||||
assert_eq!(buf, br#"["bar",null]"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn object_macro() {
|
||||
let res = crate::value_to_string!(|obj| {
|
||||
crate::value_as_object!(|obj| {
|
||||
obj.entry("foo", "bar");
|
||||
obj.entry("baz", Null);
|
||||
})
|
||||
});
|
||||
|
||||
assert_eq!(res, r#"{"foo":"bar","baz":null}"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn list_macro() {
|
||||
let res = crate::value_to_string!(|list| {
|
||||
crate::value_as_list!(|list| {
|
||||
list.entry().value("bar");
|
||||
list.entry().value(Null);
|
||||
})
|
||||
});
|
||||
|
||||
assert_eq!(res, r#"["bar",null]"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rollback_on_drop() {
|
||||
let res = crate::value_to_string!(|list| {
|
||||
crate::value_as_list!(|list| {
|
||||
list.entry().value("bar");
|
||||
|
||||
'cancel: {
|
||||
let nested_list = list.entry();
|
||||
crate::value_as_list!(|nested_list| {
|
||||
nested_list.entry().value(1);
|
||||
|
||||
assert_eq!(nested_list.as_buffer(), br#"["bar",[1"#);
|
||||
if true {
|
||||
break 'cancel;
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
assert_eq!(list.as_buffer(), br#"["bar""#);
|
||||
|
||||
list.entry().value(Null);
|
||||
})
|
||||
});
|
||||
|
||||
assert_eq!(res, r#"["bar",null]"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rollback_object() {
|
||||
let res = crate::value_to_string!(|obj| {
|
||||
crate::value_as_object!(|obj| {
|
||||
let entry = obj.key("1");
|
||||
entry.value(1_i32);
|
||||
|
||||
let entry = obj.key("2");
|
||||
let entry = {
|
||||
let mut nested_obj = entry.object();
|
||||
nested_obj.entry("foo", "bar");
|
||||
nested_obj.rollback()
|
||||
};
|
||||
|
||||
entry.value(2_i32);
|
||||
})
|
||||
});
|
||||
|
||||
assert_eq!(res, r#"{"1":1,"2":2}"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rollback_list() {
|
||||
let res = crate::value_to_string!(|list| {
|
||||
crate::value_as_list!(|list| {
|
||||
let entry = list.entry();
|
||||
entry.value(1_i32);
|
||||
|
||||
let entry = list.entry();
|
||||
let entry = {
|
||||
let mut nested_list = entry.list();
|
||||
nested_list.push("foo");
|
||||
nested_list.rollback()
|
||||
};
|
||||
|
||||
entry.value(2_i32);
|
||||
})
|
||||
});
|
||||
|
||||
assert_eq!(res, r#"[1,2]"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn string_escaping() {
|
||||
let mut buf = vec![];
|
||||
let mut object = ValueSer::new(&mut buf).object();
|
||||
|
||||
let key = "hello";
|
||||
let value = "\n world";
|
||||
|
||||
object.entry(format_args!("{key:?}"), value);
|
||||
object.finish();
|
||||
|
||||
assert_eq!(buf, br#"{"\"hello\"":"\n world"}"#);
|
||||
}
|
||||
}
|
||||
86
libs/proxy/json/src/macros.rs
Normal file
@@ -0,0 +1,86 @@
|
||||
//! # Examples
|
||||
//!
|
||||
//! ```
|
||||
//! use futures::{StreamExt, TryStream, TryStreamExt};
|
||||
//!
|
||||
//! async fn stream_to_json_list<S, T, E>(mut s: S) -> Result<String, E>
|
||||
//! where
|
||||
//! S: TryStream<Ok = T, Error = E> + Unpin,
|
||||
//! T: json::ValueEncoder
|
||||
//! {
|
||||
//! Ok(json::value_to_string!(|val| json::value_as_list!(|val| {
|
||||
//! // note how we can use `.await` and `?` in here.
|
||||
//! while let Some(value) = s.try_next().await? {
|
||||
//! val.push(value);
|
||||
//! }
|
||||
//! })))
|
||||
//! }
|
||||
//!
|
||||
//! let stream = futures::stream::iter([1, 2, 3]).map(Ok::<i32, ()>);
|
||||
//! let json_string = futures::executor::block_on(stream_to_json_list(stream)).unwrap();
|
||||
//! assert_eq!(json_string, "[1,2,3]");
|
||||
//! ```
|
||||
|
||||
/// A helper to create a new JSON vec.
|
||||
///
|
||||
/// Implemented as a macro to preserve all control flow.
|
||||
#[macro_export]
|
||||
macro_rules! value_to_vec {
|
||||
(|$val:ident| $body:expr) => {{
|
||||
let mut buf = vec![];
|
||||
let $val = $crate::ValueSer::new(&mut buf);
|
||||
let _: () = $body;
|
||||
buf
|
||||
}};
|
||||
}
|
||||
|
||||
/// A helper to create a new JSON string.
|
||||
///
|
||||
/// Implemented as a macro to preserve all control flow.
|
||||
#[macro_export]
|
||||
macro_rules! value_to_string {
|
||||
(|$val:ident| $body:expr) => {{
|
||||
::std::string::String::from_utf8($crate::value_to_vec!(|$val| $body))
|
||||
.expect("json should be valid utf8")
|
||||
}};
|
||||
}
|
||||
|
||||
/// A helper that ensures the [`ObjectSer::finish`](crate::ObjectSer::finish) method is called on completion.
|
||||
///
|
||||
/// Consumes `$val` and assigns it as an [`ObjectSer`](crate::ObjectSer) serializer.
|
||||
/// The serializer is only 'finished' if the body completes.
|
||||
/// The serializer is rolled back if `break`/`return` escapes the body.
|
||||
///
|
||||
/// Implemented as a macro to preserve all control flow.
|
||||
#[macro_export]
|
||||
macro_rules! value_as_object {
|
||||
(|$val:ident| $body:expr) => {{
|
||||
let mut obj = $crate::ObjectSer::new($val);
|
||||
|
||||
let $val = &mut obj;
|
||||
let res = $body;
|
||||
|
||||
obj.finish();
|
||||
res
|
||||
}};
|
||||
}
|
||||
|
||||
/// A helper that ensures the [`ListSer::finish`](crate::ListSer::finish) method is called on completion.
|
||||
///
|
||||
/// Consumes `$val` and assigns it as an [`ListSer`](crate::ListSer) serializer.
|
||||
/// The serializer is only 'finished' if the body completes.
|
||||
/// The serializer is rolled back if `break`/`return` escapes the body.
|
||||
///
|
||||
/// Implemented as a macro to preserve all control flow.
|
||||
#[macro_export]
|
||||
macro_rules! value_as_list {
|
||||
(|$val:ident| $body:expr) => {{
|
||||
let mut list = $crate::ListSer::new($val);
|
||||
|
||||
let $val = &mut list;
|
||||
let res = $body;
|
||||
|
||||
list.finish();
|
||||
res
|
||||
}};
|
||||
}
|
||||
166
libs/proxy/json/src/str.rs
Normal file
@@ -0,0 +1,166 @@
|
||||
//! Helpers for serializing escaped strings.
|
||||
//!
|
||||
//! ## License
|
||||
//!
|
||||
//! <https://github.com/serde-rs/json/blob/c1826ebcccb1a520389c6b78ad3da15db279220d/src/ser.rs#L1514-L1552>
|
||||
//! <https://github.com/serde-rs/json/blob/c1826ebcccb1a520389c6b78ad3da15db279220d/src/ser.rs#L2081-L2157>
|
||||
//! Licensed by David Tolnay under MIT or Apache-2.0.
|
||||
//!
|
||||
//! With modifications by Conrad Ludgate on behalf of Databricks.
|
||||
|
||||
use std::fmt::{self, Write};
|
||||
|
||||
/// Represents a character escape code in a type-safe manner.
|
||||
pub enum CharEscape {
|
||||
/// An escaped quote `"`
|
||||
Quote,
|
||||
/// An escaped reverse solidus `\`
|
||||
ReverseSolidus,
|
||||
// /// An escaped solidus `/`
|
||||
// Solidus,
|
||||
/// An escaped backspace character (usually escaped as `\b`)
|
||||
Backspace,
|
||||
/// An escaped form feed character (usually escaped as `\f`)
|
||||
FormFeed,
|
||||
/// An escaped line feed character (usually escaped as `\n`)
|
||||
LineFeed,
|
||||
/// An escaped carriage return character (usually escaped as `\r`)
|
||||
CarriageReturn,
|
||||
/// An escaped tab character (usually escaped as `\t`)
|
||||
Tab,
|
||||
/// An escaped ASCII plane control character (usually escaped as
|
||||
/// `\u00XX` where `XX` are two hex characters)
|
||||
AsciiControl(u8),
|
||||
}
|
||||
|
||||
impl CharEscape {
|
||||
#[inline]
|
||||
fn from_escape_table(escape: u8, byte: u8) -> CharEscape {
|
||||
match escape {
|
||||
self::BB => CharEscape::Backspace,
|
||||
self::TT => CharEscape::Tab,
|
||||
self::NN => CharEscape::LineFeed,
|
||||
self::FF => CharEscape::FormFeed,
|
||||
self::RR => CharEscape::CarriageReturn,
|
||||
self::QU => CharEscape::Quote,
|
||||
self::BS => CharEscape::ReverseSolidus,
|
||||
self::UU => CharEscape::AsciiControl(byte),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn format_escaped_str(writer: &mut Vec<u8>, value: &str) {
|
||||
writer.reserve(2 + value.len());
|
||||
|
||||
writer.push(b'"');
|
||||
|
||||
let rest = format_escaped_str_contents(writer, value);
|
||||
writer.extend_from_slice(rest);
|
||||
|
||||
writer.push(b'"');
|
||||
}
|
||||
|
||||
pub(crate) fn format_escaped_fmt(writer: &mut Vec<u8>, args: fmt::Arguments) {
|
||||
writer.push(b'"');
|
||||
|
||||
Collect { buf: writer }
|
||||
.write_fmt(args)
|
||||
.expect("formatting should not error");
|
||||
|
||||
writer.push(b'"');
|
||||
}
|
||||
|
||||
struct Collect<'buf> {
|
||||
buf: &'buf mut Vec<u8>,
|
||||
}
|
||||
|
||||
impl fmt::Write for Collect<'_> {
|
||||
fn write_str(&mut self, s: &str) -> fmt::Result {
|
||||
let last = format_escaped_str_contents(self.buf, s);
|
||||
self.buf.extend(last);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// writes any escape sequences, and returns the suffix still needed to be written.
|
||||
fn format_escaped_str_contents<'a>(writer: &mut Vec<u8>, value: &'a str) -> &'a [u8] {
|
||||
let bytes = value.as_bytes();
|
||||
|
||||
let mut start = 0;
|
||||
|
||||
for (i, &byte) in bytes.iter().enumerate() {
|
||||
let escape = ESCAPE[byte as usize];
|
||||
if escape == 0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
writer.extend_from_slice(&bytes[start..i]);
|
||||
|
||||
let char_escape = CharEscape::from_escape_table(escape, byte);
|
||||
write_char_escape(writer, char_escape);
|
||||
|
||||
start = i + 1;
|
||||
}
|
||||
|
||||
&bytes[start..]
|
||||
}
|
||||
|
||||
const BB: u8 = b'b'; // \x08
|
||||
const TT: u8 = b't'; // \x09
|
||||
const NN: u8 = b'n'; // \x0A
|
||||
const FF: u8 = b'f'; // \x0C
|
||||
const RR: u8 = b'r'; // \x0D
|
||||
const QU: u8 = b'"'; // \x22
|
||||
const BS: u8 = b'\\'; // \x5C
|
||||
const UU: u8 = b'u'; // \x00...\x1F except the ones above
|
||||
const __: u8 = 0;
|
||||
|
||||
// Lookup table of escape sequences. A value of b'x' at index i means that byte
|
||||
// i is escaped as "\x" in JSON. A value of 0 means that byte i is not escaped.
|
||||
static ESCAPE: [u8; 256] = [
|
||||
// 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0
|
||||
UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1
|
||||
__, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
|
||||
];
|
||||
|
||||
fn write_char_escape(writer: &mut Vec<u8>, char_escape: CharEscape) {
|
||||
let s = match char_escape {
|
||||
CharEscape::Quote => b"\\\"",
|
||||
CharEscape::ReverseSolidus => b"\\\\",
|
||||
// CharEscape::Solidus => b"\\/",
|
||||
CharEscape::Backspace => b"\\b",
|
||||
CharEscape::FormFeed => b"\\f",
|
||||
CharEscape::LineFeed => b"\\n",
|
||||
CharEscape::CarriageReturn => b"\\r",
|
||||
CharEscape::Tab => b"\\t",
|
||||
CharEscape::AsciiControl(byte) => {
|
||||
static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef";
|
||||
let bytes = &[
|
||||
b'\\',
|
||||
b'u',
|
||||
b'0',
|
||||
b'0',
|
||||
HEX_DIGITS[(byte >> 4) as usize],
|
||||
HEX_DIGITS[(byte & 0xF) as usize],
|
||||
];
|
||||
return writer.extend_from_slice(bytes);
|
||||
}
|
||||
};
|
||||
|
||||
writer.extend_from_slice(s);
|
||||
}
|
||||
168
libs/proxy/json/src/value.rs
Normal file
@@ -0,0 +1,168 @@
|
||||
use core::fmt;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
|
||||
use crate::str::{format_escaped_fmt, format_escaped_str};
|
||||
use crate::{KeyEncoder, ObjectSer, ValueSer, value_as_list, value_as_object};
|
||||
|
||||
/// Write a value to the underlying json representation.
|
||||
pub trait ValueEncoder {
|
||||
fn encode(self, v: ValueSer<'_>);
|
||||
}
|
||||
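Any type can opt into serialization by implementing `ValueEncoder`, usually by delegating to one of the built-in encoders below. A hypothetical wrapper type, for illustration only:

```rust
// Hypothetical unit wrapper, not part of the crate: encodes as a plain JSON number.
struct Celsius(f64);

impl ValueEncoder for Celsius {
    #[inline]
    fn encode(self, v: ValueSer<'_>) {
        // delegate to the f64 encoder generated by the float! macro below
        self.0.encode(v);
    }
}
```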
|
||||
pub(crate) fn write_int(x: impl itoa::Integer, b: &mut Vec<u8>) {
|
||||
b.extend_from_slice(itoa::Buffer::new().format(x).as_bytes());
|
||||
}
|
||||
|
||||
pub(crate) fn write_float(x: impl ryu::Float, b: &mut Vec<u8>) {
|
||||
b.extend_from_slice(ryu::Buffer::new().format(x).as_bytes());
|
||||
}
|
||||
|
||||
impl<T: Copy + ValueEncoder> ValueEncoder for &T {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
T::encode(*self, v);
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueEncoder for &str {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
format_escaped_str(v.buf, self);
|
||||
v.finish();
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueEncoder for fmt::Arguments<'_> {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
if let Some(s) = self.as_str() {
|
||||
format_escaped_str(v.buf, s);
|
||||
} else {
|
||||
format_escaped_fmt(v.buf, self);
|
||||
}
|
||||
v.finish();
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! int {
|
||||
[$($t:ty),*] => {
|
||||
$(
|
||||
impl ValueEncoder for $t {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
write_int(self, v.buf);
|
||||
v.finish();
|
||||
}
|
||||
}
|
||||
)*
|
||||
};
|
||||
}
|
||||
|
||||
int![u8, u16, u32, u64, usize, u128];
|
||||
int![i8, i16, i32, i64, isize, i128];
|
||||
|
||||
macro_rules! float {
|
||||
[$($t:ty),*] => {
|
||||
$(
|
||||
impl ValueEncoder for $t {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
write_float(self, v.buf);
|
||||
v.finish();
|
||||
}
|
||||
}
|
||||
)*
|
||||
};
|
||||
}
|
||||
|
||||
float![f32, f64];
|
||||
|
||||
impl ValueEncoder for bool {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
v.write_raw_json(if self { b"true" } else { b"false" });
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: ValueEncoder> ValueEncoder for Option<T> {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
match self {
|
||||
Some(value) => value.encode(v),
|
||||
None => Null.encode(v),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl KeyEncoder for &str {
|
||||
#[inline]
|
||||
fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> {
|
||||
let obj = &mut *obj;
|
||||
obj.entry_inner(|b| format_escaped_str(b, self))
|
||||
}
|
||||
}
|
||||
|
||||
impl KeyEncoder for fmt::Arguments<'_> {
|
||||
#[inline]
|
||||
fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> {
|
||||
if let Some(key) = self.as_str() {
|
||||
obj.entry_inner(|b| format_escaped_str(b, key))
|
||||
} else {
|
||||
obj.entry_inner(|b| format_escaped_fmt(b, self))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents the JSON null value.
|
||||
pub struct Null;
|
||||
|
||||
impl ValueEncoder for Null {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
v.write_raw_json(b"null");
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: ValueEncoder> ValueEncoder for Vec<T> {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
value_as_list!(|v| {
|
||||
for t in self {
|
||||
v.entry().value(t);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Copy + ValueEncoder> ValueEncoder for &[T] {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
value_as_list!(|v| {
|
||||
for t in self {
|
||||
v.entry().value(t);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: KeyEncoder, V: ValueEncoder, S> ValueEncoder for HashMap<K, V, S> {
|
||||
#[inline]
|
||||
fn encode(self, o: ValueSer<'_>) {
|
||||
value_as_object!(|o| {
|
||||
for (k, v) in self {
|
||||
o.entry(k, v);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: KeyEncoder, V: ValueEncoder> ValueEncoder for BTreeMap<K, V> {
|
||||
#[inline]
|
||||
fn encode(self, o: ValueSer<'_>) {
|
||||
value_as_object!(|o| {
|
||||
for (k, v) in self {
|
||||
o.entry(k, v);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -13,6 +13,7 @@ aws-smithy-async.workspace = true
|
||||
aws-smithy-types.workspace = true
|
||||
aws-config.workspace = true
|
||||
aws-sdk-s3.workspace = true
|
||||
base64.workspace = true
|
||||
bytes.workspace = true
|
||||
camino = { workspace = true, features = ["serde1"] }
|
||||
humantime-serde.workspace = true
|
||||
@@ -41,6 +42,9 @@ http-body-util.workspace = true
|
||||
itertools.workspace = true
|
||||
sync_wrapper = { workspace = true, features = ["futures"] }
|
||||
|
||||
byteorder = "1.4"
|
||||
rand = "0.8.5"
|
||||
|
||||
[dev-dependencies]
|
||||
camino-tempfile.workspace = true
|
||||
test-context.workspace = true
|
||||
|
||||
@@ -14,17 +14,25 @@ use anyhow::{Context, Result, anyhow};
|
||||
use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
|
||||
use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions};
|
||||
use azure_storage::StorageCredentials;
|
||||
use azure_storage_blobs::blob::operations::GetBlobBuilder;
|
||||
use azure_storage_blobs::blob::BlobBlockType;
|
||||
use azure_storage_blobs::blob::BlockList;
|
||||
use azure_storage_blobs::blob::{Blob, CopyStatus};
|
||||
use azure_storage_blobs::container::operations::ListBlobsBuilder;
|
||||
use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient};
|
||||
use azure_storage_blobs::prelude::ClientBuilder;
|
||||
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
|
||||
use base64::{Engine as _, engine::general_purpose::URL_SAFE};
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::FutureExt;
|
||||
use futures::future::Either;
|
||||
use futures::stream::Stream;
|
||||
use futures_util::{StreamExt, TryStreamExt};
|
||||
use http_types::{StatusCode, Url};
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio::fs::File;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio::io::AsyncSeekExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
use utils::backoff;
|
||||
@@ -51,6 +59,9 @@ pub struct AzureBlobStorage {
|
||||
|
||||
// Alternative timeout used for metadata objects which are expected to be small
|
||||
pub small_timeout: Duration,
|
||||
/* BEGIN_HADRON */
|
||||
pub put_block_size_mb: Option<usize>,
|
||||
/* END_HADRON */
|
||||
}
|
||||
|
||||
impl AzureBlobStorage {
|
||||
@@ -107,6 +118,9 @@ impl AzureBlobStorage {
|
||||
concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
|
||||
timeout,
|
||||
small_timeout,
|
||||
/* BEGIN_HADRON */
|
||||
put_block_size_mb: azure_config.put_block_size_mb,
|
||||
/* END_HADRON */
|
||||
})
|
||||
}
|
||||
|
||||
@@ -583,31 +597,137 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let op = async {
|
||||
let mut metadata_map = metadata.unwrap_or([].into());
|
||||
let timeline_file_path = metadata_map.0.remove("databricks_azure_put_block");
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
let op = async move {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
let put_block_size = self.put_block_size_mb.unwrap_or(0) * 1024 * 1024;
|
||||
if timeline_file_path.is_none() || put_block_size == 0 {
|
||||
// Use put_block_blob directly.
|
||||
let from: Pin<
|
||||
Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>,
|
||||
> = Box::pin(from);
|
||||
let from = NonSeekableStream::new(from, data_size_bytes);
|
||||
let body = azure_core::Body::SeekableStream(Box::new(from));
|
||||
|
||||
let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
|
||||
Box::pin(from);
|
||||
let mut builder = blob_client.put_block_blob(body);
|
||||
if !metadata_map.0.is_empty() {
|
||||
builder = builder.metadata(to_azure_metadata(metadata_map));
|
||||
}
|
||||
let fut = builder.into_future();
|
||||
let fut = tokio::time::timeout(self.timeout, fut);
|
||||
let result = fut.await;
|
||||
match result {
|
||||
Ok(Ok(_response)) => return Ok(()),
|
||||
Ok(Err(azure)) => return Err(azure.into()),
|
||||
Err(_timeout) => return Err(TimeoutOrCancel::Timeout.into()),
|
||||
};
|
||||
}
|
||||
// Upload chunks concurrently using Put Block.
|
||||
// Each PutBlock uploads put_block_size bytes of the file.
|
||||
let mut upload_futures: Vec<tokio::task::JoinHandle<Result<(), azure_core::Error>>> =
|
||||
vec![];
|
||||
let mut block_list = BlockList::default();
|
||||
let mut start_bytes = 0u64;
|
||||
let mut remaining_bytes = data_size_bytes;
|
||||
let mut block_list_count = 0;
|
||||
|
||||
let from = NonSeekableStream::new(from, data_size_bytes);
|
||||
while remaining_bytes > 0 {
|
||||
let block_size = std::cmp::min(remaining_bytes, put_block_size);
|
||||
let end_bytes = start_bytes + block_size as u64;
|
||||
let block_id = block_list_count;
|
||||
let timeout = self.timeout;
|
||||
let blob_client = blob_client.clone();
|
||||
let timeline_file = timeline_file_path.clone().unwrap().clone();
|
||||
|
||||
let body = azure_core::Body::SeekableStream(Box::new(from));
|
||||
let mut encoded_block_id = [0u8; 8];
|
||||
BigEndian::write_u64(&mut encoded_block_id, block_id);
|
||||
URL_SAFE.encode(encoded_block_id);
|
||||
|
||||
let mut builder = blob_client.put_block_blob(body);
|
||||
// Put one block.
|
||||
let part_fut = async move {
|
||||
let mut file = File::open(Utf8Path::new(&timeline_file.clone())).await?;
|
||||
file.seek(io::SeekFrom::Start(start_bytes)).await?;
|
||||
let limited_reader = file.take(block_size as u64);
|
||||
let file_chunk_stream =
|
||||
tokio_util::io::ReaderStream::with_capacity(limited_reader, 1024 * 1024);
|
||||
let file_chunk_stream_pin: Pin<
|
||||
Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>,
|
||||
> = Box::pin(file_chunk_stream);
|
||||
let stream_wrapper = NonSeekableStream::new(file_chunk_stream_pin, block_size);
|
||||
let body = azure_core::Body::SeekableStream(Box::new(stream_wrapper));
|
||||
// Azure Put Block takes URL-safe base64-encoded block ids, and all block ids for a blob must have the same encoded length.
|
||||
// https://learn.microsoft.com/en-us/rest/api/storageservices/put-block?tabs=microsoft-entra-id#uri-parameters
|
||||
let builder = blob_client.put_block(encoded_block_id.to_vec(), body);
|
||||
let fut = builder.into_future();
|
||||
let fut = tokio::time::timeout(timeout, fut);
|
||||
let result = fut.await;
|
||||
tracing::debug!(
|
||||
"azure put block id-{} size {} start {} end {} file {} response {:#?}",
|
||||
block_id,
|
||||
block_size,
|
||||
start_bytes,
|
||||
end_bytes,
|
||||
timeline_file,
|
||||
result
|
||||
);
|
||||
match result {
|
||||
Ok(Ok(_response)) => Ok(()),
|
||||
Ok(Err(azure)) => Err(azure),
|
||||
Err(_timeout) => Err(azure_core::Error::new(
|
||||
azure_core::error::ErrorKind::Io,
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::TimedOut,
|
||||
"Operation timed out",
|
||||
),
|
||||
)),
|
||||
}
|
||||
};
|
||||
upload_futures.push(tokio::spawn(part_fut));
|
||||
|
||||
if let Some(metadata) = metadata {
|
||||
builder = builder.metadata(to_azure_metadata(metadata));
|
||||
block_list_count += 1;
|
||||
remaining_bytes -= block_size;
|
||||
start_bytes += block_size as u64;
|
||||
|
||||
block_list
|
||||
.blocks
|
||||
.push(BlobBlockType::Uncommitted(encoded_block_id.to_vec().into()));
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
"azure put blocks {} total MB: {} chunk size MB: {}",
|
||||
block_list_count,
|
||||
data_size_bytes / 1024 / 1024,
|
||||
put_block_size / 1024 / 1024
|
||||
);
|
||||
// Wait for all blocks to be uploaded.
|
||||
let upload_results = futures::future::try_join_all(upload_futures).await;
|
||||
if upload_results.is_err() {
|
||||
return Err(anyhow::anyhow!(format!(
|
||||
"Failed to upload all blocks {:#?}",
|
||||
upload_results.unwrap_err()
|
||||
)));
|
||||
}
|
||||
|
||||
// Commit the blocks.
|
||||
let mut builder = blob_client.put_block_list(block_list);
|
||||
if !metadata_map.0.is_empty() {
|
||||
builder = builder.metadata(to_azure_metadata(metadata_map));
|
||||
}
|
||||
let fut = builder.into_future();
|
||||
let fut = tokio::time::timeout(self.timeout, fut);
|
||||
let result = fut.await;
|
||||
tracing::debug!("azure put block list response {:#?}", result);
|
||||
|
||||
match fut.await {
|
||||
match result {
|
||||
Ok(Ok(_response)) => Ok(()),
|
||||
Ok(Err(azure)) => Err(azure.into()),
|
||||
Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
|
||||
}
|
||||
};
|
||||
/* END_HADRON */
|
||||
|
||||
let res = tokio::select! {
|
||||
res = op => res,
|
||||
@@ -622,7 +742,6 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, outcome, started_at);
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
|
||||
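The Put Block loop above splits the payload into `put_block_size`-byte chunks, with the final block carrying the remainder. A small sketch of just that arithmetic (the helper name is illustrative):

```rust
/// Mirror of the chunking done by the upload loop above: every block is
/// `put_block_size` bytes except possibly the last one.
fn block_sizes(data_size_bytes: usize, put_block_size: usize) -> Vec<usize> {
    let mut sizes = Vec::new();
    let mut remaining = data_size_bytes;
    while remaining > 0 {
        let block = remaining.min(put_block_size);
        sizes.push(block);
        remaining -= block;
    }
    sizes
}

#[test]
fn ten_mib_in_four_mib_blocks() {
    const MIB: usize = 1024 * 1024;
    assert_eq!(block_sizes(10 * MIB, 4 * MIB), vec![4 * MIB, 4 * MIB, 2 * MIB]);
}
```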
@@ -195,8 +195,19 @@ pub struct AzureConfig {
|
||||
pub max_keys_per_list_response: Option<i32>,
|
||||
#[serde(default = "default_azure_conn_pool_size")]
|
||||
pub conn_pool_size: usize,
|
||||
/* BEGIN_HADRON */
|
||||
#[serde(default = "default_azure_put_block_size_mb")]
|
||||
pub put_block_size_mb: Option<usize>,
|
||||
/* END_HADRON */
|
||||
}
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
fn default_azure_put_block_size_mb() -> Option<usize> {
|
||||
// Disable parallel upload by default.
|
||||
Some(0)
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize {
|
||||
NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap()
|
||||
}
|
||||
@@ -213,6 +224,9 @@ impl Debug for AzureConfig {
|
||||
"max_keys_per_list_response",
|
||||
&self.max_keys_per_list_response,
|
||||
)
|
||||
/* BEGIN_HADRON */
|
||||
.field("put_block_size_mb", &self.put_block_size_mb)
|
||||
/* END_HADRON */
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@@ -352,6 +366,7 @@ timeout = '5s'";
|
||||
upload_storage_class = 'INTELLIGENT_TIERING'
|
||||
timeout = '7s'
|
||||
conn_pool_size = 8
|
||||
put_block_size_mb = 1024
|
||||
";
|
||||
|
||||
let config = parse(toml).unwrap();
|
||||
@@ -367,6 +382,9 @@ timeout = '5s'";
|
||||
concurrency_limit: default_remote_storage_azure_concurrency_limit(),
|
||||
max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
|
||||
conn_pool_size: 8,
|
||||
/* BEGIN_HADRON */
|
||||
put_block_size_mb: Some(1024),
|
||||
/* END_HADRON */
|
||||
}),
|
||||
timeout: Duration::from_secs(7),
|
||||
small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT
|
||||
|
||||
@@ -732,9 +732,15 @@ impl GenericRemoteStorage {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
|
||||
Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
|
||||
/* BEGIN_HADRON */
|
||||
pub fn unreliable_wrapper(s: Self, fail_first: u64, fail_probability: u64) -> Self {
|
||||
Self::Unreliable(Arc::new(UnreliableWrapper::new(
|
||||
s,
|
||||
fail_first,
|
||||
fail_probability,
|
||||
)))
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
/// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
|
||||
pub async fn upload_storage_object(
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
//! This module provides a wrapper around a real RemoteStorage implementation that
|
||||
//! causes the first N attempts at each upload or download operation to fail. For
|
||||
//! testing purposes.
|
||||
use rand::Rng;
|
||||
use std::cmp;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::num::NonZeroU32;
|
||||
@@ -25,6 +27,12 @@ pub struct UnreliableWrapper {
|
||||
|
||||
// Tracks how many failed attempts of each operation has been made.
|
||||
attempts: Mutex<HashMap<RemoteOp, u64>>,
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
// This is the probability of failure for each operation, in the range [0, 100].
|
||||
// The probability defaults to 100, which means that every such operation will fail.
|
||||
attempt_failure_probability: u64,
|
||||
/* END_HADRON */
|
||||
}
|
||||
|
||||
/// Used to identify retries of different unique operation.
|
||||
@@ -40,7 +48,11 @@ enum RemoteOp {
|
||||
}
|
||||
|
||||
impl UnreliableWrapper {
|
||||
pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
|
||||
pub fn new(
|
||||
inner: crate::GenericRemoteStorage,
|
||||
attempts_to_fail: u64,
|
||||
attempt_failure_probability: u64,
|
||||
) -> Self {
|
||||
assert!(attempts_to_fail > 0);
|
||||
let inner = match inner {
|
||||
GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
|
||||
@@ -51,9 +63,11 @@ impl UnreliableWrapper {
|
||||
panic!("Can't wrap unreliable wrapper unreliably")
|
||||
}
|
||||
};
|
||||
let actual_attempt_failure_probability = cmp::min(attempt_failure_probability, 100);
|
||||
UnreliableWrapper {
|
||||
inner,
|
||||
attempts_to_fail,
|
||||
attempt_failure_probability: actual_attempt_failure_probability,
|
||||
attempts: Mutex::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
@@ -66,6 +80,7 @@ impl UnreliableWrapper {
|
||||
///
|
||||
fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
|
||||
let mut attempts = self.attempts.lock().unwrap();
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
match attempts.entry(op) {
|
||||
Entry::Occupied(mut e) => {
|
||||
@@ -75,15 +90,19 @@ impl UnreliableWrapper {
|
||||
*p
|
||||
};
|
||||
|
||||
if attempts_before_this >= self.attempts_to_fail {
|
||||
// let it succeed
|
||||
e.remove();
|
||||
Ok(attempts_before_this)
|
||||
} else {
|
||||
/* BEGIN_HADRON */
|
||||
// If there are attempts left to fail, fail the request with probability `attempt_failure_probability`.
|
||||
if (attempts_before_this < self.attempts_to_fail)
|
||||
&& (rng.gen_range(0..100) < self.attempt_failure_probability)
|
||||
{
|
||||
let error =
|
||||
anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
|
||||
Err(error)
|
||||
} else {
|
||||
e.remove();
|
||||
Ok(attempts_before_this)
|
||||
}
|
||||
/* END_HADRON */
|
||||
}
|
||||
Entry::Vacant(e) => {
|
||||
let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
|
||||
|
||||
@@ -165,10 +165,42 @@ pub(crate) async fn upload_remote_data(

            let (data, data_len) =
                upload_stream(format!("remote blob data {i}").into_bytes().into());

            /* BEGIN_HADRON */
            let mut metadata = None;
            if matches!(&*task_client, GenericRemoteStorage::AzureBlob(_)) {
                let file_path = "/tmp/dbx_upload_tmp_file.txt";
                {
                    // Open the file in append mode
                    let mut file = std::fs::OpenOptions::new()
                        .append(true)
                        .create(true) // Create the file if it doesn't exist
                        .open(file_path)?;
                    // Append some bytes to the file
                    std::io::Write::write_all(
                        &mut file,
                        &format!("remote blob data {i}").into_bytes(),
                    )?;
                    file.sync_all()?;
                }
                metadata = Some(remote_storage::StorageMetadata::from([(
                    "databricks_azure_put_block",
                    file_path,
                )]));
            }
            /* END_HADRON */

            task_client
                .upload(data, data_len, &blob_path, None, &cancel)
                .upload(data, data_len, &blob_path, metadata, &cancel)
                .await?;

            // TODO: Check upload is using the put_block upload.
            // We cannot consume data here since data is moved inside the upload.
            // let total_bytes = data.fold(0, |acc, chunk| async move {
            //     acc + chunk.map(|bytes| bytes.len()).unwrap_or(0)
            // }).await;
            // assert_eq!(total_bytes, data_len);

            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
    }
@@ -219,6 +219,9 @@ async fn create_azure_client(
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response,
            conn_pool_size: 8,
            /* BEGIN_HADRON */
            put_block_size_mb: Some(1),
            /* END_HADRON */
        }),
        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
        small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
@@ -221,7 +221,7 @@ pub struct TimelineMembershipSwitchRequest {
pub struct TimelineMembershipSwitchResponse {
    pub previous_conf: Configuration,
    pub current_conf: Configuration,
    pub term: Term,
    pub last_log_term: Term,
    pub flush_lsn: Lsn,
}

@@ -44,3 +44,62 @@ where
        }
    }
}

/* BEGIN_HADRON */
pub enum DeploymentMode {
    Dev,
    Staging,
    Prod,
}

pub fn get_deployment_mode() -> Option<DeploymentMode> {
    match std::env::var("DEPLOYMENT_MODE") {
        Ok(env) => match env.as_str() {
            "development" => Some(DeploymentMode::Dev),
            "staging" => Some(DeploymentMode::Staging),
            "production" => Some(DeploymentMode::Prod),
            _ => {
                tracing::error!("Unexpected DEPLOYMENT_MODE: {}", env);
                None
            }
        },
        Err(_) => {
            tracing::error!("DEPLOYMENT_MODE not set");
            None
        }
    }
}

pub fn is_dev_or_staging() -> bool {
    matches!(
        get_deployment_mode(),
        Some(DeploymentMode::Dev) | Some(DeploymentMode::Staging)
    )
}

pub enum TestingMode {
    Chaos,
    Stress,
}

pub fn get_test_mode() -> Option<TestingMode> {
    match std::env::var("HADRON_TEST_MODE") {
        Ok(env) => match env.as_str() {
            "chaos" => Some(TestingMode::Chaos),
            "stress" => Some(TestingMode::Stress),
            _ => {
                tracing::error!("Unexpected HADRON_TEST_MODE: {}", env);
                None
            }
        },
        Err(_) => {
            tracing::error!("HADRON_TEST_MODE not set");
            None
        }
    }
}

pub fn is_chaos_testing() -> bool {
    matches!(get_test_mode(), Some(TestingMode::Chaos))
}
/* END_HADRON */
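For illustration, a minimal sketch (written as if in the same module as the helpers above) of how a caller might gate chaos-only behavior on these environment flags; the function and the delay value are hypothetical, not part of this diff:

    // Sketch: only allow an injected delay outside production, and only when
    // HADRON_TEST_MODE=chaos is explicitly set.
    pub fn chaos_delay() -> Option<std::time::Duration> {
        if is_dev_or_staging() && is_chaos_testing() {
            return Some(std::time::Duration::from_millis(100)); // illustrative value
        }
        None
    }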
@@ -99,6 +99,8 @@ pub mod elapsed_accum;
#[cfg(target_os = "linux")]
pub mod linux_socket_ioctl;

pub mod metrics_collector;

// Re-export used in macro. Avoids adding git-version as dep in target crates.
#[doc(hidden)]
pub use git_version;
75 libs/utils/src/metrics_collector.rs Normal file
@@ -0,0 +1,75 @@
use std::{
    sync::{Arc, RwLock},
    time::{Duration, Instant},
};

use metrics::{IntGauge, proto::MetricFamily, register_int_gauge};
use once_cell::sync::Lazy;

pub static METRICS_STALE_MILLIS: Lazy<IntGauge> = Lazy::new(|| {
    register_int_gauge!(
        "metrics_metrics_stale_milliseconds",
        "The current metrics stale time in milliseconds"
    )
    .expect("failed to define a metric")
});

#[derive(Debug)]
pub struct CollectedMetrics {
    pub metrics: Vec<MetricFamily>,
    pub collected_at: Instant,
}

impl CollectedMetrics {
    fn new(metrics: Vec<MetricFamily>) -> Self {
        Self {
            metrics,
            collected_at: Instant::now(),
        }
    }
}

#[derive(Debug)]
pub struct MetricsCollector {
    last_collected: RwLock<Arc<CollectedMetrics>>,
}

impl MetricsCollector {
    pub fn new() -> Self {
        Self {
            last_collected: RwLock::new(Arc::new(CollectedMetrics::new(vec![]))),
        }
    }

    #[tracing::instrument(name = "metrics_collector", skip_all)]
    pub fn run_once(&self, cache_metrics: bool) -> Arc<CollectedMetrics> {
        let started = Instant::now();
        let metrics = metrics::gather();
        let collected = Arc::new(CollectedMetrics::new(metrics));
        if cache_metrics {
            let mut guard = self.last_collected.write().unwrap();
            *guard = collected.clone();
        }
        tracing::info!(
            "Collected {} metric families in {} ms",
            collected.metrics.len(),
            started.elapsed().as_millis()
        );
        collected
    }

    pub fn last_collected(&self) -> Arc<CollectedMetrics> {
        self.last_collected.read().unwrap().clone()
    }
}

impl Default for MetricsCollector {
    fn default() -> Self {
        Self::new()
    }
}

// Interval for metrics collection. Currently hard-coded to be the same as the metrics scrape interval from the obs agent.
pub static METRICS_COLLECTION_INTERVAL: Duration = Duration::from_secs(30);

pub static METRICS_COLLECTOR: Lazy<MetricsCollector> = Lazy::new(MetricsCollector::default);
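For illustration, a rough sketch of a background task that could keep the cached snapshot fresh using the items above; the loop itself is hypothetical, and only the `utils::metrics_collector` items come from this diff:

    use utils::metrics_collector::{
        METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR, METRICS_STALE_MILLIS,
    };

    // Sketch: periodically report how stale the previous cached snapshot became,
    // then gather and cache a fresh one for readers of last_collected().
    async fn metrics_refresh_loop() {
        loop {
            let previous = METRICS_COLLECTOR.last_collected();
            METRICS_STALE_MILLIS.set(previous.collected_at.elapsed().as_millis() as i64);
            METRICS_COLLECTOR.run_once(true); // cache_metrics = true
            tokio::time::sleep(METRICS_COLLECTION_INTERVAL).await;
        }
    }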
@@ -171,6 +171,12 @@ impl std::fmt::Display for ShardNumber {
    }
}

impl std::fmt::Display for ShardCount {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.0.fmt(f)
    }
}

impl std::fmt::Display for ShardSlug<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
@@ -112,6 +112,7 @@ twox-hash.workspace = true
procfs.workspace = true

[dev-dependencies]
base64.workspace = true
criterion.workspace = true
hex-literal.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
23 pageserver/client_grpc/Cargo.toml Normal file
@@ -0,0 +1,23 @@
[package]
name = "pageserver_client_grpc"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[features]
testing = ["pageserver_api/testing"]

[dependencies]
anyhow.workspace = true
bytes.workspace = true
compute_api.workspace = true
futures.workspace = true
pageserver_api.workspace = true
pageserver_page_api.workspace = true
tokio.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
tonic.workspace = true
tracing.workspace = true
utils.workspace = true
workspace_hack.workspace = true
403 pageserver/client_grpc/src/client.rs Normal file
@@ -0,0 +1,403 @@
use std::collections::HashMap;
use std::num::NonZero;
use std::sync::Arc;

use anyhow::anyhow;
use futures::stream::FuturesUnordered;
use futures::{FutureExt as _, StreamExt as _};
use tracing::instrument;

use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
use crate::retry::Retry;
use crate::split::GetPageSplitter;
use compute_api::spec::PageserverProtocol;
use pageserver_api::shard::ShardStripeSize;
use pageserver_page_api as page_api;
use utils::id::{TenantId, TimelineId};
use utils::shard::{ShardCount, ShardIndex, ShardNumber};

/// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up
/// when full.
///
/// TODO: tune all of these constants, and consider making them configurable.
/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels
/// with only streams.
const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();

/// Max number of concurrent unary request clients per shard.
const MAX_UNARY_CLIENTS: NonZero<usize> = NonZero::new(64).unwrap();

/// Max number of concurrent GetPage streams per shard. The max number of concurrent GetPage
/// requests is given by `MAX_STREAMS * MAX_STREAM_QUEUE_DEPTH`.
const MAX_STREAMS: NonZero<usize> = NonZero::new(64).unwrap();

/// Max number of pipelined requests per stream.
const MAX_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(2).unwrap();

/// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these
/// are more throughput-oriented, we have a smaller limit but higher queue depth.
const MAX_BULK_STREAMS: NonZero<usize> = NonZero::new(16).unwrap();

/// Max number of pipelined requests per bulk stream. These are more throughput-oriented and thus
/// get a larger queue depth.
const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();

/// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the
/// basic `page_api::Client` gRPC client, and supports:
///
/// * Sharded tenants across multiple Pageservers.
/// * Pooling of connections, clients, and streams for efficient resource use.
/// * Concurrent use by many callers.
/// * Internal handling of GetPage bidirectional streams, with pipelining and error handling.
/// * Automatic retries.
/// * Observability.
///
/// TODO: this client does not support base backups or LSN leases, as these are only used by
/// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards.
pub struct PageserverClient {
    // TODO: support swapping out the shard map, e.g. via an ArcSwap.
    shards: Shards,
    retry: Retry,
}

impl PageserverClient {
    /// Creates a new Pageserver client for a given tenant and timeline. Uses the Pageservers given
    /// in the shard map, which must be complete and must use gRPC URLs.
    pub fn new(
        tenant_id: TenantId,
        timeline_id: TimelineId,
        shard_map: HashMap<ShardIndex, String>,
        stripe_size: ShardStripeSize,
        auth_token: Option<String>,
    ) -> anyhow::Result<Self> {
        let shards = Shards::new(tenant_id, timeline_id, shard_map, stripe_size, auth_token)?;
        Ok(Self {
            shards,
            retry: Retry,
        })
    }

    /// Returns whether a relation exists.
    #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
    pub async fn check_rel_exists(
        &self,
        req: page_api::CheckRelExistsRequest,
    ) -> tonic::Result<page_api::CheckRelExistsResponse> {
        self.retry
            .with(async || {
                // Relation metadata is only available on shard 0.
                let mut client = self.shards.get_zero().client().await?;
                client.check_rel_exists(req).await
            })
            .await
    }

    /// Returns the total size of a database, as # of bytes.
    #[instrument(skip_all, fields(db_oid=%req.db_oid, lsn=%req.read_lsn))]
    pub async fn get_db_size(
        &self,
        req: page_api::GetDbSizeRequest,
    ) -> tonic::Result<page_api::GetDbSizeResponse> {
        self.retry
            .with(async || {
                // Relation metadata is only available on shard 0.
                let mut client = self.shards.get_zero().client().await?;
                client.get_db_size(req).await
            })
            .await
    }

    /// Fetches pages. The `request_id` must be unique across all in-flight requests. Automatically
    /// splits requests that straddle shard boundaries, and assembles the responses.
    ///
    /// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status`
    /// errors. All responses will have `GetPageStatusCode::Ok`.
    #[instrument(skip_all, fields(
        req_id = %req.request_id,
        class = %req.request_class,
        rel = %req.rel,
        blkno = %req.block_numbers[0],
        blks = %req.block_numbers.len(),
        lsn = %req.read_lsn,
    ))]
    pub async fn get_page(
        &self,
        req: page_api::GetPageRequest,
    ) -> tonic::Result<page_api::GetPageResponse> {
        // Make sure we have at least one page.
        if req.block_numbers.is_empty() {
            return Err(tonic::Status::invalid_argument("no block number"));
        }

        // Fast path: request is for a single shard.
        if let Some(shard_id) =
            GetPageSplitter::is_single_shard(&req, self.shards.count, self.shards.stripe_size)
        {
            return self.get_page_for_shard(shard_id, req).await;
        }

        // Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and
        // reassemble the responses.
        //
        // TODO: when we support shard map updates, we need to detect when it changes and re-split
        // the request on errors.
        let mut splitter = GetPageSplitter::split(req, self.shards.count, self.shards.stripe_size);

        let mut shard_requests: FuturesUnordered<_> = splitter
            .drain_requests()
            .map(|(shard_id, shard_req)| {
                // NB: each request will retry internally.
                self.get_page_for_shard(shard_id, shard_req)
                    .map(move |result| result.map(|resp| (shard_id, resp)))
            })
            .collect();

        while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? {
            splitter.add_response(shard_id, shard_response)?;
        }

        splitter.assemble_response()
    }

    /// Fetches pages that belong to the given shard.
    #[instrument(skip_all, fields(shard = %shard_id))]
    async fn get_page_for_shard(
        &self,
        shard_id: ShardIndex,
        req: page_api::GetPageRequest,
    ) -> tonic::Result<page_api::GetPageResponse> {
        let resp = self
            .retry
            .with(async || {
                let stream = self
                    .shards
                    .get(shard_id)?
                    .stream(req.request_class.is_bulk())
                    .await;
                let resp = stream.send(req.clone()).await?;

                // Convert per-request errors into a tonic::Status.
                if resp.status_code != page_api::GetPageStatusCode::Ok {
                    return Err(tonic::Status::new(
                        resp.status_code.into(),
                        resp.reason.unwrap_or_else(|| String::from("unknown error")),
                    ));
                }

                Ok(resp)
            })
            .await?;

        // Make sure we got the right number of pages.
        // NB: check outside of the retry loop, since we don't want to retry this.
        let (expected, actual) = (req.block_numbers.len(), resp.page_images.len());
        if expected != actual {
            return Err(tonic::Status::internal(format!(
                "expected {expected} pages for shard {shard_id}, got {actual}",
            )));
        }

        Ok(resp)
    }

    /// Returns the size of a relation, as # of blocks.
    #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
    pub async fn get_rel_size(
        &self,
        req: page_api::GetRelSizeRequest,
    ) -> tonic::Result<page_api::GetRelSizeResponse> {
        self.retry
            .with(async || {
                // Relation metadata is only available on shard 0.
                let mut client = self.shards.get_zero().client().await?;
                client.get_rel_size(req).await
            })
            .await
    }

    /// Fetches an SLRU segment.
    #[instrument(skip_all, fields(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn))]
    pub async fn get_slru_segment(
        &self,
        req: page_api::GetSlruSegmentRequest,
    ) -> tonic::Result<page_api::GetSlruSegmentResponse> {
        self.retry
            .with(async || {
                // SLRU segments are only available on shard 0.
                let mut client = self.shards.get_zero().client().await?;
                client.get_slru_segment(req).await
            })
            .await
    }
}

/// Tracks the tenant's shards.
struct Shards {
    /// The shard count.
    ///
    /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention.
    count: ShardCount,
    /// The stripe size. Only used for sharded tenants.
    stripe_size: ShardStripeSize,
    /// Shards by shard index.
    ///
    /// NB: unsharded tenants use count 0, like `ShardIndex::unsharded()`.
    ///
    /// INVARIANT: every shard 0..count is present.
    /// INVARIANT: shard 0 is always present.
    map: HashMap<ShardIndex, Shard>,
}

impl Shards {
    /// Creates a new set of shards based on a shard map.
    fn new(
        tenant_id: TenantId,
        timeline_id: TimelineId,
        shard_map: HashMap<ShardIndex, String>,
        stripe_size: ShardStripeSize,
        auth_token: Option<String>,
    ) -> anyhow::Result<Self> {
        let count = match shard_map.len() {
            0 => return Err(anyhow!("no shards provided")),
            1 => ShardCount::new(0), // NB: unsharded tenants use 0, like `ShardIndex::unsharded()`
            n if n > u8::MAX as usize => return Err(anyhow!("too many shards: {n}")),
            n => ShardCount::new(n as u8),
        };

        let mut map = HashMap::new();
        for (shard_id, url) in shard_map {
            // The shard index must match the computed shard count, even for unsharded tenants.
            if shard_id.shard_count != count {
                return Err(anyhow!("invalid shard index {shard_id}, expected {count}"));
            }
            // The shard index's number and count must be consistent.
            if !shard_id.is_unsharded() && shard_id.shard_number.0 >= shard_id.shard_count.0 {
                return Err(anyhow!("invalid shard index {shard_id}"));
            }
            // The above conditions guarantee that we have all shards 0..count: len() matches count,
            // shard number < count, and numbers are unique (via hashmap).
            let shard = Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?;
            map.insert(shard_id, shard);
        }

        Ok(Self {
            count,
            stripe_size,
            map,
        })
    }

    /// Looks up the given shard.
    #[allow(clippy::result_large_err)] // TODO: check perf impact
    fn get(&self, shard_id: ShardIndex) -> tonic::Result<&Shard> {
        self.map
            .get(&shard_id)
            .ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}")))
    }

    /// Returns shard 0.
    fn get_zero(&self) -> &Shard {
        self.get(ShardIndex::new(ShardNumber(0), self.count))
            .expect("always present")
    }
}

/// A single shard. Uses dedicated resource pools with the following structure:
///
/// * Channel pool: unbounded.
/// * Unary client pool: MAX_UNARY_CLIENTS.
/// * Stream client pool: unbounded.
/// * Stream pool: MAX_STREAMS and MAX_STREAM_QUEUE_DEPTH.
/// * Bulk channel pool: unbounded.
/// * Bulk client pool: unbounded.
/// * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH.
struct Shard {
    /// Unary gRPC client pool.
    client_pool: Arc<ClientPool>,
    /// GetPage stream pool.
    stream_pool: Arc<StreamPool>,
    /// GetPage stream pool for bulk requests, e.g. prefetches.
    bulk_stream_pool: Arc<StreamPool>,
}

impl Shard {
    /// Creates a new shard. It has its own dedicated resource pools.
    fn new(
        url: String,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        shard_id: ShardIndex,
        auth_token: Option<String>,
    ) -> anyhow::Result<Self> {
        // Sanity-check that the URL uses gRPC.
        if PageserverProtocol::from_connstring(&url)? != PageserverProtocol::Grpc {
            return Err(anyhow!("invalid shard URL {url}: must use gRPC"));
        }

        // Common channel pool for unary and stream requests. Bounded by client/stream pools.
        let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?;

        // Client pool for unary requests.
        let client_pool = ClientPool::new(
            channel_pool.clone(),
            tenant_id,
            timeline_id,
            shard_id,
            auth_token.clone(),
            Some(MAX_UNARY_CLIENTS),
        );

        // GetPage stream pool. Uses a dedicated client pool to avoid starving out unary clients,
        // but shares a channel pool with it (as it's unbounded).
        let stream_pool = StreamPool::new(
            ClientPool::new(
                channel_pool.clone(),
                tenant_id,
                timeline_id,
                shard_id,
                auth_token.clone(),
                None, // unbounded, limited by stream pool
            ),
            Some(MAX_STREAMS),
            MAX_STREAM_QUEUE_DEPTH,
        );

        // Bulk GetPage stream pool, e.g. for prefetches. Uses dedicated channel/client/stream pools
        // to avoid head-of-line blocking of latency-sensitive requests.
        let bulk_stream_pool = StreamPool::new(
            ClientPool::new(
                ChannelPool::new(url, MAX_CLIENTS_PER_CHANNEL)?,
                tenant_id,
                timeline_id,
                shard_id,
                auth_token,
                None, // unbounded, limited by stream pool
            ),
            Some(MAX_BULK_STREAMS),
            MAX_BULK_STREAM_QUEUE_DEPTH,
        );

        Ok(Self {
            client_pool,
            stream_pool,
            bulk_stream_pool,
        })
    }

    /// Returns a pooled client for this shard.
    async fn client(&self) -> tonic::Result<ClientGuard> {
        self.client_pool
            .get()
            .await
            .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}")))
    }

    /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream
    /// pool (e.g. for prefetches).
    async fn stream(&self, bulk: bool) -> StreamGuard {
        match bulk {
            false => self.stream_pool.get().await,
            true => self.bulk_stream_pool.get().await,
        }
    }
}
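For illustration, a rough sketch of how a caller might construct the new `PageserverClient` for an unsharded tenant and fetch pages; the endpoint URL is a placeholder, and the GetPage request and stripe size are taken as arguments rather than guessed:

    use std::collections::HashMap;

    use pageserver_api::shard::ShardStripeSize;
    use pageserver_client_grpc::PageserverClient;
    use pageserver_page_api as page_api;
    use utils::id::{TenantId, TimelineId};
    use utils::shard::ShardIndex;

    // Sketch: one unsharded shard served by a single Pageserver gRPC endpoint.
    async fn fetch_pages(
        tenant_id: TenantId,
        timeline_id: TimelineId,
        stripe_size: ShardStripeSize,
        req: page_api::GetPageRequest,
    ) -> anyhow::Result<page_api::GetPageResponse> {
        let mut shard_map = HashMap::new();
        shard_map.insert(
            ShardIndex::unsharded(),
            "grpc://pageserver:51051".to_string(), // placeholder endpoint
        );
        let client = PageserverClient::new(tenant_id, timeline_id, shard_map, stripe_size, None)?;
        Ok(client.get_page(req).await?)
    }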
6 pageserver/client_grpc/src/lib.rs Normal file
@@ -0,0 +1,6 @@
mod client;
mod pool;
mod retry;
mod split;

pub use client::PageserverClient;
761 pageserver/client_grpc/src/pool.rs Normal file
@@ -0,0 +1,761 @@
//! This module provides various Pageserver gRPC client resource pools.
//!
//! These pools are designed to reuse gRPC resources (connections, clients, and streams) across
//! multiple concurrent callers (i.e. Postgres backends). This avoids the resource cost and latency
//! of creating dedicated TCP connections and server tasks for every Postgres backend.
//!
//! Each resource has its own, nested pool. The pools are custom-built for the properties of each
//! resource -- they are different enough that a generic pool isn't suitable.
//!
//! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients
//!   can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a
//!   per-channel client limit. Channels may be closed when they are no longer used by any clients.
//!
//! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared)
//!   channel from the ChannelPool for the client's lifetime. A client can only be acquired by a
//!   single caller at a time, and is returned to the pool when dropped. Idle clients may be removed
//!   from the pool after some time, to free up the channel.
//!
//! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the
//!   ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it
//!   returns a guard that can be used to send a single request, to properly enforce queue depth and
//!   route responses. Internally, the pool will reuse or spin up a suitable stream for the request,
//!   possibly pipelining multiple requests from multiple callers on the same stream (up to some
//!   queue depth). Idle streams may be removed from the pool after a while to free up the client.
//!
//! Each channel corresponds to one TCP connection. Each client unary request and each stream
//! corresponds to one HTTP/2 stream and server task.
//!
//! TODO: error handling (including custom error types).
//! TODO: observability.

use std::collections::{BTreeMap, HashMap};
use std::num::NonZero;
use std::ops::{Deref, DerefMut};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex, Weak};
use std::time::{Duration, Instant};

use futures::StreamExt as _;
use tokio::sync::mpsc::{Receiver, Sender};
use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
use tokio_util::sync::CancellationToken;
use tonic::transport::{Channel, Endpoint};
use tracing::{error, warn};

use pageserver_page_api as page_api;
use utils::id::{TenantId, TimelineId};
use utils::shard::ShardIndex;

/// Reap channels/clients/streams that have been idle for this long.
///
/// TODO: this is per-pool. For nested pools, it can take up to 3x as long for a TCP connection to
/// be reaped. First, we must wait for an idle stream to be reaped, which marks its client as idle.
/// Then, we must wait for the idle client to be reaped, which marks its channel as idle. Then, we
/// must wait for the idle channel to be reaped. Is that a problem? Maybe not, we just have to
/// account for it when setting the reap threshold. Alternatively, we can immediately reap empty
/// channels, and/or stream pool clients.
const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) {
    false => Duration::from_secs(180),
    true => Duration::from_secs(1), // exercise reaping in tests
};

/// Reap idle resources with this interval.
const REAP_IDLE_INTERVAL: Duration = match cfg!(any(test, feature = "testing")) {
    false => Duration::from_secs(10),
    true => Duration::from_secs(1), // exercise reaping in tests
};

/// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2
/// stream multiplexing), up to `clients_per_channel` -- a new channel will be spun up beyond this.
/// The pool does not limit the number of channels, and instead relies on `ClientPool` or
/// `StreamPool` to limit the number of concurrent clients.
///
/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
///
/// TODO: consider prewarming a set of channels, to avoid initial connection latency.
/// TODO: consider adding a circuit breaker for errors and fail fast.
pub struct ChannelPool {
    /// Pageserver endpoint to connect to.
    endpoint: Endpoint,
    /// Max number of clients per channel. Beyond this, a new channel will be created.
    max_clients_per_channel: NonZero<usize>,
    /// Open channels.
    channels: Mutex<BTreeMap<ChannelID, ChannelEntry>>,
    /// Reaps idle channels.
    idle_reaper: Reaper,
    /// Channel ID generator.
    next_channel_id: AtomicUsize,
}

type ChannelID = usize;

struct ChannelEntry {
    /// The gRPC channel (i.e. TCP connection). Shared by multiple clients.
    channel: Channel,
    /// Number of clients using this channel.
    clients: usize,
    /// The channel has been idle (no clients) since this time. None if channel is in use.
    /// INVARIANT: Some if clients == 0, otherwise None.
    idle_since: Option<Instant>,
}

impl ChannelPool {
    /// Creates a new channel pool for the given Pageserver endpoint.
    pub fn new<E>(endpoint: E, max_clients_per_channel: NonZero<usize>) -> anyhow::Result<Arc<Self>>
    where
        E: TryInto<Endpoint> + Send + Sync + 'static,
        <E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
    {
        let pool = Arc::new(Self {
            endpoint: endpoint.try_into()?,
            max_clients_per_channel,
            channels: Mutex::default(),
            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
            next_channel_id: AtomicUsize::default(),
        });
        pool.idle_reaper.spawn(&pool);
        Ok(pool)
    }

    /// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel.
    ///
    /// This never blocks (except for mutex acquisition). The channel is connected lazily on first
    /// use, and the `ChannelPool` does not have a channel limit. Channels will be re-established
    /// automatically on failure (TODO: verify).
    ///
    /// Callers should not clone the returned channel, and must hold onto the returned guard as long
    /// as the channel is in use. It is unfortunately not possible to enforce this: the Protobuf
    /// client requires an owned `Channel` and we don't have access to the channel's internal
    /// refcount.
    ///
    /// This is not performance-sensitive. It is only called when creating a new client, and clients
    /// are pooled and reused by `ClientPool`. The total number of channels will also be small. O(n)
    /// performance is therefore okay.
    pub fn get(self: &Arc<Self>) -> ChannelGuard {
        let mut channels = self.channels.lock().unwrap();

        // Try to find an existing channel with available capacity. We check entries in BTreeMap
        // order, to fill up the lower-ordered channels first. The ClientPool also prefers clients
        // with lower-ordered channel IDs first. This will cluster clients in lower-ordered
        // channels, and free up higher-ordered channels such that they can be reaped.
        for (&id, entry) in channels.iter_mut() {
            assert!(
                entry.clients <= self.max_clients_per_channel.get(),
                "channel overflow"
            );
            assert_eq!(
                entry.idle_since.is_some(),
                entry.clients == 0,
                "incorrect channel idle state"
            );
            if entry.clients < self.max_clients_per_channel.get() {
                entry.clients += 1;
                entry.idle_since = None;
                return ChannelGuard {
                    pool: Arc::downgrade(self),
                    id,
                    channel: Some(entry.channel.clone()),
                };
            }
        }

        // Create a new channel. We connect lazily on first use, such that we don't block here and
        // other clients can join onto the same channel while it's connecting.
        let channel = self.endpoint.connect_lazy();

        let id = self.next_channel_id.fetch_add(1, Ordering::Relaxed);
        let entry = ChannelEntry {
            channel: channel.clone(),
            clients: 1, // account for the guard below
            idle_since: None,
        };
        channels.insert(id, entry);

        ChannelGuard {
            pool: Arc::downgrade(self),
            id,
            channel: Some(channel),
        }
    }
}

impl Reapable for ChannelPool {
    /// Reaps channels that have been idle since before the cutoff.
    fn reap_idle(&self, cutoff: Instant) {
        self.channels.lock().unwrap().retain(|_, entry| {
            let Some(idle_since) = entry.idle_since else {
                assert_ne!(entry.clients, 0, "empty channel not marked idle");
                return true;
            };
            assert_eq!(entry.clients, 0, "idle channel has clients");
            idle_since >= cutoff
        })
    }
}

/// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`,
/// since the gRPC client requires an owned `Channel`.
pub struct ChannelGuard {
    pool: Weak<ChannelPool>,
    id: ChannelID,
    channel: Option<Channel>,
}

impl ChannelGuard {
    /// Returns the inner owned channel. Panics if called more than once. The caller must hold onto
    /// the guard as long as the channel is in use, and should not clone it.
    pub fn take(&mut self) -> Channel {
        self.channel.take().expect("channel already taken")
    }
}

/// Returns the channel to the pool.
impl Drop for ChannelGuard {
    fn drop(&mut self) {
        let Some(pool) = self.pool.upgrade() else {
            return; // pool was dropped
        };

        let mut channels = pool.channels.lock().unwrap();
        let entry = channels.get_mut(&self.id).expect("unknown channel");
        assert!(entry.idle_since.is_none(), "active channel marked idle");
        assert!(entry.clients > 0, "channel underflow");
        entry.clients -= 1;
        if entry.clients == 0 {
            entry.idle_since = Some(Instant::now()); // mark channel as idle
        }
    }
}

/// A pool of gRPC clients for a single tenant shard. Each client acquires a channel from the inner
/// `ChannelPool`. A client is only given out to a single caller at a time. The pool limits the
/// total number of concurrent clients to `max_clients` via semaphore.
///
/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
pub struct ClientPool {
    /// Tenant ID.
    tenant_id: TenantId,
    /// Timeline ID.
    timeline_id: TimelineId,
    /// Shard ID.
    shard_id: ShardIndex,
    /// Authentication token, if any.
    auth_token: Option<String>,
    /// Channel pool to acquire channels from.
    channel_pool: Arc<ChannelPool>,
    /// Limits the max number of concurrent clients for this pool. None if the pool is unbounded.
    limiter: Option<Arc<Semaphore>>,
    /// Idle pooled clients. Acquired clients are removed from here and returned on drop.
    ///
    /// The first client in the map will be acquired next. The map is sorted by client ID, which in
    /// turn is sorted by its channel ID, such that we prefer acquiring idle clients from
    /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle
    /// clients are reaped.
    idle: Mutex<BTreeMap<ClientID, ClientEntry>>,
    /// Reaps idle clients.
    idle_reaper: Reaper,
    /// Unique client ID generator.
    next_client_id: AtomicUsize,
}

type ClientID = (ChannelID, usize);

struct ClientEntry {
    /// The pooled gRPC client.
    client: page_api::Client,
    /// The channel guard for the channel used by the client.
    channel_guard: ChannelGuard,
    /// The client has been idle since this time. All clients in `ClientPool::idle` are idle by
    /// definition, so this is the time when it was added back to the pool.
    idle_since: Instant,
}

impl ClientPool {
    /// Creates a new client pool for the given tenant shard. Channels are acquired from the given
    /// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard. Allows up to
    /// `max_clients` concurrent clients, or unbounded if None.
    pub fn new(
        channel_pool: Arc<ChannelPool>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        shard_id: ShardIndex,
        auth_token: Option<String>,
        max_clients: Option<NonZero<usize>>,
    ) -> Arc<Self> {
        let pool = Arc::new(Self {
            tenant_id,
            timeline_id,
            shard_id,
            auth_token,
            channel_pool,
            idle: Mutex::default(),
            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
            limiter: max_clients.map(|max| Arc::new(Semaphore::new(max.get()))),
            next_client_id: AtomicUsize::default(),
        });
        pool.idle_reaper.spawn(&pool);
        pool
    }

    /// Gets a client from the pool, or creates a new one if necessary. Connections are established
    /// lazily and do not block, but this call can block if the pool is at `max_clients`. The client
    /// is returned to the pool when the guard is dropped.
    ///
    /// This is moderately performance-sensitive. It is called for every unary request, but these
    /// establish a new gRPC stream per request so they're already expensive. GetPage requests use
    /// the `StreamPool` instead.
    pub async fn get(self: &Arc<Self>) -> anyhow::Result<ClientGuard> {
        // Acquire a permit if the pool is bounded.
        let mut permit = None;
        if let Some(limiter) = self.limiter.clone() {
            permit = Some(limiter.acquire_owned().await.expect("never closed"));
        }

        // Fast path: acquire an idle client from the pool.
        if let Some((id, entry)) = self.idle.lock().unwrap().pop_first() {
            return Ok(ClientGuard {
                pool: Arc::downgrade(self),
                id,
                client: Some(entry.client),
                channel_guard: Some(entry.channel_guard),
                permit,
            });
        }

        // Slow path: construct a new client.
        let mut channel_guard = self.channel_pool.get();
        let client = page_api::Client::new(
            channel_guard.take(),
            self.tenant_id,
            self.timeline_id,
            self.shard_id,
            self.auth_token.clone(),
            None,
        )?;

        Ok(ClientGuard {
            pool: Arc::downgrade(self),
            id: (
                channel_guard.id,
                self.next_client_id.fetch_add(1, Ordering::Relaxed),
            ),
            client: Some(client),
            channel_guard: Some(channel_guard),
            permit,
        })
    }
}

impl Reapable for ClientPool {
    /// Reaps clients that have been idle since before the cutoff.
    fn reap_idle(&self, cutoff: Instant) {
        self.idle
            .lock()
            .unwrap()
            .retain(|_, entry| entry.idle_since >= cutoff)
    }
}

/// A client acquired from the pool. The inner client can be accessed via Deref. The client is
/// returned to the pool when dropped.
pub struct ClientGuard {
    pool: Weak<ClientPool>,
    id: ClientID,
    client: Option<page_api::Client>,     // Some until dropped
    channel_guard: Option<ChannelGuard>,  // Some until dropped
    permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
}

impl Deref for ClientGuard {
    type Target = page_api::Client;

    fn deref(&self) -> &Self::Target {
        self.client.as_ref().expect("not dropped")
    }
}

impl DerefMut for ClientGuard {
    fn deref_mut(&mut self) -> &mut Self::Target {
        self.client.as_mut().expect("not dropped")
    }
}

/// Returns the client to the pool.
impl Drop for ClientGuard {
    fn drop(&mut self) {
        let Some(pool) = self.pool.upgrade() else {
            return; // pool was dropped
        };

        let entry = ClientEntry {
            client: self.client.take().expect("dropped once"),
            channel_guard: self.channel_guard.take().expect("dropped once"),
            idle_since: Instant::now(),
        };
        pool.idle.lock().unwrap().insert(self.id, entry);

        _ = self.permit; // returned on drop, referenced for visibility
    }
}

/// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream
/// acquires a client from the inner `ClientPool` for the stream's lifetime.
///
/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send
/// a single request and await the response. Internally, requests are multiplexed across streams and
/// channels. This allows proper queue depth enforcement and response routing.
///
/// TODO: consider making this generic over request and response types; not currently needed.
pub struct StreamPool {
    /// The client pool to acquire clients from. Must be unbounded.
    client_pool: Arc<ClientPool>,
    /// All pooled streams.
    ///
    /// Incoming requests will be sent over an existing stream with available capacity. If all
    /// streams are full, a new one is spun up and added to the pool (up to `max_streams`). Each
    /// stream has an associated Tokio task that processes requests and responses.
    streams: Mutex<HashMap<StreamID, StreamEntry>>,
    /// The max number of concurrent streams, or None if unbounded.
    max_streams: Option<NonZero<usize>>,
    /// The max number of concurrent requests per stream.
    max_queue_depth: NonZero<usize>,
    /// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`.
    /// None if the pool is unbounded.
    limiter: Option<Arc<Semaphore>>,
    /// Reaps idle streams.
    idle_reaper: Reaper,
    /// Stream ID generator.
    next_stream_id: AtomicUsize,
}

type StreamID = usize;
type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>;
type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>;
type ResponseSender = oneshot::Sender<tonic::Result<page_api::GetPageResponse>>;

struct StreamEntry {
    /// Sends caller requests to the stream task. The stream task exits when this is dropped.
    sender: RequestSender,
    /// Number of in-flight requests on this stream.
    queue_depth: usize,
    /// The time when this stream went idle (queue_depth == 0).
    /// INVARIANT: Some if queue_depth == 0, otherwise None.
    idle_since: Option<Instant>,
}

impl StreamPool {
    /// Creates a new stream pool, using the given client pool. It will send up to `max_queue_depth`
    /// concurrent requests on each stream, and use up to `max_streams` concurrent streams.
    ///
    /// The client pool must be unbounded. The stream pool will enforce its own limits, and because
    /// streams are long-lived they can cause persistent starvation if they exhaust the client pool.
    /// The stream pool should generally have its own dedicated client pool (but it can share a
    /// channel pool with others since these are always unbounded).
    pub fn new(
        client_pool: Arc<ClientPool>,
        max_streams: Option<NonZero<usize>>,
        max_queue_depth: NonZero<usize>,
    ) -> Arc<Self> {
        assert!(client_pool.limiter.is_none(), "bounded client pool");
        let pool = Arc::new(Self {
            client_pool,
            streams: Mutex::default(),
            limiter: max_streams.map(|max_streams| {
                Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get()))
            }),
            max_streams,
            max_queue_depth,
            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
            next_stream_id: AtomicUsize::default(),
        });
        pool.idle_reaper.spawn(&pool);
        pool
    }

    /// Acquires an available stream from the pool, or spins up a new stream async if all streams
    /// are full. Returns a guard that can be used to send a single request on the stream and await
    /// the response, with queue depth quota already acquired. Blocks if the pool is at capacity
    /// (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight).
    ///
    /// This is very performance-sensitive, as it is on the GetPage hot path.
    ///
    /// TODO: this must do something more sophisticated for performance. We want:
    ///
    /// * Cheap, concurrent access in the common case where we can use a pooled stream.
    /// * Quick acquisition of pooled streams with available capacity.
    /// * Prefer streams that belong to lower-numbered channels, to reap idle channels.
    /// * Prefer filling up existing streams' queue depth before spinning up new streams.
    /// * Don't hold a lock while spinning up new streams.
    /// * Allow concurrent clients to join onto streams while they're spun up.
    /// * Allow spinning up multiple streams concurrently, but don't overshoot limits.
    ///
    /// For now, we just do something simple but inefficient (linear scan under mutex).
    pub async fn get(self: &Arc<Self>) -> StreamGuard {
        // Acquire a permit if the pool is bounded.
        let mut permit = None;
        if let Some(limiter) = self.limiter.clone() {
            permit = Some(limiter.acquire_owned().await.expect("never closed"));
        }
        let mut streams = self.streams.lock().unwrap();

        // Look for a pooled stream with available capacity.
        for (&id, entry) in streams.iter_mut() {
            assert!(
                entry.queue_depth <= self.max_queue_depth.get(),
                "stream queue overflow"
            );
            assert_eq!(
                entry.idle_since.is_some(),
                entry.queue_depth == 0,
                "incorrect stream idle state"
            );
            if entry.queue_depth < self.max_queue_depth.get() {
                entry.queue_depth += 1;
                entry.idle_since = None;
                return StreamGuard {
                    pool: Arc::downgrade(self),
                    id,
                    sender: entry.sender.clone(),
                    permit,
                };
            }
        }

        // No available stream, spin up a new one. We install the stream entry in the pool first and
        // return the guard, while spinning up the stream task async. This allows other callers to
        // join onto this stream and also create additional streams concurrently if this fills up.
        let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
        let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get());
        let entry = StreamEntry {
            sender: req_tx.clone(),
            queue_depth: 1, // reserve quota for this caller
            idle_since: None,
        };
        streams.insert(id, entry);

        if let Some(max_streams) = self.max_streams {
            assert!(streams.len() <= max_streams.get(), "stream overflow");
        };

        let client_pool = self.client_pool.clone();
        let pool = Arc::downgrade(self);

        tokio::spawn(async move {
            if let Err(err) = Self::run_stream(client_pool, req_rx).await {
                error!("stream failed: {err}");
            }
            // Remove stream from pool on exit. Weak reference to avoid holding the pool alive.
            if let Some(pool) = pool.upgrade() {
                let entry = pool.streams.lock().unwrap().remove(&id);
                assert!(entry.is_some(), "unknown stream ID: {id}");
            }
        });

        StreamGuard {
            pool: Arc::downgrade(self),
            id,
            sender: req_tx,
            permit,
        }
    }

    /// Runs a stream task. This acquires a client from the `ClientPool` and establishes a
    /// bidirectional GetPage stream, then forwards requests and responses between callers and the
    /// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be
    /// atomic with pool stream acquisition.
    ///
    /// The task exits when the request channel is closed, or on a stream error. The caller is
    /// responsible for removing the stream from the pool on exit.
    async fn run_stream(
        client_pool: Arc<ClientPool>,
        mut caller_rx: RequestReceiver,
    ) -> anyhow::Result<()> {
        // Acquire a client from the pool and create a stream.
        let mut client = client_pool.get().await?;

        // NB: use an unbounded channel such that the stream send never blocks. Otherwise, we could
        // theoretically deadlock if both the client and server block on sends (since we're not
        // reading responses while sending). This is unlikely to happen due to gRPC/TCP buffers and
        // low queue depths, but it was seen to happen with the libpq protocol so better safe than
        // sorry. It should never buffer more than the queue depth anyway, but using an unbounded
        // channel guarantees that it will never block.
        let (req_tx, req_rx) = mpsc::unbounded_channel();
        let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx);
        let mut resp_stream = client.get_pages(req_stream).await?;

        // Track caller response channels by request ID. If the task returns early, these response
        // channels will be dropped and the waiting callers will receive an error.
        let mut callers = HashMap::new();

        // Process requests and responses.
        loop {
            tokio::select! {
                // Receive requests from callers and send them to the stream.
                req = caller_rx.recv() => {
                    // Shut down if request channel is closed.
                    let Some((req, resp_tx)) = req else {
                        return Ok(());
                    };

                    // Store the response channel by request ID.
                    if callers.contains_key(&req.request_id) {
                        // Error on request ID duplicates. Ignore callers that went away.
                        _ = resp_tx.send(Err(tonic::Status::invalid_argument(
                            format!("duplicate request ID: {}", req.request_id),
                        )));
                        continue;
                    }
                    callers.insert(req.request_id, resp_tx);

                    // Send the request on the stream. Bail out if the stream is closed.
                    req_tx.send(req).map_err(|_| {
                        tonic::Status::unavailable("stream closed")
                    })?;
                }

                // Receive responses from the stream and send them to callers.
                resp = resp_stream.next() => {
                    // Shut down if the stream is closed, and bail out on stream errors.
                    let Some(resp) = resp.transpose()? else {
                        return Ok(())
                    };

                    // Send the response to the caller. Ignore errors if the caller went away.
                    let Some(resp_tx) = callers.remove(&resp.request_id) else {
                        warn!("received response for unknown request ID: {}", resp.request_id);
                        continue;
                    };
                    _ = resp_tx.send(Ok(resp));
                }
            }
        }
    }
}

impl Reapable for StreamPool {
    /// Reaps streams that have been idle since before the cutoff.
    fn reap_idle(&self, cutoff: Instant) {
        self.streams.lock().unwrap().retain(|_, entry| {
            let Some(idle_since) = entry.idle_since else {
                assert_ne!(entry.queue_depth, 0, "empty stream not marked idle");
                return true;
            };
            assert_eq!(entry.queue_depth, 0, "idle stream has requests");
            idle_since >= cutoff
        });
    }
}

/// A pooled stream reference. Can be used to send a single request, to properly enforce queue
/// depth. Queue depth is already reserved and will be returned on drop.
pub struct StreamGuard {
    pool: Weak<StreamPool>,
    id: StreamID,
    sender: RequestSender,
    permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
}

impl StreamGuard {
    /// Sends a request on the stream and awaits the response. Consumes the guard, since it's only
    /// valid for a single request (to enforce queue depth). This also drops the guard on return and
    /// returns the queue depth quota to the pool.
    ///
    /// The `GetPageRequest::request_id` must be unique across in-flight requests.
    ///
    /// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status`
    /// to avoid tearing down the stream for per-request errors. Callers must check this.
    pub async fn send(
        self,
        req: page_api::GetPageRequest,
    ) -> tonic::Result<page_api::GetPageResponse> {
        let (resp_tx, resp_rx) = oneshot::channel();

        self.sender
            .send((req, resp_tx))
            .await
            .map_err(|_| tonic::Status::unavailable("stream closed"))?;

        resp_rx
            .await
            .map_err(|_| tonic::Status::unavailable("stream closed"))?
    }
}

impl Drop for StreamGuard {
    fn drop(&mut self) {
        let Some(pool) = self.pool.upgrade() else {
            return; // pool was dropped
        };

        // Release the queue depth reservation on drop. This can prematurely decrement it if dropped
        // before the response is received, but that's okay.
        let mut streams = pool.streams.lock().unwrap();
        let entry = streams.get_mut(&self.id).expect("unknown stream");
        assert!(entry.idle_since.is_none(), "active stream marked idle");
        assert!(entry.queue_depth > 0, "stream queue underflow");
        entry.queue_depth -= 1;
        if entry.queue_depth == 0 {
            entry.idle_since = Some(Instant::now()); // mark stream as idle
        }

        _ = self.permit; // returned on drop, referenced for visibility
    }
}

/// Periodically reaps idle resources from a pool.
struct Reaper {
    /// The task check interval.
    interval: Duration,
    /// The threshold for reaping idle resources.
    threshold: Duration,
    /// Cancels the reaper task. Cancelled when the reaper is dropped.
    cancel: CancellationToken,
}

impl Reaper {
    /// Creates a new reaper.
    pub fn new(threshold: Duration, interval: Duration) -> Self {
        Self {
            cancel: CancellationToken::new(),
            threshold,
            interval,
        }
    }

    /// Spawns a task to periodically reap idle resources from the given task pool. The task is
    /// cancelled when the reaper is dropped.
    pub fn spawn(&self, pool: &Arc<impl Reapable>) {
        // NB: hold a weak pool reference, otherwise the task will prevent dropping the pool.
        let pool = Arc::downgrade(pool);
        let cancel = self.cancel.clone();
        let (interval, threshold) = (self.interval, self.threshold);

        tokio::spawn(async move {
            loop {
                tokio::select! {
                    _ = tokio::time::sleep(interval) => {
                        let Some(pool) = pool.upgrade() else {
                            return; // pool was dropped
                        };
                        pool.reap_idle(Instant::now() - threshold);
                    }

                    _ = cancel.cancelled() => return,
                }
            }
        });
    }
}

impl Drop for Reaper {
    fn drop(&mut self) {
        self.cancel.cancel(); // cancel reaper task
    }
}

/// A reapable resource pool.
trait Reapable: Send + Sync + 'static {
    /// Reaps resources that have been idle since before the given cutoff.
    fn reap_idle(&self, cutoff: Instant);
}
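For illustration, a rough sketch (crate-internal, since only `PageserverClient` is re-exported from the crate) of how the nested pools compose for a single unary request; the endpoint, limits, and function name are placeholders:

    use std::num::NonZero;

    use crate::pool::{ChannelPool, ClientPool};
    use pageserver_page_api as page_api;
    use utils::id::{TenantId, TimelineId};
    use utils::shard::ShardIndex;

    // Sketch: channels are shared by up to 16 clients each; the client pool caps
    // concurrent unary clients at 4. ClientGuard derefs to page_api::Client.
    async fn rel_exists_once(
        tenant_id: TenantId,
        timeline_id: TimelineId,
        req: page_api::CheckRelExistsRequest,
    ) -> anyhow::Result<page_api::CheckRelExistsResponse> {
        let channels = ChannelPool::new(
            "grpc://pageserver:51051".to_string(), // placeholder endpoint
            NonZero::new(16).unwrap(),
        )?;
        let clients = ClientPool::new(
            channels,
            tenant_id,
            timeline_id,
            ShardIndex::unsharded(),
            None,                           // no auth token
            Some(NonZero::new(4).unwrap()), // bound the pool
        );
        let mut client = clients.get().await?;
        Ok(client.check_rel_exists(req).await?)
    }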
151
pageserver/client_grpc/src/retry.rs
Normal file
151
pageserver/client_grpc/src/retry.rs
Normal file
@@ -0,0 +1,151 @@
|
||||
use std::time::Duration;

use tokio::time::Instant;
use tracing::{error, info, warn};

use utils::backoff::exponential_backoff_duration;

/// A retry handler for Pageserver gRPC requests.
///
/// This is used instead of backoff::retry for better control and observability.
pub struct Retry;

impl Retry {
    /// The per-request timeout.
    // TODO: tune these, and/or make them configurable. Should we retry forever?
    const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
    /// The total timeout across all attempts.
    const TOTAL_TIMEOUT: Duration = Duration::from_secs(60);
    /// The initial backoff duration.
    const BASE_BACKOFF: Duration = Duration::from_millis(10);
    /// The maximum backoff duration.
    const MAX_BACKOFF: Duration = Duration::from_secs(10);
    /// If true, log successful requests. For debugging.
    const LOG_SUCCESS: bool = false;

    /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
    /// using the current tracing span for context.
    ///
    /// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default
    /// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`].
    pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
    where
        F: FnMut() -> O,
        O: Future<Output = tonic::Result<T>>,
    {
        let started = Instant::now();
        let deadline = started + Self::TOTAL_TIMEOUT;
        let mut last_error = None;
        let mut retries = 0;
        loop {
            // Set up a future to wait for the backoff (if any) and run the request with a timeout.
            let backoff_and_try = async {
                // NB: sleep() always sleeps 1ms, even when given a 0 argument. See:
                // https://github.com/tokio-rs/tokio/issues/6866
                if let Some(backoff) = Self::backoff_duration(retries) {
                    tokio::time::sleep(backoff).await;
                }

                let request_started = Instant::now();
                tokio::time::timeout(Self::REQUEST_TIMEOUT, f())
                    .await
                    .map_err(|_| {
                        tonic::Status::deadline_exceeded(format!(
                            "request timed out after {:.3}s",
                            request_started.elapsed().as_secs_f64()
                        ))
                    })?
            };

            // Wait for the backoff and request, or bail out if the total timeout is exceeded.
            let result = tokio::select! {
                result = backoff_and_try => result,

                _ = tokio::time::sleep_until(deadline) => {
                    let last_error = last_error.unwrap_or_else(|| {
                        tonic::Status::deadline_exceeded(format!(
                            "request timed out after {:.3}s",
                            started.elapsed().as_secs_f64()
                        ))
                    });
                    error!(
                        "giving up after {:.3}s and {retries} retries, last error {:?}: {}",
                        started.elapsed().as_secs_f64(), last_error.code(), last_error.message(),
                    );
                    return Err(last_error);
                }
            };

            match result {
                // Success, return the result.
                Ok(result) => {
                    if retries > 0 || Self::LOG_SUCCESS {
                        info!(
                            "request succeeded after {retries} retries in {:.3}s",
                            started.elapsed().as_secs_f64(),
                        );
                    }

                    return Ok(result);
                }

                // Error, retry or bail out.
                Err(status) => {
                    let (code, message) = (status.code(), status.message());
                    let attempt = retries + 1;

                    if !Self::should_retry(code) {
                        // NB: include the attempt here too. This isn't necessarily the first
                        // attempt, because the error may change between attempts.
                        error!(
                            "request failed with {code:?}: {message}, not retrying (attempt {attempt})"
                        );
                        return Err(status);
                    }

                    warn!("request failed with {code:?}: {message}, retrying (attempt {attempt})");

                    retries += 1;
                    last_error = Some(status);
                }
            }
        }
    }

    /// Returns the backoff duration for the given retry attempt, or None for no backoff.
    fn backoff_duration(retry: usize) -> Option<Duration> {
        let backoff = exponential_backoff_duration(
            retry as u32,
            Self::BASE_BACKOFF.as_secs_f64(),
            Self::MAX_BACKOFF.as_secs_f64(),
        );
        (!backoff.is_zero()).then_some(backoff)
    }

    /// Returns true if the given status code should be retried.
    fn should_retry(code: tonic::Code) -> bool {
        match code {
            tonic::Code::Ok => panic!("unexpected Ok status code"),

            // These codes are transient, so retry them.
            tonic::Code::Aborted => true,
            tonic::Code::Cancelled => true,
            tonic::Code::DeadlineExceeded => true, // maybe transient slowness
            tonic::Code::Internal => true,         // maybe transient failure?
            tonic::Code::ResourceExhausted => true,
            tonic::Code::Unavailable => true,

            // The following codes will likely continue to fail, so don't retry.
            tonic::Code::AlreadyExists => false,
            tonic::Code::DataLoss => false,
            tonic::Code::FailedPrecondition => false,
            tonic::Code::InvalidArgument => false,
            tonic::Code::NotFound => false,
            tonic::Code::OutOfRange => false,
            tonic::Code::PermissionDenied => false,
            tonic::Code::Unauthenticated => false,
            tonic::Code::Unimplemented => false,
            tonic::Code::Unknown => false,
        }
    }
}
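As a usage illustration (a hedged sketch, not taken from the diff): a caller wraps each attempt in a closure passed to Retry::with; the closure is re-invoked on every retry and must return a fresh future.

    // Hypothetical call site for Retry::with (not part of the diff).
    async fn fetch_with_retries() -> tonic::Result<String> {
        let mut attempts = 0u32;
        Retry
            .with(|| {
                // Re-invoked for every attempt, with exponential backoff in between.
                attempts += 1;
                async move {
                    if attempts < 3 {
                        // Unavailable is one of the codes that should_retry() retries.
                        Err(tonic::Status::unavailable("transient failure"))
                    } else {
                        Ok("done".to_string())
                    }
                }
            })
            .await
    }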
172 pageserver/client_grpc/src/split.rs (new file)
@@ -0,0 +1,172 @@
use std::collections::HashMap;

use bytes::Bytes;

use pageserver_api::key::rel_block_to_key;
use pageserver_api::shard::{ShardStripeSize, key_to_shard_number};
use pageserver_page_api as page_api;
use utils::shard::{ShardCount, ShardIndex};

/// Splits GetPageRequests that straddle shard boundaries and assembles the responses.
/// TODO: add tests for this.
pub struct GetPageSplitter {
    /// The original request ID. Used for all shard requests.
    request_id: page_api::RequestID,
    /// Split requests by shard index.
    requests: HashMap<ShardIndex, page_api::GetPageRequest>,
    /// Maps the offset in `GetPageRequest::block_numbers` to the owning shard. Used to assemble
    /// the response pages in the same order as the original request.
    block_shards: Vec<ShardIndex>,
    /// Page responses by shard index. Will be assembled into a single response.
    responses: HashMap<ShardIndex, Vec<Bytes>>,
}

impl GetPageSplitter {
    /// Checks if the given request only touches a single shard, and returns the shard ID. This is
    /// the common case, so we check first in order to avoid unnecessary allocations and overhead.
    /// The caller must ensure that the request has at least one block number, or this will panic.
    pub fn is_single_shard(
        req: &page_api::GetPageRequest,
        count: ShardCount,
        stripe_size: ShardStripeSize,
    ) -> Option<ShardIndex> {
        // Fast path: unsharded tenant.
        if count.is_unsharded() {
            return Some(ShardIndex::unsharded());
        }

        // Find the base shard index for the first page, and compare with the rest.
        let key = rel_block_to_key(req.rel, *req.block_numbers.first().expect("no pages"));
        let shard_number = key_to_shard_number(count, stripe_size, &key);

        req.block_numbers
            .iter()
            .skip(1) // computed above
            .all(|&blkno| {
                let key = rel_block_to_key(req.rel, blkno);
                key_to_shard_number(count, stripe_size, &key) == shard_number
            })
            .then_some(ShardIndex::new(shard_number, count))
    }

    /// Splits the given request.
    pub fn split(
        req: page_api::GetPageRequest,
        count: ShardCount,
        stripe_size: ShardStripeSize,
    ) -> Self {
        // The caller should make sure we don't split requests unnecessarily.
        debug_assert!(
            Self::is_single_shard(&req, count, stripe_size).is_none(),
            "unnecessary request split"
        );

        // Split the requests by shard index.
        let mut requests = HashMap::with_capacity(2); // common case
        let mut block_shards = Vec::with_capacity(req.block_numbers.len());
        for blkno in req.block_numbers {
            let key = rel_block_to_key(req.rel, blkno);
            let shard_number = key_to_shard_number(count, stripe_size, &key);
            let shard_id = ShardIndex::new(shard_number, count);

            let shard_req = requests
                .entry(shard_id)
                .or_insert_with(|| page_api::GetPageRequest {
                    request_id: req.request_id,
                    request_class: req.request_class,
                    rel: req.rel,
                    read_lsn: req.read_lsn,
                    block_numbers: Vec::new(),
                });
            shard_req.block_numbers.push(blkno);
            block_shards.push(shard_id);
        }

        Self {
            request_id: req.request_id,
            responses: HashMap::with_capacity(requests.len()),
            requests,
            block_shards,
        }
    }

    /// Drains the per-shard requests, moving them out of the hashmap to avoid extra allocations.
    pub fn drain_requests(
        &mut self,
    ) -> impl Iterator<Item = (ShardIndex, page_api::GetPageRequest)> {
        self.requests.drain()
    }

    /// Adds a response from the given shard.
    #[allow(clippy::result_large_err)]
    pub fn add_response(
        &mut self,
        shard_id: ShardIndex,
        response: page_api::GetPageResponse,
    ) -> tonic::Result<()> {
        // The caller should already have converted status codes into tonic::Status.
        assert_eq!(response.status_code, page_api::GetPageStatusCode::Ok);

        // Make sure the response matches the request ID.
        if response.request_id != self.request_id {
            return Err(tonic::Status::internal(format!(
                "response ID {} does not match request ID {}",
                response.request_id, self.request_id
            )));
        }

        // Add the response data to the map.
        let old = self.responses.insert(shard_id, response.page_images);

        if old.is_some() {
            return Err(tonic::Status::internal(format!(
                "duplicate response for shard {shard_id}",
            )));
        }

        Ok(())
    }

    /// Assembles the shard responses into a single response. Responses must be present for all
    /// relevant shards, and the total number of pages must match the original request.
    #[allow(clippy::result_large_err)]
    pub fn assemble_response(self) -> tonic::Result<page_api::GetPageResponse> {
        let mut response = page_api::GetPageResponse {
            request_id: self.request_id,
            status_code: page_api::GetPageStatusCode::Ok,
            reason: None,
            page_images: Vec::with_capacity(self.block_shards.len()),
        };

        // Set up per-shard page iterators we can pull from.
        let mut shard_responses = HashMap::with_capacity(self.responses.len());
        for (shard_id, responses) in self.responses {
            shard_responses.insert(shard_id, responses.into_iter());
        }

        // Reassemble the responses in the same order as the original request.
        for shard_id in &self.block_shards {
            let page = shard_responses
                .get_mut(shard_id)
                .ok_or_else(|| {
                    tonic::Status::internal(format!("missing response for shard {shard_id}"))
                })?
                .next()
                .ok_or_else(|| {
                    tonic::Status::internal(format!("missing page from shard {shard_id}"))
                })?;
            response.page_images.push(page);
        }

        // Make sure there are no additional pages.
        for (shard_id, mut pages) in shard_responses {
            if pages.next().is_some() {
                return Err(tonic::Status::internal(format!(
                    "extra pages returned from shard {shard_id}"
                )));
            }
        }

        Ok(response)
    }
}
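For context, a hedged sketch of the intended call sequence follows; send_to_shard is a hypothetical stand-in for the real per-shard transport, and only the GetPageSplitter methods above come from the diff.

    // Hypothetical driver showing the GetPageSplitter call sequence (not part of the diff).
    async fn get_pages_across_shards(
        req: page_api::GetPageRequest,
        count: ShardCount,
        stripe_size: ShardStripeSize,
    ) -> tonic::Result<page_api::GetPageResponse> {
        // In the real client, is_single_shard() is checked first to skip splitting entirely.
        let mut splitter = GetPageSplitter::split(req, count, stripe_size);

        // Send each per-shard request and record its response.
        for (shard_id, shard_req) in splitter.drain_requests().collect::<Vec<_>>() {
            let shard_resp = send_to_shard(shard_id, shard_req).await?;
            splitter.add_response(shard_id, shard_resp)?;
        }

        // Reassemble the pages in the order of the original request.
        splitter.assemble_response()
    }

    // Stand-in for the real per-shard gRPC call.
    async fn send_to_shard(
        _shard_id: ShardIndex,
        _req: page_api::GetPageRequest,
    ) -> tonic::Result<page_api::GetPageResponse> {
        unimplemented!("hypothetical transport")
    }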
@@ -1,10 +1,101 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver::tenant::IndexPart;
|
||||
use pageserver::tenant::{
|
||||
IndexPart,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
remote_timeline_client::remote_layer_path,
|
||||
storage_layer::{PersistentLayerDesc, ReadableLayerWeak},
|
||||
};
|
||||
use pageserver_api::key::Key;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
shard::TenantShardId,
|
||||
};
|
||||
|
||||
#[derive(clap::Subcommand)]
|
||||
pub(crate) enum IndexPartCmd {
|
||||
Dump { path: Utf8PathBuf },
|
||||
Dump {
|
||||
path: Utf8PathBuf,
|
||||
},
|
||||
/// Find all layers that need to be searched to construct the given page at the given LSN.
|
||||
Search {
|
||||
#[arg(long)]
|
||||
tenant_id: String,
|
||||
#[arg(long)]
|
||||
timeline_id: String,
|
||||
#[arg(long)]
|
||||
path: Utf8PathBuf,
|
||||
#[arg(long)]
|
||||
key: String,
|
||||
#[arg(long)]
|
||||
lsn: String,
|
||||
},
|
||||
}
|
||||
|
||||
async fn search_layers(
|
||||
tenant_id: &str,
|
||||
timeline_id: &str,
|
||||
path: &Utf8PathBuf,
|
||||
key: &str,
|
||||
lsn: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
let tenant_id = TenantId::from_str(tenant_id).unwrap();
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
let timeline_id = TimelineId::from_str(timeline_id).unwrap();
|
||||
let index_json = {
|
||||
let bytes = tokio::fs::read(path).await?;
|
||||
IndexPart::from_json_bytes(&bytes).unwrap()
|
||||
};
|
||||
let mut layer_map = LayerMap::default();
|
||||
{
|
||||
let mut updates = layer_map.batch_update();
|
||||
for (key, value) in index_json.layer_metadata.iter() {
|
||||
updates.insert_historic(PersistentLayerDesc::from_filename(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
key.clone(),
|
||||
value.file_size,
|
||||
));
|
||||
}
|
||||
}
|
||||
let key = Key::from_hex(key)?;
|
||||
|
||||
let lsn = Lsn::from_str(lsn).unwrap();
|
||||
let mut end_lsn = lsn;
|
||||
loop {
|
||||
let result = layer_map.search(key, end_lsn);
|
||||
match result {
|
||||
Some(SearchResult { layer, lsn_floor }) => {
|
||||
let disk_layer = match layer {
|
||||
ReadableLayerWeak::PersistentLayer(layer) => layer,
|
||||
ReadableLayerWeak::InMemoryLayer(_) => {
|
||||
anyhow::bail!("unexpected in-memory layer")
|
||||
}
|
||||
};
|
||||
|
||||
let metadata = index_json
|
||||
.layer_metadata
|
||||
.get(&disk_layer.layer_name())
|
||||
.unwrap();
|
||||
println!(
|
||||
"{}",
|
||||
remote_layer_path(
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
metadata.shard,
|
||||
&disk_layer.layer_name(),
|
||||
metadata.generation
|
||||
)
|
||||
);
|
||||
end_lsn = lsn_floor;
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
|
||||
@@ -16,5 +107,12 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
|
||||
println!("{output}");
|
||||
Ok(())
|
||||
}
|
||||
IndexPartCmd::Search {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
path,
|
||||
key,
|
||||
lsn,
|
||||
} => search_layers(tenant_id, timeline_id, path, key, lsn).await,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,23 +1,151 @@
|
||||
use anyhow::Result;
|
||||
use anyhow::Context as _;
|
||||
use futures::{Stream, StreamExt as _, TryStreamExt as _};
|
||||
use tokio::io::AsyncRead;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tonic::codec::CompressionEncoding;
|
||||
use tonic::metadata::AsciiMetadataValue;
|
||||
use tonic::metadata::errors::InvalidMetadataValue;
|
||||
use tonic::transport::Channel;
|
||||
use tonic::{Request, Streaming};
|
||||
use tonic::service::Interceptor;
|
||||
use tonic::service::interceptor::InterceptedService;
|
||||
use tonic::transport::{Channel, Endpoint};
|
||||
|
||||
use utils::id::TenantId;
|
||||
use utils::id::TimelineId;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
use crate::model;
|
||||
use crate::model::*;
|
||||
use crate::proto;
|
||||
|
||||
///
|
||||
/// AuthInterceptor adds tenant, timeline, and auth header to the channel. These
|
||||
/// headers are required at the pageserver.
|
||||
///
|
||||
/// A basic Pageserver gRPC client, for a single tenant shard. This API uses native Rust domain
|
||||
/// types from `model` rather than generated Protobuf types.
|
||||
pub struct Client {
|
||||
inner: proto::PageServiceClient<InterceptedService<Channel, AuthInterceptor>>,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
/// Connects to the given gRPC endpoint.
|
||||
pub async fn connect<E>(
|
||||
endpoint: E,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
shard_id: ShardIndex,
|
||||
auth_token: Option<String>,
|
||||
compression: Option<CompressionEncoding>,
|
||||
) -> anyhow::Result<Self>
|
||||
where
|
||||
E: TryInto<Endpoint> + Send + Sync + 'static,
|
||||
<E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
|
||||
{
|
||||
let endpoint: Endpoint = endpoint.try_into().context("invalid endpoint")?;
|
||||
let channel = endpoint.connect().await?;
|
||||
Self::new(
|
||||
channel,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token,
|
||||
compression,
|
||||
)
|
||||
}
|
||||
|
||||
/// Creates a new client using the given gRPC channel.
|
||||
pub fn new(
|
||||
channel: Channel,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
shard_id: ShardIndex,
|
||||
auth_token: Option<String>,
|
||||
compression: Option<CompressionEncoding>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let auth = AuthInterceptor::new(tenant_id, timeline_id, shard_id, auth_token)?;
|
||||
let mut inner = proto::PageServiceClient::with_interceptor(channel, auth);
|
||||
|
||||
if let Some(compression) = compression {
|
||||
// TODO: benchmark this (including network latency).
|
||||
inner = inner
|
||||
.accept_compressed(compression)
|
||||
.send_compressed(compression);
|
||||
}
|
||||
|
||||
Ok(Self { inner })
|
||||
}
|
||||
|
||||
/// Returns whether a relation exists.
|
||||
pub async fn check_rel_exists(
|
||||
&mut self,
|
||||
req: CheckRelExistsRequest,
|
||||
) -> tonic::Result<CheckRelExistsResponse> {
|
||||
let req = proto::CheckRelExistsRequest::from(req);
|
||||
let resp = self.inner.check_rel_exists(req).await?.into_inner();
|
||||
Ok(resp.into())
|
||||
}
|
||||
|
||||
/// Fetches a base backup.
|
||||
pub async fn get_base_backup(
|
||||
&mut self,
|
||||
req: GetBaseBackupRequest,
|
||||
) -> tonic::Result<impl AsyncRead + use<>> {
|
||||
let req = proto::GetBaseBackupRequest::from(req);
|
||||
let chunks = self.inner.get_base_backup(req).await?.into_inner();
|
||||
Ok(StreamReader::new(
|
||||
chunks
|
||||
.map_ok(|resp| resp.chunk)
|
||||
.map_err(std::io::Error::other),
|
||||
))
|
||||
}
|
||||
|
||||
/// Returns the total size of a database, as # of bytes.
|
||||
pub async fn get_db_size(&mut self, req: GetDbSizeRequest) -> tonic::Result<GetDbSizeResponse> {
|
||||
let req = proto::GetDbSizeRequest::from(req);
|
||||
let resp = self.inner.get_db_size(req).await?.into_inner();
|
||||
Ok(resp.into())
|
||||
}
|
||||
|
||||
/// Fetches pages.
|
||||
///
|
||||
/// This is implemented as a bidirectional streaming RPC for performance. Per-request errors are
|
||||
/// typically returned as status_code instead of errors, to avoid tearing down the entire stream
|
||||
/// via a tonic::Status error.
|
||||
pub async fn get_pages(
|
||||
&mut self,
|
||||
reqs: impl Stream<Item = GetPageRequest> + Send + 'static,
|
||||
) -> tonic::Result<impl Stream<Item = tonic::Result<GetPageResponse>> + Send + 'static> {
|
||||
let reqs = reqs.map(proto::GetPageRequest::from);
|
||||
let resps = self.inner.get_pages(reqs).await?.into_inner();
|
||||
Ok(resps.map_ok(GetPageResponse::from))
|
||||
}
|
||||
|
||||
/// Returns the size of a relation, as # of blocks.
|
||||
pub async fn get_rel_size(
|
||||
&mut self,
|
||||
req: GetRelSizeRequest,
|
||||
) -> tonic::Result<GetRelSizeResponse> {
|
||||
let req = proto::GetRelSizeRequest::from(req);
|
||||
let resp = self.inner.get_rel_size(req).await?.into_inner();
|
||||
Ok(resp.into())
|
||||
}
|
||||
|
||||
/// Fetches an SLRU segment.
|
||||
pub async fn get_slru_segment(
|
||||
&mut self,
|
||||
req: GetSlruSegmentRequest,
|
||||
) -> tonic::Result<GetSlruSegmentResponse> {
|
||||
let req = proto::GetSlruSegmentRequest::from(req);
|
||||
let resp = self.inner.get_slru_segment(req).await?.into_inner();
|
||||
Ok(resp.try_into()?)
|
||||
}
|
||||
|
||||
/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't
|
||||
/// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards.
|
||||
///
|
||||
/// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be
|
||||
/// acquired because the LSN has already been garbage collected.
|
||||
pub async fn lease_lsn(&mut self, req: LeaseLsnRequest) -> tonic::Result<LeaseLsnResponse> {
|
||||
let req = proto::LeaseLsnRequest::from(req);
|
||||
let resp = self.inner.lease_lsn(req).await?.into_inner();
|
||||
Ok(resp.try_into()?)
|
||||
}
|
||||
}
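A hedged connection example for the new Client API above (the endpoint URL, port, compression choice, and example_check_rel_exists helper are placeholders, and the imports at the top of this file are assumed):

    // Hypothetical example of connecting the client and issuing a request (not part of the diff).
    async fn example_check_rel_exists(
        tenant_id: TenantId,
        timeline_id: TimelineId,
        req: CheckRelExistsRequest,
    ) -> anyhow::Result<CheckRelExistsResponse> {
        let mut client = Client::connect(
            "http://pageserver:51051".to_string(), // placeholder endpoint
            tenant_id,
            timeline_id,
            ShardIndex::unsharded(),
            None,                                  // no auth token
            Some(CompressionEncoding::Gzip),       // optional wire compression
        )
        .await?;

        Ok(client.check_rel_exists(req).await?)
    }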
|
||||
|
||||
/// Adds authentication metadata to gRPC requests.
|
||||
#[derive(Clone)]
|
||||
struct AuthInterceptor {
|
||||
tenant_id: AsciiMetadataValue,
|
||||
@@ -30,174 +158,29 @@ impl AuthInterceptor {
|
||||
fn new(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
auth_token: Option<String>,
|
||||
shard_id: ShardIndex,
|
||||
) -> Result<Self, InvalidMetadataValue> {
|
||||
let tenant_ascii: AsciiMetadataValue = tenant_id.to_string().try_into()?;
|
||||
let timeline_ascii: AsciiMetadataValue = timeline_id.to_string().try_into()?;
|
||||
let shard_ascii: AsciiMetadataValue = shard_id.to_string().try_into()?;
|
||||
|
||||
let auth_header: Option<AsciiMetadataValue> = match auth_token {
|
||||
Some(token) => Some(format!("Bearer {token}").try_into()?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
auth_token: Option<String>,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
tenant_id: tenant_ascii,
|
||||
shard_id: shard_ascii,
|
||||
timeline_id: timeline_ascii,
|
||||
auth_header,
|
||||
tenant_id: tenant_id.to_string().try_into()?,
|
||||
timeline_id: timeline_id.to_string().try_into()?,
|
||||
shard_id: shard_id.to_string().try_into()?,
|
||||
auth_header: auth_token
|
||||
.map(|token| format!("Bearer {token}").try_into())
|
||||
.transpose()?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl tonic::service::Interceptor for AuthInterceptor {
|
||||
fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
|
||||
req.metadata_mut()
|
||||
.insert("neon-tenant-id", self.tenant_id.clone());
|
||||
req.metadata_mut()
|
||||
.insert("neon-shard-id", self.shard_id.clone());
|
||||
req.metadata_mut()
|
||||
.insert("neon-timeline-id", self.timeline_id.clone());
|
||||
if let Some(auth_header) = &self.auth_header {
|
||||
req.metadata_mut()
|
||||
.insert("authorization", auth_header.clone());
|
||||
impl Interceptor for AuthInterceptor {
|
||||
fn call(&mut self, mut req: tonic::Request<()>) -> tonic::Result<tonic::Request<()>> {
|
||||
let metadata = req.metadata_mut();
|
||||
metadata.insert("neon-tenant-id", self.tenant_id.clone());
|
||||
metadata.insert("neon-timeline-id", self.timeline_id.clone());
|
||||
metadata.insert("neon-shard-id", self.shard_id.clone());
|
||||
if let Some(ref auth_header) = self.auth_header {
|
||||
metadata.insert("authorization", auth_header.clone());
|
||||
}
|
||||
Ok(req)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Client {
|
||||
client: proto::PageServiceClient<
|
||||
tonic::service::interceptor::InterceptedService<Channel, AuthInterceptor>,
|
||||
>,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub async fn new<T: TryInto<tonic::transport::Endpoint> + Send + Sync + 'static>(
|
||||
into_endpoint: T,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
shard_id: ShardIndex,
|
||||
auth_header: Option<String>,
|
||||
compression: Option<tonic::codec::CompressionEncoding>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let endpoint: tonic::transport::Endpoint = into_endpoint
|
||||
.try_into()
|
||||
.map_err(|_e| anyhow::anyhow!("failed to convert endpoint"))?;
|
||||
let channel = endpoint.connect().await?;
|
||||
let auth = AuthInterceptor::new(tenant_id, timeline_id, auth_header, shard_id)
|
||||
.map_err(|e| anyhow::anyhow!(e.to_string()))?;
|
||||
let mut client = proto::PageServiceClient::with_interceptor(channel, auth);
|
||||
|
||||
if let Some(compression) = compression {
|
||||
// TODO: benchmark this (including network latency).
|
||||
client = client
|
||||
.accept_compressed(compression)
|
||||
.send_compressed(compression);
|
||||
}
|
||||
|
||||
Ok(Self { client })
|
||||
}
|
||||
|
||||
/// Returns whether a relation exists.
|
||||
pub async fn check_rel_exists(
|
||||
&mut self,
|
||||
req: model::CheckRelExistsRequest,
|
||||
) -> Result<model::CheckRelExistsResponse, tonic::Status> {
|
||||
let proto_req = proto::CheckRelExistsRequest::from(req);
|
||||
|
||||
let response = self.client.check_rel_exists(proto_req).await?;
|
||||
|
||||
let proto_resp = response.into_inner();
|
||||
Ok(proto_resp.into())
|
||||
}
|
||||
|
||||
/// Fetches a base backup.
|
||||
pub async fn get_base_backup(
|
||||
&mut self,
|
||||
req: model::GetBaseBackupRequest,
|
||||
) -> Result<impl AsyncRead + use<>, tonic::Status> {
|
||||
let req = proto::GetBaseBackupRequest::from(req);
|
||||
let chunks = self.client.get_base_backup(req).await?.into_inner();
|
||||
let reader = StreamReader::new(
|
||||
chunks
|
||||
.map_ok(|resp| resp.chunk)
|
||||
.map_err(std::io::Error::other),
|
||||
);
|
||||
Ok(reader)
|
||||
}
|
||||
|
||||
/// Returns the total size of a database, as # of bytes.
|
||||
pub async fn get_db_size(
|
||||
&mut self,
|
||||
req: model::GetDbSizeRequest,
|
||||
) -> Result<u64, tonic::Status> {
|
||||
let proto_req = proto::GetDbSizeRequest::from(req);
|
||||
|
||||
let response = self.client.get_db_size(proto_req).await?;
|
||||
Ok(response.into_inner().into())
|
||||
}
|
||||
|
||||
/// Fetches pages.
|
||||
///
|
||||
/// This is implemented as a bidirectional streaming RPC for performance.
|
||||
/// Per-request errors are often returned as status_code instead of errors,
|
||||
/// to avoid tearing down the entire stream via tonic::Status.
|
||||
pub async fn get_pages<ReqSt>(
|
||||
&mut self,
|
||||
inbound: ReqSt,
|
||||
) -> Result<
|
||||
impl Stream<Item = Result<model::GetPageResponse, tonic::Status>> + Send + 'static,
|
||||
tonic::Status,
|
||||
>
|
||||
where
|
||||
ReqSt: Stream<Item = model::GetPageRequest> + Send + 'static,
|
||||
{
|
||||
let outbound_proto = inbound.map(|domain_req| domain_req.into());
|
||||
|
||||
let req_new = Request::new(outbound_proto);
|
||||
|
||||
let response_stream: Streaming<proto::GetPageResponse> =
|
||||
self.client.get_pages(req_new).await?.into_inner();
|
||||
|
||||
let domain_stream = response_stream.map_ok(model::GetPageResponse::from);
|
||||
|
||||
Ok(domain_stream)
|
||||
}
|
||||
|
||||
/// Returns the size of a relation, as # of blocks.
|
||||
pub async fn get_rel_size(
|
||||
&mut self,
|
||||
req: model::GetRelSizeRequest,
|
||||
) -> Result<model::GetRelSizeResponse, tonic::Status> {
|
||||
let proto_req = proto::GetRelSizeRequest::from(req);
|
||||
let response = self.client.get_rel_size(proto_req).await?;
|
||||
let proto_resp = response.into_inner();
|
||||
Ok(proto_resp.into())
|
||||
}
|
||||
|
||||
/// Fetches an SLRU segment.
|
||||
pub async fn get_slru_segment(
|
||||
&mut self,
|
||||
req: model::GetSlruSegmentRequest,
|
||||
) -> Result<model::GetSlruSegmentResponse, tonic::Status> {
|
||||
let proto_req = proto::GetSlruSegmentRequest::from(req);
|
||||
let response = self.client.get_slru_segment(proto_req).await?;
|
||||
Ok(response.into_inner().try_into()?)
|
||||
}
|
||||
|
||||
/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't
|
||||
/// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards.
|
||||
///
|
||||
/// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be
|
||||
/// acquired because the LSN has already been garbage collected.
|
||||
pub async fn lease_lsn(
|
||||
&mut self,
|
||||
req: model::LeaseLsnRequest,
|
||||
) -> Result<model::LeaseLsnResponse, tonic::Status> {
|
||||
let req = proto::LeaseLsnRequest::from(req);
|
||||
Ok(self.client.lease_lsn(req).await?.into_inner().try_into()?)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -384,7 +384,7 @@ impl From<GetPageRequest> for proto::GetPageRequest {
|
||||
pub type RequestID = u64;
|
||||
|
||||
/// A GetPage request class.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
#[derive(Clone, Copy, Debug, strum_macros::Display)]
|
||||
pub enum GetPageClass {
|
||||
/// Unknown class. For backwards compatibility: used when an older client version sends a class
|
||||
/// that a newer server version has removed.
|
||||
@@ -397,6 +397,19 @@ pub enum GetPageClass {
|
||||
Background,
|
||||
}
|
||||
|
||||
impl GetPageClass {
|
||||
/// Returns true if this is considered a bulk request (i.e. more throughput-oriented rather than
|
||||
/// latency-sensitive).
|
||||
pub fn is_bulk(&self) -> bool {
|
||||
match self {
|
||||
Self::Unknown => false,
|
||||
Self::Normal => false,
|
||||
Self::Prefetch => true,
|
||||
Self::Background => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<proto::GetPageClass> for GetPageClass {
|
||||
fn from(pb: proto::GetPageClass) -> Self {
|
||||
match pb {
|
||||
@@ -602,6 +615,21 @@ impl TryFrom<tonic::Code> for GetPageStatusCode {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetPageStatusCode> for tonic::Code {
|
||||
fn from(status_code: GetPageStatusCode) -> Self {
|
||||
use tonic::Code;
|
||||
|
||||
match status_code {
|
||||
GetPageStatusCode::Unknown => Code::Unknown,
|
||||
GetPageStatusCode::Ok => Code::Ok,
|
||||
GetPageStatusCode::NotFound => Code::NotFound,
|
||||
GetPageStatusCode::InvalidRequest => Code::InvalidArgument,
|
||||
GetPageStatusCode::InternalError => Code::Internal,
|
||||
GetPageStatusCode::SlowDown => Code::ResourceExhausted,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other
|
||||
// shards will error.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
|
||||
@@ -326,7 +326,7 @@ impl GrpcClient {
|
||||
ttid: TenantTimelineId,
|
||||
compression: bool,
|
||||
) -> anyhow::Result<Self> {
|
||||
let inner = page_api::Client::new(
|
||||
let inner = page_api::Client::connect(
|
||||
connstring.to_string(),
|
||||
ttid.tenant_id,
|
||||
ttid.timeline_id,
|
||||
|
||||
@@ -625,7 +625,7 @@ impl GrpcClient {
|
||||
ttid: TenantTimelineId,
|
||||
compression: bool,
|
||||
) -> anyhow::Result<Self> {
|
||||
let mut client = page_api::Client::new(
|
||||
let mut client = page_api::Client::connect(
|
||||
connstring.to_string(),
|
||||
ttid.tenant_id,
|
||||
ttid.timeline_id,
|
||||
|
||||
@@ -29,8 +29,8 @@ use pageserver::task_mgr::{
|
||||
};
|
||||
use pageserver::tenant::{TenantSharedResources, mgr, secondary};
|
||||
use pageserver::{
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
|
||||
page_cache, page_service, task_mgr, virtual_file,
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener,
|
||||
MetricsCollectionTask, http, page_cache, page_service, task_mgr, virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
@@ -41,6 +41,7 @@ use tracing_utils::OtelGuard;
|
||||
use utils::auth::{JwtAuth, SwappableJwtAuth};
|
||||
use utils::crashsafe::syncfs;
|
||||
use utils::logging::TracingErrorLayerEnablement;
|
||||
use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR};
|
||||
use utils::sentry_init::init_sentry;
|
||||
use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener};
|
||||
|
||||
@@ -763,6 +764,41 @@ fn start_pageserver(
|
||||
(http_task, https_task)
|
||||
};
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
let metrics_collection_task = {
|
||||
let cancel = shutdown_pageserver.child_token();
|
||||
let task = crate::BACKGROUND_RUNTIME.spawn({
|
||||
let cancel = cancel.clone();
|
||||
let background_jobs_barrier = background_jobs_barrier.clone();
|
||||
async move {
|
||||
if conf.force_metric_collection_on_scrape {
|
||||
return;
|
||||
}
|
||||
|
||||
// first wait until background jobs are cleared to launch.
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return; },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
let mut interval = tokio::time::interval(METRICS_COLLECTION_INTERVAL);
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
tracing::info!("cancelled metrics collection task, exiting...");
|
||||
break;
|
||||
},
|
||||
_ = interval.tick() => {}
|
||||
}
|
||||
tokio::task::spawn_blocking(|| {
|
||||
METRICS_COLLECTOR.run_once(true);
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
MetricsCollectionTask(CancellableTask { task, cancel })
|
||||
};
|
||||
/* END_HADRON */
|
||||
|
||||
let consumption_metrics_tasks = {
|
||||
let cancel = shutdown_pageserver.child_token();
|
||||
let task = crate::BACKGROUND_RUNTIME.spawn({
|
||||
@@ -844,6 +880,7 @@ fn start_pageserver(
|
||||
https_endpoint_listener,
|
||||
page_service,
|
||||
page_service_grpc,
|
||||
metrics_collection_task,
|
||||
consumption_metrics_tasks,
|
||||
disk_usage_eviction_task,
|
||||
&tenant_manager,
|
||||
@@ -889,8 +926,11 @@ async fn create_remote_storage_client(
|
||||
"Simulating remote failures for first {} attempts of each op",
|
||||
conf.test_remote_failures
|
||||
);
|
||||
remote_storage =
|
||||
GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
|
||||
remote_storage = GenericRemoteStorage::unreliable_wrapper(
|
||||
remote_storage,
|
||||
conf.test_remote_failures,
|
||||
conf.test_remote_failures_probability,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(remote_storage)
|
||||
|
||||
@@ -145,9 +145,13 @@ pub struct PageServerConf {
|
||||
pub metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||
pub synthetic_size_calculation_interval: Duration,
|
||||
|
||||
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
|
||||
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
|
||||
|
||||
// The number of allowed failures in remote storage operations.
|
||||
pub test_remote_failures: u64,
|
||||
// The probability of failure in remote storage operations. Only works when test_remote_failures > 1.
|
||||
// Use 100 for 100% failure, 0 for no failure.
|
||||
pub test_remote_failures_probability: u64,
|
||||
|
||||
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
||||
|
||||
@@ -248,6 +252,14 @@ pub struct PageServerConf {
|
||||
pub timeline_import_config: pageserver_api::config::TimelineImportConfig,
|
||||
|
||||
pub basebackup_cache_config: Option<pageserver_api::config::BasebackupCacheConfig>,
|
||||
|
||||
/// Defines what is a big tenant for the purpose of image layer generation.
|
||||
/// See Timeline::should_check_if_image_layers_required
|
||||
pub image_layer_generation_large_timeline_threshold: Option<u64>,
|
||||
|
||||
/// Controls whether to collect all metrics on each scrape or to return potentially stale
|
||||
/// results.
|
||||
pub force_metric_collection_on_scrape: bool,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -392,6 +404,7 @@ impl PageServerConf {
|
||||
synthetic_size_calculation_interval,
|
||||
disk_usage_based_eviction,
|
||||
test_remote_failures,
|
||||
test_remote_failures_probability,
|
||||
ondemand_download_behavior_treat_error_as_warn,
|
||||
background_task_maximum_delay,
|
||||
control_plane_api,
|
||||
@@ -427,6 +440,8 @@ impl PageServerConf {
|
||||
posthog_config,
|
||||
timeline_import_config,
|
||||
basebackup_cache_config,
|
||||
image_layer_generation_large_timeline_threshold,
|
||||
force_metric_collection_on_scrape,
|
||||
} = config_toml;
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
@@ -461,6 +476,7 @@ impl PageServerConf {
|
||||
synthetic_size_calculation_interval,
|
||||
disk_usage_based_eviction,
|
||||
test_remote_failures,
|
||||
test_remote_failures_probability,
|
||||
ondemand_download_behavior_treat_error_as_warn,
|
||||
background_task_maximum_delay,
|
||||
control_plane_api: control_plane_api
|
||||
@@ -484,6 +500,8 @@ impl PageServerConf {
|
||||
dev_mode,
|
||||
timeline_import_config,
|
||||
basebackup_cache_config,
|
||||
image_layer_generation_large_timeline_threshold,
|
||||
force_metric_collection_on_scrape,
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// fields that require additional validation or custom handling
|
||||
@@ -625,7 +643,7 @@ impl PageServerConf {
|
||||
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
|
||||
let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
|
||||
|
||||
let config_toml = pageserver_api::config::ConfigToml {
|
||||
let mut config_toml = pageserver_api::config::ConfigToml {
|
||||
wait_lsn_timeout: Duration::from_secs(60),
|
||||
wal_redo_timeout: Duration::from_secs(60),
|
||||
pg_distrib_dir: Some(pg_distrib_dir),
|
||||
@@ -637,6 +655,15 @@ impl PageServerConf {
|
||||
control_plane_api: Some(Url::parse("http://localhost:6666").unwrap()),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Test authors tend to forget about the default 10min initial lease deadline
|
||||
// when writing tests, which turns their immediate gc requests via mgmt API
|
||||
// into no-ops. Override the binary default here, such that there is no initial
|
||||
// lease deadline by default in tests. Tests that care can always override it
|
||||
// themselves.
|
||||
// Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329
|
||||
config_toml.tenant_config.lsn_lease_length = Duration::from_secs(0);
|
||||
|
||||
PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
|
||||
}
|
||||
}
|
||||
@@ -697,9 +724,12 @@ impl ConfigurableSemaphore {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::config::{DiskUsageEvictionTaskConfig, EvictionOrder};
|
||||
use rstest::rstest;
|
||||
use utils::id::NodeId;
|
||||
use utils::{id::NodeId, serde_percent::Percent};
|
||||
|
||||
use super::PageServerConf;
|
||||
|
||||
@@ -798,4 +828,70 @@ mod tests {
|
||||
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
|
||||
.expect("parse_and_validate");
|
||||
}
|
||||
|
||||
#[rstest]
|
||||
#[
|
||||
case::omit_the_whole_config(
|
||||
DiskUsageEvictionTaskConfig {
|
||||
max_usage_pct: Percent::new(80).unwrap(),
|
||||
min_avail_bytes: 2_000_000_000,
|
||||
period: Duration::from_secs(60),
|
||||
eviction_order: Default::default(),
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
enabled: true,
|
||||
},
|
||||
r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
"#,
|
||||
)]
|
||||
#[
|
||||
case::omit_enabled_field(
|
||||
DiskUsageEvictionTaskConfig {
|
||||
max_usage_pct: Percent::new(80).unwrap(),
|
||||
min_avail_bytes: 1_000_000_000,
|
||||
period: Duration::from_secs(60),
|
||||
eviction_order: EvictionOrder::RelativeAccessed {
|
||||
highest_layer_count_loses_first: true,
|
||||
},
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
enabled: true,
|
||||
},
|
||||
r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
disk_usage_based_eviction = { max_usage_pct = 80, min_avail_bytes = 1000000000, period = "60s" }
|
||||
"#,
|
||||
)]
|
||||
#[case::disabled(
|
||||
DiskUsageEvictionTaskConfig {
|
||||
max_usage_pct: Percent::new(80).unwrap(),
|
||||
min_avail_bytes: 2_000_000_000,
|
||||
period: Duration::from_secs(60),
|
||||
eviction_order: EvictionOrder::RelativeAccessed {
|
||||
highest_layer_count_loses_first: true,
|
||||
},
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
enabled: false,
|
||||
},
|
||||
r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
disk_usage_based_eviction = { enabled = false }
|
||||
"#
|
||||
)]
|
||||
fn test_config_disk_usage_based_eviction_is_valid(
|
||||
#[case] expected_disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
|
||||
#[case] input: &str,
|
||||
) {
|
||||
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
|
||||
.expect("disk_usage_based_eviction is valid");
|
||||
let workdir = Utf8PathBuf::from("/nonexistent");
|
||||
let config = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir).unwrap();
|
||||
let disk_usage_based_eviction = config.disk_usage_based_eviction;
|
||||
assert_eq!(
|
||||
expected_disk_usage_based_eviction,
|
||||
disk_usage_based_eviction
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -171,7 +171,8 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
background_jobs_barrier: completion::Barrier,
|
||||
) -> Option<DiskUsageEvictionTask> {
|
||||
let Some(task_config) = &conf.disk_usage_based_eviction else {
|
||||
let task_config = &conf.disk_usage_based_eviction;
|
||||
if !task_config.enabled {
|
||||
info!("disk usage based eviction task not configured");
|
||||
return None;
|
||||
};
|
||||
@@ -458,6 +459,9 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
match next {
|
||||
Ok(Ok(file_size)) => {
|
||||
METRICS.layers_evicted.inc();
|
||||
/*BEGIN_HADRON */
|
||||
METRICS.bytes_evicted.inc_by(file_size);
|
||||
/*END_HADRON */
|
||||
usage_assumed.add_available_bytes(file_size);
|
||||
}
|
||||
Ok(Err((
|
||||
@@ -1265,6 +1269,7 @@ mod filesystem_level_usage {
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: pageserver_api::config::EvictionOrder::default(),
|
||||
enabled: true,
|
||||
},
|
||||
total_bytes: 100_000,
|
||||
avail_bytes: 0,
|
||||
|
||||
@@ -1,4 +1,8 @@
|
||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, atomic::AtomicBool},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
use pageserver_api::config::NodeMetadata;
|
||||
@@ -355,11 +359,17 @@ impl PerTenantProperties {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct TenantFeatureResolver {
|
||||
inner: FeatureResolver,
|
||||
tenant_id: TenantId,
|
||||
cached_tenant_properties: Arc<ArcSwap<HashMap<String, PostHogFlagFilterPropertyValue>>>,
|
||||
cached_tenant_properties: ArcSwap<HashMap<String, PostHogFlagFilterPropertyValue>>,
|
||||
|
||||
// Add feature flag on the critical path below.
|
||||
//
|
||||
// If a feature flag will be used on the critical path, we will update it in the tenant housekeeping loop instead of
|
||||
// resolving directly by calling `evaluate_multivariate` or `evaluate_boolean`. Remember to update the flag in the
|
||||
// housekeeping loop. The user should directly read this atomic flag instead of using the set of evaluate functions.
|
||||
pub feature_test_remote_size_flag: AtomicBool,
|
||||
}
|
||||
|
||||
impl TenantFeatureResolver {
|
||||
@@ -367,7 +377,8 @@ impl TenantFeatureResolver {
|
||||
Self {
|
||||
inner,
|
||||
tenant_id,
|
||||
cached_tenant_properties: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))),
|
||||
cached_tenant_properties: ArcSwap::new(Arc::new(HashMap::new())),
|
||||
feature_test_remote_size_flag: AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -396,12 +407,14 @@ impl TenantFeatureResolver {
|
||||
self.inner.is_feature_flag_boolean(flag_key)
|
||||
}
|
||||
|
||||
pub fn update_cached_tenant_properties(&self, tenant_shard: &TenantShard) {
|
||||
let mut remote_size_mb = None;
|
||||
/// Refresh the cached properties and flags on the critical path.
|
||||
pub fn refresh_properties_and_flags(&self, tenant_shard: &TenantShard) {
|
||||
let mut remote_size_mb = Some(0.0);
|
||||
for timeline in tenant_shard.list_timelines() {
|
||||
let size = timeline.metrics.resident_physical_size_get();
|
||||
if size == 0 {
|
||||
remote_size_mb = None;
|
||||
break;
|
||||
}
|
||||
if let Some(ref mut remote_size_mb) = remote_size_mb {
|
||||
*remote_size_mb += size as f64 / 1024.0 / 1024.0;
|
||||
@@ -410,5 +423,12 @@ impl TenantFeatureResolver {
|
||||
self.cached_tenant_properties.store(Arc::new(
|
||||
PerTenantProperties { remote_size_mb }.into_posthog_properties(),
|
||||
));
|
||||
|
||||
// BEGIN: Update the feature flag on the critical path.
|
||||
self.feature_test_remote_size_flag.store(
|
||||
self.evaluate_boolean("test-remote-size-flag").is_ok(),
|
||||
std::sync::atomic::Ordering::Relaxed,
|
||||
);
|
||||
// END: Update the feature flag on the critical path.
|
||||
}
|
||||
}
|
||||
|
||||
@@ -116,26 +116,6 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
get:
|
||||
description: Get timelines for tenant
|
||||
responses:
|
||||
"200":
|
||||
description: TimelineInfo
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -618,7 +598,7 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SecondaryProgress"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/:
|
||||
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
@@ -685,6 +665,17 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
get:
|
||||
description: Get timelines for tenant
|
||||
responses:
|
||||
"200":
|
||||
description: TimelineInfo
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
|
||||
parameters:
|
||||
@@ -767,7 +758,7 @@ paths:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/:
|
||||
/v1/tenant:
|
||||
get:
|
||||
description: Get tenants list
|
||||
responses:
|
||||
@@ -847,7 +838,7 @@ paths:
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
|
||||
/v1/tenant/{tenant_id}/config/:
|
||||
/v1/tenant/{tenant_id}/config:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
|
||||
@@ -61,6 +61,7 @@ use crate::context;
|
||||
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::feature_resolver::FeatureResolver;
|
||||
use crate::metrics::LOCAL_DATA_LOSS_SUSPECTED;
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::LocationConf;
|
||||
@@ -78,8 +79,8 @@ use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerNa
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
|
||||
use crate::tenant::timeline::{
|
||||
CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline,
|
||||
WaitLsnTimeout, WaitLsnWaiter, import_pgdata,
|
||||
CompactFlags, CompactOptions, CompactRequest, MarkInvisibleRequest, Timeline, WaitLsnTimeout,
|
||||
WaitLsnWaiter, import_pgdata,
|
||||
};
|
||||
use crate::tenant::{
|
||||
GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError,
|
||||
@@ -2499,12 +2500,10 @@ async fn timeline_checkpoint_handler(
|
||||
.compact(&cancel, flags, &ctx)
|
||||
.await
|
||||
.map_err(|e|
|
||||
match e {
|
||||
CompactionError::ShuttingDown => ApiError::ShuttingDown,
|
||||
CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||
CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||
CompactionError::Other(e) => ApiError::InternalServerError(e),
|
||||
CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||
if e.is_cancel() {
|
||||
ApiError::ShuttingDown
|
||||
} else {
|
||||
ApiError::InternalServerError(e.into_anyhow())
|
||||
}
|
||||
)?;
|
||||
}
|
||||
@@ -3630,6 +3629,17 @@ async fn activate_post_import_handler(
|
||||
.await
|
||||
}
|
||||
|
||||
// [Hadron] Reset gauge metrics that are used to raised alerts. We need this API as a stop-gap measure to reset alerts
|
||||
// after we manually rectify situations such as local SSD data loss. We will eventually automate this.
|
||||
async fn hadron_reset_alert_gauges(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
LOCAL_DATA_LOSS_SUSPECTED.set(0);
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Read the end of a tar archive.
|
||||
///
|
||||
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
|
||||
@@ -3682,6 +3692,23 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn force_refresh_feature_flag(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant
|
||||
.feature_resolver
|
||||
.refresh_properties_and_flags(&tenant);
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_evaluate_feature_flag(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -3698,7 +3725,7 @@ async fn tenant_evaluate_feature_flag(
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
// TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s)
|
||||
// TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s)
|
||||
// and we don't need to worry about it for now.
|
||||
let properties = tenant.feature_resolver.collect_properties();
|
||||
if as_type.as_deref() == Some("boolean") {
|
||||
@@ -3911,9 +3938,14 @@ pub fn make_router(
|
||||
.expect("construct launch timestamp header middleware"),
|
||||
);
|
||||
|
||||
let force_metric_collection_on_scrape = state.conf.force_metric_collection_on_scrape;
|
||||
|
||||
let prometheus_metrics_handler_wrapper =
|
||||
move |req| prometheus_metrics_handler(req, force_metric_collection_on_scrape);
|
||||
|
||||
Ok(router
|
||||
.data(state)
|
||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||
.get("/metrics", move |r| request_span(r, prometheus_metrics_handler_wrapper))
|
||||
.get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
|
||||
.get("/profile/heap", |r| request_span(r, profile_heap_handler))
|
||||
.get("/v1/status", |r| api_handler(r, status_handler))
|
||||
@@ -4147,6 +4179,9 @@ pub fn make_router(
|
||||
.get("/v1/tenant/:tenant_shard_id/feature_flag/:flag_key", |r| {
|
||||
api_handler(r, tenant_evaluate_feature_flag)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_shard_id/force_refresh_feature_flag", |r| {
|
||||
api_handler(r, force_refresh_feature_flag)
|
||||
})
|
||||
.put("/v1/feature_flag/:flag_key", |r| {
|
||||
testing_api_handler("force override feature flag - put", r, force_override_feature_flag_for_testing_put)
|
||||
})
|
||||
@@ -4156,5 +4191,8 @@ pub fn make_router(
|
||||
.post("/v1/feature_flag_spec", |r| {
|
||||
api_handler(r, update_feature_flag_spec)
|
||||
})
|
||||
.post("/hadron-internal/reset_alert_gauges", |r| {
|
||||
api_handler(r, hadron_reset_alert_gauges)
|
||||
})
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -73,6 +73,9 @@ pub struct HttpEndpointListener(pub CancellableTask);
|
||||
pub struct HttpsEndpointListener(pub CancellableTask);
|
||||
pub struct ConsumptionMetricsTasks(pub CancellableTask);
|
||||
pub struct DiskUsageEvictionTask(pub CancellableTask);
|
||||
// HADRON
|
||||
pub struct MetricsCollectionTask(pub CancellableTask);
|
||||
|
||||
impl CancellableTask {
|
||||
pub async fn shutdown(self) {
|
||||
self.cancel.cancel();
|
||||
@@ -87,6 +90,7 @@ pub async fn shutdown_pageserver(
|
||||
https_listener: Option<HttpsEndpointListener>,
|
||||
page_service: page_service::Listener,
|
||||
grpc_task: Option<CancellableTask>,
|
||||
metrics_collection_task: MetricsCollectionTask,
|
||||
consumption_metrics_worker: ConsumptionMetricsTasks,
|
||||
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
|
||||
tenant_manager: &TenantManager,
|
||||
@@ -211,6 +215,14 @@ pub async fn shutdown_pageserver(
|
||||
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
||||
deletion_queue.shutdown(Duration::from_secs(5)).await;
|
||||
|
||||
// HADRON
|
||||
timed(
|
||||
metrics_collection_task.0.shutdown(),
|
||||
"shutdown metrics collections metrics",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
timed(
|
||||
consumption_metrics_worker.0.shutdown(),
|
||||
"shutdown consumption metrics",
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::cell::Cell;
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::os::fd::RawFd;
|
||||
@@ -102,7 +103,18 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::n
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Buckets for background operation duration in seconds, like compaction, GC, size calculation.
|
||||
/* BEGIN_HADRON */
|
||||
pub(crate) static STORAGE_ACTIVE_COUNT_PER_TIMELINE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_active_storage_operations_count",
|
||||
"Count of active storage operations with operation, tenant and timeline dimensions",
|
||||
&["operation", "tenant_id", "shard_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
/*END_HADRON */
|
||||
|
||||
// Buckets for background operations like compaction, GC, size calculation
|
||||
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
|
||||
|
||||
pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
@@ -2810,6 +2822,31 @@ pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
|
||||
pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
|
||||
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
|
||||
|
||||
pub(crate) static LOCAL_DATA_LOSS_SUSPECTED: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"pageserver_local_data_loss_suspected",
|
||||
"Non-zero value indicates that pageserver local data loss is suspected (and highly likely)."
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Counter keeping track of misrouted PageStream requests. Spelling out PageStream requests here to distinguish
|
||||
// it from other types of requests (SK wal replication, http requests, etc.). PageStream requests are used by
|
||||
// Postgres compute to fetch data from pageservers.
|
||||
// A misrouted PageStream request is registered if the pageserver cannot find the tenant identified in the
|
||||
// request, or if the pageserver is not the "primary" serving the tenant shard. These errors almost always identify
|
||||
// issues with compute configuration, caused by either the compute node itself being stuck in the wrong
|
||||
// configuration or Storage Controller reconciliation bugs. Misrouted requests are expected during tenant migration
|
||||
// and/or during recovery following a pageserver failure, but persistently high rates of misrouted requests
|
||||
// are indicative of bugs (and unavailability).
|
||||
pub(crate) static MISROUTED_PAGESTREAM_REQUESTS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_misrouted_pagestream_requests_total",
|
||||
"Number of pageserver pagestream requests that were routed to the wrong pageserver"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Metrics collected on WAL redo operations
|
||||
//
|
||||
// We collect the time spent in actual WAL redo ('redo'), and time waiting
|
||||
@@ -3048,13 +3085,19 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
|
||||
pub(crate) struct StorageTimeMetricsTimer {
|
||||
metrics: StorageTimeMetrics,
|
||||
start: Instant,
|
||||
stopped: Cell<bool>,
|
||||
}
|
||||
|
||||
impl StorageTimeMetricsTimer {
|
||||
fn new(metrics: StorageTimeMetrics) -> Self {
|
||||
/*BEGIN_HADRON */
|
||||
// record the active operation as the timer starts
|
||||
metrics.timeline_active_count.inc();
|
||||
/*END_HADRON */
|
||||
Self {
|
||||
metrics,
|
||||
start: Instant::now(),
|
||||
stopped: Cell::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3070,6 +3113,10 @@ impl StorageTimeMetricsTimer {
|
||||
self.metrics.timeline_sum.inc_by(seconds);
|
||||
self.metrics.timeline_count.inc();
|
||||
self.metrics.global_histogram.observe(seconds);
|
||||
/* BEGIN_HADRON*/
|
||||
self.stopped.set(true);
|
||||
self.metrics.timeline_active_count.dec();
|
||||
/*END_HADRON */
|
||||
duration
|
||||
}
|
||||
|
||||
@@ -3080,6 +3127,16 @@ impl StorageTimeMetricsTimer {
|
||||
}
|
||||
}
|
||||
|
||||
/*BEGIN_HADRON */
|
||||
impl Drop for StorageTimeMetricsTimer {
|
||||
fn drop(&mut self) {
|
||||
if !self.stopped.get() {
|
||||
self.metrics.timeline_active_count.dec();
|
||||
}
|
||||
}
|
||||
}
|
||||
/*END_HADRON */
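A minimal, self-contained sketch of the RAII pattern the Hadron change uses above (hypothetical type, assuming the prometheus crate's IntGauge): increment an "active operations" gauge when the timer starts, and guarantee exactly one decrement whether the timer is stopped explicitly or merely dropped.

use prometheus::IntGauge;

struct ActiveOpGuard {
    gauge: IntGauge,
    stopped: bool,
}

impl ActiveOpGuard {
    fn start(gauge: IntGauge) -> Self {
        gauge.inc(); // the operation shows up as "active" immediately
        Self { gauge, stopped: false }
    }

    fn stop(mut self) {
        self.gauge.dec();
        self.stopped = true;
        // dropping `self` after this is a no-op because `stopped` is set
    }
}

impl Drop for ActiveOpGuard {
    fn drop(&mut self) {
        // Covers early returns and panics: never leak an "active" increment.
        if !self.stopped {
            self.gauge.dec();
        }
    }
}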
|
||||
|
||||
pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option<StorageTimeMetricsTimer>);
|
||||
|
||||
impl Drop for AlwaysRecordingStorageTimeMetricsTimer {
|
||||
@@ -3105,6 +3162,10 @@ pub(crate) struct StorageTimeMetrics {
|
||||
timeline_sum: Counter,
|
||||
/// Number of operations, per operation, tenant_id and timeline_id
|
||||
timeline_count: IntCounter,
|
||||
/*BEGIN_HADRON */
|
||||
/// Number of active operations per operation, tenant_id, and timeline_id
|
||||
timeline_active_count: IntGauge,
|
||||
/*END_HADRON */
|
||||
/// Global histogram having only the "operation" label.
|
||||
global_histogram: Histogram,
|
||||
}
|
||||
@@ -3124,6 +3185,11 @@ impl StorageTimeMetrics {
|
||||
let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
|
||||
.get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
|
||||
.unwrap();
|
||||
/*BEGIN_HADRON */
|
||||
let timeline_active_count = STORAGE_ACTIVE_COUNT_PER_TIMELINE
|
||||
.get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
|
||||
.unwrap();
|
||||
/*END_HADRON */
|
||||
let global_histogram = STORAGE_TIME_GLOBAL
|
||||
.get_metric_with_label_values(&[operation])
|
||||
.unwrap();
|
||||
@@ -3131,6 +3197,7 @@ impl StorageTimeMetrics {
|
||||
StorageTimeMetrics {
|
||||
timeline_sum,
|
||||
timeline_count,
|
||||
timeline_active_count,
|
||||
global_histogram,
|
||||
}
|
||||
}
|
||||
@@ -3544,6 +3611,14 @@ impl TimelineMetrics {
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
/* BEGIN_HADRON */
|
||||
let _ = STORAGE_ACTIVE_COUNT_PER_TIMELINE.remove_label_values(&[
|
||||
op,
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
/*END_HADRON */
|
||||
}
|
||||
|
||||
for op in StorageIoSizeOperation::VARIANTS {
|
||||
@@ -4336,6 +4411,9 @@ pub(crate) mod disk_usage_based_eviction {
|
||||
pub(crate) layers_collected: IntCounter,
|
||||
pub(crate) layers_selected: IntCounter,
|
||||
pub(crate) layers_evicted: IntCounter,
|
||||
/*BEGIN_HADRON */
|
||||
pub(crate) bytes_evicted: IntCounter,
|
||||
/*END_HADRON */
|
||||
}
|
||||
|
||||
impl Default for Metrics {
|
||||
@@ -4372,12 +4450,21 @@ pub(crate) mod disk_usage_based_eviction {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
/*BEGIN_HADRON */
|
||||
let bytes_evicted = register_int_counter!(
|
||||
"pageserver_disk_usage_based_eviction_evicted_bytes_total",
|
||||
"Amount of bytes successfully evicted"
|
||||
)
|
||||
.unwrap();
|
||||
/*END_HADRON */
|
||||
|
||||
Self {
|
||||
tenant_collection_time,
|
||||
tenant_layer_count,
|
||||
layers_collected,
|
||||
layers_selected,
|
||||
layers_evicted,
|
||||
bytes_evicted,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4497,6 +4584,7 @@ pub fn preinitialize_metrics(
|
||||
&CIRCUIT_BREAKERS_UNBROKEN,
|
||||
&PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL,
|
||||
&WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS,
|
||||
&MISROUTED_PAGESTREAM_REQUESTS,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
@@ -4534,6 +4622,7 @@ pub fn preinitialize_metrics(
|
||||
|
||||
// gauges
|
||||
WALRECEIVER_ACTIVE_MANAGERS.get();
|
||||
LOCAL_DATA_LOSS_SUSPECTED.get();
|
||||
|
||||
// histograms
|
||||
[
|
||||
|
||||
@@ -50,6 +50,7 @@ use tokio::io::{AsyncRead, AsyncReadExt as _, AsyncWrite, AsyncWriteExt as _, Bu
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tonic::service::Interceptor as _;
|
||||
use tonic::transport::server::TcpConnectInfo;
|
||||
use tracing::*;
|
||||
use utils::auth::{Claims, Scope, SwappableJwtAuth};
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
@@ -69,7 +70,7 @@ use crate::context::{
|
||||
};
|
||||
use crate::metrics::{
|
||||
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
|
||||
SmgrOpTimer, TimelineMetrics,
|
||||
MISROUTED_PAGESTREAM_REQUESTS, SmgrOpTimer, TimelineMetrics,
|
||||
};
|
||||
use crate::pgdatadir_mapping::{LsnRange, Version};
|
||||
use crate::span::{
|
||||
@@ -90,7 +91,8 @@ use crate::{CancellableTask, PERF_TRACE_TARGET, timed_after_cancellation};
|
||||
/// is not yet in state [`TenantState::Active`].
|
||||
///
|
||||
/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
|
||||
/// HADRON: reduced timeout and we will retry in Cache::get().
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
|
||||
|
||||
/// Threshold at which to log slow GetPage requests.
|
||||
const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30);
|
||||
@@ -1127,6 +1129,7 @@ impl PageServerHandler {
|
||||
// Closing the connection by returning `PageStreamError::Reconnect` has the side effect of rate-limiting the above message, via
|
||||
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
|
||||
// and talk to a different pageserver.
|
||||
MISROUTED_PAGESTREAM_REQUESTS.inc();
|
||||
return respond_error!(
|
||||
span,
|
||||
PageStreamError::Reconnect(
|
||||
@@ -3350,6 +3353,8 @@ impl GrpcPageServiceHandler {
|
||||
/// NB: errors returned from here are intercepted in get_pages(), and may be converted to a
|
||||
/// GetPageResponse with an appropriate status code to avoid terminating the stream.
|
||||
///
|
||||
/// TODO: verify that the requested pages belong to this shard.
|
||||
///
|
||||
/// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
|
||||
/// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
|
||||
/// split them up in the client or server.
|
||||
@@ -3685,8 +3690,15 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
yield match result {
|
||||
Ok(resp) => resp,
|
||||
// Convert per-request errors to GetPageResponses as appropriate, or terminate
|
||||
// the stream with a tonic::Status.
|
||||
Err(err) => page_api::GetPageResponse::try_from_status(err, req_id)?.into(),
|
||||
// the stream with a tonic::Status. Log the error regardless, since
|
||||
// ObservabilityLayer can't automatically log stream errors.
|
||||
Err(status) => {
|
||||
// TODO: it would be nice if we could propagate the get_page() fields here.
|
||||
span.in_scope(|| {
|
||||
warn!("request failed with {:?}: {}", status.code(), status.message());
|
||||
});
|
||||
page_api::GetPageResponse::try_from_status(status, req_id)?.into()
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -3824,40 +3836,85 @@ impl<S: tonic::server::NamedService> tonic::server::NamedService for Observabili
|
||||
const NAME: &'static str = S::NAME; // propagate inner service name
|
||||
}
|
||||
|
||||
impl<S, B> tower::Service<http::Request<B>> for ObservabilityLayerService<S>
|
||||
impl<S, Req, Resp> tower::Service<http::Request<Req>> for ObservabilityLayerService<S>
|
||||
where
|
||||
S: tower::Service<http::Request<B>>,
|
||||
S: tower::Service<http::Request<Req>, Response = http::Response<Resp>> + Send,
|
||||
S::Future: Send + 'static,
|
||||
{
|
||||
type Response = S::Response;
|
||||
type Error = S::Error;
|
||||
type Future = BoxFuture<'static, Result<Self::Response, Self::Error>>;
|
||||
|
||||
fn call(&mut self, mut req: http::Request<B>) -> Self::Future {
|
||||
fn call(&mut self, mut req: http::Request<Req>) -> Self::Future {
|
||||
// Record the request start time as a request extension.
|
||||
//
|
||||
// TODO: we should start a timer here instead, but it currently requires a timeline handle
|
||||
// and SmgrQueryType, which we don't have yet. Refactor it to provide it later.
|
||||
req.extensions_mut().insert(ReceivedAt(Instant::now()));
|
||||
|
||||
// Create a basic tracing span. Enter the span for the current thread (to use it for inner
|
||||
// sync code like interceptors), and instrument the future (to use it for inner async code
|
||||
// like the page service itself).
|
||||
// Extract the peer address and gRPC method.
|
||||
let peer = req
|
||||
.extensions()
|
||||
.get::<TcpConnectInfo>()
|
||||
.and_then(|info| info.remote_addr())
|
||||
.map(|addr| addr.to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
let method = req
|
||||
.uri()
|
||||
.path()
|
||||
.split('/')
|
||||
.nth(2)
|
||||
.unwrap_or(req.uri().path())
|
||||
.to_string();
|
||||
|
||||
// Create a basic tracing span.
|
||||
//
|
||||
// The instrument() call below is not sufficient. It only affects the returned future, and
|
||||
// only takes effect when the caller polls it. Any sync code executed when we call
|
||||
// self.inner.call() below (such as interceptors) runs outside of the returned future, and
|
||||
// is not affected by it. We therefore have to enter the span on the current thread too.
|
||||
// Enter the span for the current thread and instrument the future. It is not sufficient to
|
||||
// only instrument the future, since it only takes effect after the future is returned and
|
||||
// polled, not when the inner service is called below (e.g. during interceptor execution).
|
||||
let span = info_span!(
|
||||
"grpc:pageservice",
|
||||
// Set by TenantMetadataInterceptor.
|
||||
// These will be populated by TenantMetadataInterceptor.
|
||||
tenant_id = field::Empty,
|
||||
timeline_id = field::Empty,
|
||||
shard_id = field::Empty,
|
||||
// NB: empty fields must be listed first above. Otherwise, the field names will be
|
||||
// clobbered when the empty fields are populated. They will be output last regardless.
|
||||
%peer,
|
||||
%method,
|
||||
);
|
||||
let _guard = span.enter();
|
||||
|
||||
Box::pin(self.inner.call(req).instrument(span.clone()))
|
||||
// Construct a future for calling the inner service, but don't await it. This avoids having
|
||||
// to clone the inner service into the future below.
|
||||
let call = self.inner.call(req);
|
||||
|
||||
async move {
|
||||
// Await the inner service call.
|
||||
let result = call.await;
|
||||
|
||||
// Log gRPC error statuses. This won't include request info from handler spans, but it
|
||||
// will catch all errors (even those emitted before handler spans are constructed). Only
|
||||
// unary request errors are logged here, not streaming response errors.
|
||||
if let Ok(ref resp) = result
|
||||
&& let Some(status) = tonic::Status::from_header_map(resp.headers())
|
||||
&& status.code() != tonic::Code::Ok
|
||||
{
|
||||
// TODO: it would be nice if we could propagate the handler span's request fields
|
||||
// here. This could e.g. be done by attaching the request fields to
|
||||
// tonic::Status::metadata via a proc macro.
|
||||
warn!(
|
||||
"request failed with {:?}: {}",
|
||||
status.code(),
|
||||
status.message()
|
||||
);
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
.instrument(span.clone())
|
||||
.boxed()
|
||||
}
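A compact sketch of the "enter the span now, instrument the future too" pattern the comments above describe (hypothetical middleware shape, not the pageserver service itself): the entered guard covers synchronous work done before the future is returned, such as interceptors, while instrument() covers the async body once it is polled.

use futures::FutureExt;
use futures::future::BoxFuture;
use tracing::{Instrument, info_span};

fn call_with_span() -> BoxFuture<'static, ()> {
    let span = info_span!("grpc:example", method = "GetPage");

    // Covers synchronous work that runs before the future is returned (e.g. interceptors).
    let _guard = span.enter();

    let fut = async {
        // The async body runs under the same span via instrument() below.
        tracing::info!("handling request");
    };

    // _guard drops when this function returns, before the future is ever polled,
    // so the span guard is never held across an await point.
    fut.instrument(span.clone()).boxed()
}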
|
||||
|
||||
fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
|
||||
|
||||
@@ -141,6 +141,23 @@ pub(crate) enum CollectKeySpaceError {
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl CollectKeySpaceError {
|
||||
pub(crate) fn is_cancel(&self) -> bool {
|
||||
match self {
|
||||
CollectKeySpaceError::Decode(_) => false,
|
||||
CollectKeySpaceError::PageRead(e) => e.is_cancel(),
|
||||
CollectKeySpaceError::Cancelled => true,
|
||||
}
|
||||
}
|
||||
pub(crate) fn into_anyhow(self) -> anyhow::Error {
|
||||
match self {
|
||||
CollectKeySpaceError::Decode(e) => anyhow::Error::new(e),
|
||||
CollectKeySpaceError::PageRead(e) => anyhow::Error::new(e),
|
||||
CollectKeySpaceError::Cancelled => anyhow::Error::new(self),
|
||||
}
|
||||
}
|
||||
}
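A short sketch of how callers are expected to consume this pair of helpers (hypothetical call site, mirroring the is_cancel + into_anyhow convention referenced elsewhere in this diff): cancellations are logged quietly and never counted as failures, while everything else is surfaced as an anyhow error with its full cause chain.

fn report_collect_keyspace_result(result: Result<(), CollectKeySpaceError>) {
    if let Err(err) = result {
        if err.is_cancel() {
            // Benign during tenant/timeline shutdown: don't treat it as a failure.
            tracing::info!("collect_keyspace cancelled");
            return;
        }
        // {:#} prints the anyhow error together with its causes.
        tracing::error!("collect_keyspace failed: {:#}", err.into_anyhow());
    }
}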
|
||||
|
||||
impl From<PageReconstructError> for CollectKeySpaceError {
|
||||
fn from(err: PageReconstructError) -> Self {
|
||||
match err {
|
||||
|
||||
@@ -34,7 +34,7 @@ use once_cell::sync::Lazy;
|
||||
pub use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::{self, RelSizeMigration};
|
||||
use pageserver_api::models::{
|
||||
CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem,
|
||||
CompactInfoResponse, TimelineArchivalState, TimelineState, TopTenantShardItem,
|
||||
WalRedoManagerStatus,
|
||||
};
|
||||
use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId};
|
||||
@@ -142,6 +142,9 @@ mod gc_block;
|
||||
mod gc_result;
|
||||
pub(crate) mod throttle;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod debug;
|
||||
|
||||
pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
|
||||
|
||||
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
@@ -180,6 +183,7 @@ pub(super) struct AttachedTenantConf {
|
||||
|
||||
impl AttachedTenantConf {
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: pageserver_api::models::TenantConfig,
|
||||
location: AttachedLocationConfig,
|
||||
) -> Self {
|
||||
@@ -191,9 +195,7 @@ impl AttachedTenantConf {
|
||||
let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single {
|
||||
Some(
|
||||
tokio::time::Instant::now()
|
||||
+ tenant_conf
|
||||
.lsn_lease_length
|
||||
.unwrap_or(LsnLease::DEFAULT_LENGTH),
|
||||
+ TenantShard::get_lsn_lease_length_impl(conf, &tenant_conf),
|
||||
)
|
||||
} else {
|
||||
// We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale
|
||||
@@ -208,10 +210,13 @@ impl AttachedTenantConf {
|
||||
}
|
||||
}
|
||||
|
||||
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
|
||||
fn try_from(
|
||||
conf: &'static PageServerConf,
|
||||
location_conf: LocationConf,
|
||||
) -> anyhow::Result<Self> {
|
||||
match &location_conf.mode {
|
||||
LocationMode::Attached(attach_conf) => {
|
||||
Ok(Self::new(location_conf.tenant_conf, *attach_conf))
|
||||
Ok(Self::new(conf, location_conf.tenant_conf, *attach_conf))
|
||||
}
|
||||
LocationMode::Secondary(_) => {
|
||||
anyhow::bail!(
|
||||
@@ -386,7 +391,7 @@ pub struct TenantShard {
|
||||
|
||||
l0_flush_global_state: L0FlushGlobalState,
|
||||
|
||||
pub(crate) feature_resolver: TenantFeatureResolver,
|
||||
pub(crate) feature_resolver: Arc<TenantFeatureResolver>,
|
||||
}
|
||||
impl std::fmt::Debug for TenantShard {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
@@ -3286,7 +3291,9 @@ impl TenantShard {
|
||||
// Ignore this, we likely raced with unarchival.
|
||||
OffloadError::NotArchived => Ok(()),
|
||||
OffloadError::AlreadyInProgress => Ok(()),
|
||||
err => Err(err),
|
||||
OffloadError::Cancelled => Err(CompactionError::new_cancelled()),
|
||||
// don't break the anyhow chain
|
||||
OffloadError::Other(err) => Err(CompactionError::Other(err)),
|
||||
})?;
|
||||
}
|
||||
|
||||
@@ -3314,27 +3321,13 @@ impl TenantShard {
|
||||
|
||||
/// Trips the compaction circuit breaker if appropriate.
|
||||
pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
|
||||
match err {
|
||||
err if err.is_cancel() => {}
|
||||
CompactionError::ShuttingDown => (),
|
||||
// Offload failures don't trip the circuit breaker, since they're cheap to retry and
|
||||
// shouldn't block compaction.
|
||||
CompactionError::Offload(_) => {}
|
||||
CompactionError::CollectKeySpaceError(err) => {
|
||||
// CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch.
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
|
||||
}
|
||||
CompactionError::Other(err) => {
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
|
||||
}
|
||||
CompactionError::AlreadyRunning(_) => {}
|
||||
if err.is_cancel() {
|
||||
return;
|
||||
}
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
|
||||
}
|
||||
|
||||
/// Cancel scheduled compaction tasks
|
||||
@@ -3411,7 +3404,7 @@ impl TenantShard {
|
||||
}
|
||||
|
||||
// Update the feature resolver with the latest tenant-specific data.
|
||||
self.feature_resolver.update_cached_tenant_properties(self);
|
||||
self.feature_resolver.refresh_properties_and_flags(self);
|
||||
}
|
||||
|
||||
pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool {
|
||||
@@ -4178,6 +4171,15 @@ impl TenantShard {
|
||||
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
||||
}
|
||||
|
||||
// HADRON
|
||||
pub fn get_image_creation_timeout(&self) -> Option<Duration> {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf.image_layer_force_creation_period.or(self
|
||||
.conf
|
||||
.default_tenant_conf
|
||||
.image_layer_force_creation_period)
|
||||
}
|
||||
|
||||
pub fn get_pitr_interval(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
@@ -4205,10 +4207,16 @@ impl TenantShard {
|
||||
}
|
||||
|
||||
pub fn get_lsn_lease_length(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
Self::get_lsn_lease_length_impl(self.conf, &self.tenant_conf.load().tenant_conf)
|
||||
}
|
||||
|
||||
pub fn get_lsn_lease_length_impl(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: &pageserver_api::models::TenantConfig,
|
||||
) -> Duration {
|
||||
tenant_conf
|
||||
.lsn_lease_length
|
||||
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
|
||||
.unwrap_or(conf.default_tenant_conf.lsn_lease_length)
|
||||
}
|
||||
|
||||
pub fn get_timeline_offloading_enabled(&self) -> bool {
|
||||
@@ -4494,10 +4502,10 @@ impl TenantShard {
|
||||
gc_block: Default::default(),
|
||||
l0_flush_global_state,
|
||||
basebackup_cache,
|
||||
feature_resolver: TenantFeatureResolver::new(
|
||||
feature_resolver: Arc::new(TenantFeatureResolver::new(
|
||||
feature_resolver,
|
||||
tenant_shard_id.tenant_id,
|
||||
),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6009,22 +6017,24 @@ pub(crate) mod harness {
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) async fn do_try_load(
|
||||
pub(crate) async fn do_try_load_with_redo(
|
||||
&self,
|
||||
walredo_mgr: Arc<WalRedoManager>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<TenantShard>> {
|
||||
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
|
||||
|
||||
let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);
|
||||
|
||||
let tenant = Arc::new(TenantShard::new(
|
||||
TenantState::Attaching,
|
||||
self.conf,
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
self.tenant_conf.clone(),
|
||||
self.generation,
|
||||
ShardParameters::default(),
|
||||
))
|
||||
AttachedTenantConf::try_from(
|
||||
self.conf,
|
||||
LocationConf::attached_single(
|
||||
self.tenant_conf.clone(),
|
||||
self.generation,
|
||||
ShardParameters::default(),
|
||||
),
|
||||
)
|
||||
.unwrap(),
|
||||
self.shard_identity,
|
||||
Some(walredo_mgr),
|
||||
@@ -6049,6 +6059,14 @@ pub(crate) mod harness {
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
pub(crate) async fn do_try_load(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<TenantShard>> {
|
||||
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
|
||||
self.do_try_load_with_redo(walredo_mgr, ctx).await
|
||||
}
|
||||
|
||||
pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
|
||||
self.conf.timeline_path(&self.tenant_shard_id, timeline_id)
|
||||
}
|
||||
@@ -6125,7 +6143,7 @@ mod tests {
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::keyspace::KeySpaceRandomAccum;
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings, LsnLease};
|
||||
use pageserver_compaction::helpers::overlaps_with;
|
||||
#[cfg(feature = "testing")]
|
||||
use rand::SeedableRng;
|
||||
@@ -6675,17 +6693,13 @@ mod tests {
|
||||
tline.freeze_and_flush().await.map_err(|e| e.into())
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
#[tokio::test]
|
||||
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
// Advance to the lsn lease deadline so that GC is not blocked by
|
||||
// initial transition into AttachedSingle.
|
||||
tokio::time::advance(tenant.get_lsn_lease_length()).await;
|
||||
tokio::time::resume();
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -9384,17 +9398,21 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
#[tokio::test]
|
||||
async fn test_lsn_lease() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_lsn_lease")
|
||||
.await
|
||||
.unwrap()
|
||||
.load()
|
||||
.await;
|
||||
// Advance to the lsn lease deadline so that GC is not blocked by
|
||||
// initial transition into AttachedSingle.
|
||||
tokio::time::advance(tenant.get_lsn_lease_length()).await;
|
||||
tokio::time::resume();
|
||||
// set a non-zero lease length to test the feature
|
||||
tenant
|
||||
.update_tenant_config(|mut conf| {
|
||||
conf.lsn_lease_length = Some(LsnLease::DEFAULT_LENGTH);
|
||||
Ok(conf)
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
|
||||
let end_lsn = Lsn(0x100);
|
||||
|
||||
pageserver/src/tenant/debug.rs (new file, 366 lines)
@@ -0,0 +1,366 @@
|
||||
use std::{ops::Range, str::FromStr, sync::Arc};
|
||||
|
||||
use crate::walredo::RedoAttemptType;
|
||||
use base64::{Engine as _, engine::general_purpose::STANDARD};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
keyspace::KeySpace,
|
||||
shard::{ShardIdentity, ShardStripeSize},
|
||||
};
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn};
|
||||
use tracing::Instrument;
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
shard::{ShardCount, ShardIndex, ShardNumber},
|
||||
};
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
|
||||
use crate::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
task_mgr::TaskKind,
|
||||
tenant::storage_layer::ValueReconstructState,
|
||||
walredo::harness::RedoHarness,
|
||||
};
|
||||
|
||||
use super::{
|
||||
WalRedoManager, WalredoManagerId,
|
||||
harness::TenantHarness,
|
||||
remote_timeline_client::LayerFileMetadata,
|
||||
storage_layer::{AsLayerDesc, IoConcurrency, Layer, LayerName, ValuesReconstructState},
|
||||
};
|
||||
|
||||
fn process_page_image(next_record_lsn: Lsn, is_fpw: bool, img_bytes: Bytes) -> Bytes {
|
||||
// To match the logic in libs/wal_decoder/src/serialized_batch.rs
|
||||
let mut new_image: BytesMut = img_bytes.into();
|
||||
if is_fpw && !page_is_new(&new_image) {
|
||||
page_set_lsn(&mut new_image, next_record_lsn);
|
||||
}
|
||||
assert_eq!(new_image.len(), BLCKSZ as usize);
|
||||
new_image.freeze()
|
||||
}
|
||||
|
||||
async fn redo_wals(input: &str, key: Key) -> anyhow::Result<()> {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let redo_harness = RedoHarness::new()?;
|
||||
let span = redo_harness.span();
|
||||
let tenant_conf = pageserver_api::models::TenantConfig {
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
let tenant = TenantHarness::create_custom(
|
||||
"search_key",
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
ShardIdentity::unsharded(),
|
||||
Generation::new(1),
|
||||
)
|
||||
.await?
|
||||
.do_try_load_with_redo(
|
||||
Arc::new(WalRedoManager::Prod(
|
||||
WalredoManagerId::next(),
|
||||
redo_harness.manager,
|
||||
)),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx)
|
||||
.await?;
|
||||
let contents = tokio::fs::read_to_string(input)
|
||||
.await
|
||||
.map_err(|e| anyhow::Error::msg(format!("Failed to read input file {input}: {e}")))
|
||||
.unwrap();
|
||||
let lines = contents.lines();
|
||||
let mut last_wal_lsn: Option<Lsn> = None;
|
||||
let state = {
|
||||
let mut state = ValueReconstructState::default();
|
||||
let mut is_fpw = false;
|
||||
let mut is_first_line = true;
|
||||
for line in lines {
|
||||
if is_first_line {
|
||||
is_first_line = false;
|
||||
if line.trim() == "FPW" {
|
||||
is_fpw = true;
|
||||
}
|
||||
continue; // Skip the first line.
|
||||
}
|
||||
// Each input line is in the "<next_record_lsn>,<base64>" format.
|
||||
let (lsn_str, payload_b64) = line
|
||||
.split_once(',')
|
||||
.expect("Invalid input format: expected '<lsn>,<base64>'");
|
||||
|
||||
// Parse the LSN and decode the payload.
|
||||
let lsn = Lsn::from_str(lsn_str.trim()).expect("Invalid LSN format");
|
||||
let bytes = Bytes::from(
|
||||
STANDARD
|
||||
.decode(payload_b64.trim())
|
||||
.expect("Invalid base64 payload"),
|
||||
);
|
||||
|
||||
// The first line is considered the base image, the rest are WAL records.
|
||||
if state.img.is_none() {
|
||||
state.img = Some((lsn, process_page_image(lsn, is_fpw, bytes)));
|
||||
} else {
|
||||
let wal_record = NeonWalRecord::Postgres {
|
||||
will_init: false,
|
||||
rec: bytes,
|
||||
};
|
||||
state.records.push((lsn, wal_record));
|
||||
last_wal_lsn.replace(lsn);
|
||||
}
|
||||
}
|
||||
state
|
||||
};
|
||||
|
||||
assert!(state.img.is_some(), "No base image found");
|
||||
assert!(!state.records.is_empty(), "No WAL records found");
|
||||
let result = timeline
|
||||
.reconstruct_value(key, last_wal_lsn.unwrap(), state, RedoAttemptType::ReadPage)
|
||||
.instrument(span.clone())
|
||||
.await?;
|
||||
|
||||
eprintln!("final image: {:?}", STANDARD.encode(result));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn search_key(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
dir: String,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
let shard_index = ShardIndex {
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(4),
|
||||
};
|
||||
|
||||
let redo_harness = RedoHarness::new()?;
|
||||
let span = redo_harness.span();
|
||||
let tenant_conf = pageserver_api::models::TenantConfig {
|
||||
..Default::default()
|
||||
};
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
let tenant = TenantHarness::create_custom(
|
||||
"search_key",
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
ShardIdentity::new(
|
||||
shard_index.shard_number,
|
||||
shard_index.shard_count,
|
||||
ShardStripeSize(32768),
|
||||
)
|
||||
.unwrap(),
|
||||
Generation::new(1),
|
||||
)
|
||||
.await?
|
||||
.do_try_load_with_redo(
|
||||
Arc::new(WalRedoManager::Prod(
|
||||
WalredoManagerId::next(),
|
||||
redo_harness.manager,
|
||||
)),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx)
|
||||
.await?;
|
||||
|
||||
let mut delta_layers: Vec<Layer> = Vec::new();
|
||||
let mut img_layer: Option<Layer> = Option::None;
|
||||
let mut dir = tokio::fs::read_dir(dir).await?;
|
||||
loop {
|
||||
let entry = dir.next_entry().await?;
|
||||
if entry.is_none() || !entry.as_ref().unwrap().file_type().await?.is_file() {
|
||||
break;
|
||||
}
|
||||
let path = Utf8PathBuf::from_path_buf(entry.unwrap().path()).unwrap();
|
||||
let layer_name = match LayerName::from_str(path.file_name().unwrap()) {
|
||||
Ok(name) => name,
|
||||
Err(_) => {
|
||||
eprintln!("Skipped invalid layer: {path}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let layer = Layer::for_resident(
|
||||
tenant.conf,
|
||||
&timeline,
|
||||
path.clone(),
|
||||
layer_name,
|
||||
LayerFileMetadata::new(
|
||||
tokio::fs::metadata(path.clone()).await?.len(),
|
||||
Generation::new(1),
|
||||
shard_index,
|
||||
),
|
||||
);
|
||||
if layer.layer_desc().is_delta() {
|
||||
delta_layers.push(layer.into());
|
||||
} else if img_layer.is_none() {
|
||||
img_layer = Some(layer.into());
|
||||
} else {
|
||||
anyhow::bail!("Found multiple image layers");
|
||||
}
|
||||
}
|
||||
// sort delta layers based on the descending order of LSN
|
||||
delta_layers.sort_by(|a, b| {
|
||||
b.layer_desc()
|
||||
.get_lsn_range()
|
||||
.start
|
||||
.cmp(&a.layer_desc().get_lsn_range().start)
|
||||
});
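The comparator above sorts the delta layers newest-first by swapping the usual argument order. A tiny sketch of the same idea on plain integers:

fn descending_sort_sketch() {
    let mut starts = vec![10u64, 30, 20];
    // Comparing b to a reverses the usual ordering, i.e. largest start LSN first.
    starts.sort_by(|a, b| b.cmp(a));
    assert_eq!(starts, vec![30, 20, 10]);
}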
|
||||
|
||||
let mut state = ValuesReconstructState::new(IoConcurrency::Sequential);
|
||||
|
||||
let key_space = KeySpace::single(Range {
|
||||
start: key,
|
||||
end: key.next(),
|
||||
});
|
||||
let lsn_range = Range {
|
||||
start: img_layer
|
||||
.as_ref()
|
||||
.map_or(Lsn(0x00), |img| img.layer_desc().image_layer_lsn()),
|
||||
end: lsn,
|
||||
};
|
||||
for delta_layer in delta_layers.iter() {
|
||||
delta_layer
|
||||
.get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx)
|
||||
.await?;
|
||||
}
|
||||
|
||||
img_layer
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx)
|
||||
.await?;
|
||||
|
||||
for (_key, result) in std::mem::take(&mut state.keys) {
|
||||
let state = result.collect_pending_ios().await?;
|
||||
if state.img.is_some() {
|
||||
eprintln!(
|
||||
"image: {}: {:x?}",
|
||||
state.img.as_ref().unwrap().0,
|
||||
STANDARD.encode(state.img.as_ref().unwrap().1.clone())
|
||||
);
|
||||
}
|
||||
for delta in state.records.iter() {
|
||||
match &delta.1 {
|
||||
NeonWalRecord::Postgres { will_init, rec } => {
|
||||
eprintln!(
|
||||
"delta: {}: will_init: {}, {:x?}",
|
||||
delta.0,
|
||||
will_init,
|
||||
STANDARD.encode(rec)
|
||||
);
|
||||
}
|
||||
_ => {
|
||||
eprintln!("delta: {}: {:x?}", delta.0, delta.1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let result = timeline
|
||||
.reconstruct_value(key, lsn_range.end, state, RedoAttemptType::ReadPage)
|
||||
.instrument(span.clone())
|
||||
.await?;
|
||||
eprintln!("final image: {lsn} : {result:?}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Redo all WALs against the base image in the input file. Return the base64 encoded final image.
|
||||
/// Each line in the input file must be in the form "<lsn>,<base64>" where:
|
||||
/// * `<lsn>` is a PostgreSQL LSN in hexadecimal notation, e.g. `0/16ABCDE`.
|
||||
/// * `<base64>` is the base64‐encoded page image (first line) or WAL record (subsequent lines).
|
||||
///
|
||||
/// The first line provides the base image of a page. The LSN is the LSN of "next record" following
|
||||
/// the record containing the FPI. For example, if the FPI was extracted from a WAL record occupying
|
||||
/// [0/1, 0/200) in the WAL stream, the LSN appearing alongside the page image here should be 0/200.
|
||||
///
|
||||
/// The subsequent lines are WAL records, ordered from the oldest to the newest. The LSN is the
|
||||
/// record LSN of the WAL record, not the "next record" LSN. For example, if the WAL record here
|
||||
/// occupies [0/1, 0/200) in the WAL stream, the LSN appearing alongside the WAL record here should
|
||||
/// be 0/1.
|
||||
#[derive(Parser)]
|
||||
struct RedoWalsCmd {
|
||||
#[clap(long)]
|
||||
input: String,
|
||||
#[clap(long)]
|
||||
key: String,
|
||||
}
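For illustration only, a hypothetical input file for this command would look like the following; the angle-bracketed payloads stand in for real base64 data, and the LSNs are made up. The optional first line declares that the base image is a full-page write, the second line is the base image keyed by its "next record" LSN, and the remaining lines are WAL records keyed by their record start LSNs, oldest first.

FPW
0/1696328,<base64 of the 8 KiB page image>
0/1696328,<base64 of the first WAL record>
0/1696400,<base64 of the second WAL record>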
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_redo_wals() -> anyhow::Result<()> {
|
||||
let args = std::env::args().collect_vec();
|
||||
let pos = args
|
||||
.iter()
|
||||
.position(|arg| arg == "--")
|
||||
.unwrap_or(args.len());
|
||||
let slice = &args[pos..args.len()];
|
||||
let cmd = match RedoWalsCmd::try_parse_from(slice) {
|
||||
Ok(cmd) => cmd,
|
||||
Err(err) => {
|
||||
eprintln!("{err}");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
let key = Key::from_hex(&cmd.key).unwrap();
|
||||
redo_wals(&cmd.input, key).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Search for a page at the given LSN in all layers of the data_dir.
|
||||
/// Return the base64-encoded image and all WAL records, as well as the final reconstructed image.
|
||||
#[derive(Parser)]
|
||||
struct SearchKeyCmd {
|
||||
#[clap(long)]
|
||||
tenant_id: String,
|
||||
#[clap(long)]
|
||||
timeline_id: String,
|
||||
#[clap(long)]
|
||||
data_dir: String,
|
||||
#[clap(long)]
|
||||
key: String,
|
||||
#[clap(long)]
|
||||
lsn: String,
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_search_key() -> anyhow::Result<()> {
|
||||
let args = std::env::args().collect_vec();
|
||||
let pos = args
|
||||
.iter()
|
||||
.position(|arg| arg == "--")
|
||||
.unwrap_or(args.len());
|
||||
let slice = &args[pos..args.len()];
|
||||
let cmd = match SearchKeyCmd::try_parse_from(slice) {
|
||||
Ok(cmd) => cmd,
|
||||
Err(err) => {
|
||||
eprintln!("{err}");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
let tenant_id = TenantId::from_str(&cmd.tenant_id).unwrap();
|
||||
let timeline_id = TimelineId::from_str(&cmd.timeline_id).unwrap();
|
||||
let key = Key::from_hex(&cmd.key).unwrap();
|
||||
let lsn = Lsn::from_str(&cmd.lsn).unwrap();
|
||||
search_key(tenant_id, timeline_id, cmd.data_dir, key, lsn).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -43,7 +43,7 @@ use crate::controller_upcall_client::{
|
||||
};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
|
||||
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::metrics::{LOCAL_DATA_LOSS_SUSPECTED, TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind};
|
||||
use crate::tenant::config::{
|
||||
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
|
||||
@@ -538,6 +538,21 @@ pub async fn init_tenant_mgr(
|
||||
// Determine which tenants are to be secondary or attached, and in which generation
|
||||
let tenant_modes = init_load_generations(conf, &tenant_configs, resources, cancel).await?;
|
||||
|
||||
// Hadron local SSD check: Raise an alert if our local filesystem does not contain any tenants but the re-attach request returned tenants.
|
||||
// This can happen if the PS suffered a Kubernetes node failure resulting in loss of all local data, but recovered quickly on another node
|
||||
// so the Storage Controller has not had the time to move tenants out.
|
||||
let data_loss_suspected = if let Some(tenant_modes) = &tenant_modes {
|
||||
tenant_configs.is_empty() && !tenant_modes.is_empty()
|
||||
} else {
|
||||
false
|
||||
};
|
||||
if data_loss_suspected {
|
||||
tracing::error!(
|
||||
"Local data loss suspected: no tenants found on local filesystem, but re-attach request returned tenants"
|
||||
);
|
||||
}
|
||||
LOCAL_DATA_LOSS_SUSPECTED.set(if data_loss_suspected { 1 } else { 0 });
|
||||
|
||||
tracing::info!(
|
||||
"Attaching {} tenants at startup, warming up {} at a time",
|
||||
tenant_configs.len(),
|
||||
@@ -664,7 +679,7 @@ pub async fn init_tenant_mgr(
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
|
||||
AttachedTenantConf::new(conf, location_conf.tenant_conf, attached_conf),
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
SpawnMode::Lazy,
|
||||
@@ -842,8 +857,11 @@ impl TenantManager {
|
||||
// take our fast path and just provide the updated configuration
|
||||
// to the tenant.
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(new_location_config.clone())
|
||||
.map_err(UpsertLocationError::BadRequest)?,
|
||||
AttachedTenantConf::try_from(
|
||||
self.conf,
|
||||
new_location_config.clone(),
|
||||
)
|
||||
.map_err(UpsertLocationError::BadRequest)?,
|
||||
);
|
||||
|
||||
Some(FastPathModified::Attached(tenant.clone()))
|
||||
@@ -1046,7 +1064,7 @@ impl TenantManager {
|
||||
// Testing hack: if we are configured with no control plane, then drop the generation
|
||||
// from upserts. This enables creating generation-less tenants even though neon_local
|
||||
// always uses generations when calling the location conf API.
|
||||
let attached_conf = AttachedTenantConf::try_from(new_location_config)
|
||||
let attached_conf = AttachedTenantConf::try_from(self.conf, new_location_config)
|
||||
.map_err(UpsertLocationError::BadRequest)?;
|
||||
|
||||
let tenant = tenant_spawn(
|
||||
@@ -1250,7 +1268,7 @@ impl TenantManager {
|
||||
tenant_shard_id,
|
||||
&tenant_path,
|
||||
self.resources.clone(),
|
||||
AttachedTenantConf::try_from(config)?,
|
||||
AttachedTenantConf::try_from(self.conf, config)?,
|
||||
shard_identity,
|
||||
None,
|
||||
SpawnMode::Eager,
|
||||
@@ -2131,7 +2149,7 @@ impl TenantManager {
|
||||
tenant_shard_id,
|
||||
&tenant_path,
|
||||
self.resources.clone(),
|
||||
AttachedTenantConf::try_from(config).map_err(Error::DetachReparent)?,
|
||||
AttachedTenantConf::try_from(self.conf, config).map_err(Error::DetachReparent)?,
|
||||
shard_identity,
|
||||
None,
|
||||
SpawnMode::Eager,
|
||||
|
||||
@@ -141,11 +141,29 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
|
||||
let fs_size = usize::try_from(fs_size)
|
||||
.with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?;
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
let mut metadata = None;
|
||||
match storage {
|
||||
// Pass the file path as storage metadata to minimize changes to neon.
|
||||
// Otherwise, we need to change the upload interface.
|
||||
GenericRemoteStorage::AzureBlob(s) => {
|
||||
let block_size_mb = s.put_block_size_mb.unwrap_or(0);
|
||||
if block_size_mb > 0 && fs_size > block_size_mb * 1024 * 1024 {
|
||||
metadata = Some(remote_storage::StorageMetadata::from([(
|
||||
"databricks_azure_put_block",
|
||||
local_path.as_str(),
|
||||
)]));
|
||||
}
|
||||
}
|
||||
GenericRemoteStorage::LocalFs(_) => {}
|
||||
GenericRemoteStorage::AwsS3(_) => {}
|
||||
GenericRemoteStorage::Unreliable(_) => {}
|
||||
};
|
||||
/* END_HADRON */
|
||||
let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
|
||||
|
||||
storage
|
||||
.upload(reader, fs_size, remote_path, None, cancel)
|
||||
.upload(reader, fs_size, remote_path, metadata, cancel)
|
||||
.await
|
||||
.with_context(|| format!("upload layer from local path '{local_path}'"))
|
||||
}
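A small numeric sketch of the gate above, with an assumed put_block_size_mb of 256 (the real value comes from the Azure storage configuration): only layer files larger than the configured block size get the put-block metadata hint attached.

fn needs_put_block_sketch() {
    let put_block_size_mb: usize = 256; // assumed config value, for illustration
    let fs_size: usize = 300 * 1024 * 1024; // a 300 MiB layer file

    let use_put_block = put_block_size_mb > 0 && fs_size > put_block_size_mb * 1024 * 1024;
    assert!(use_put_block); // 300 MiB > 256 MiB, so the metadata hint is attached
}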
|
||||
|
||||
@@ -17,23 +17,35 @@ use tracing::*;
|
||||
use utils::backoff::exponential_backoff_duration;
|
||||
use utils::completion::Barrier;
|
||||
use utils::pausable_failpoint;
|
||||
use utils::sync::gate::GateError;
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
|
||||
use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
|
||||
use crate::tenant::blob_io::WriteBlobError;
|
||||
use crate::tenant::throttle::Stats;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::timeline::compaction::CompactionOutcome;
|
||||
use crate::tenant::{TenantShard, TenantState};
|
||||
use crate::virtual_file::owned_buffers_io::write::FlushTaskError;
|
||||
|
||||
/// Semaphore limiting concurrent background tasks (across all tenants).
|
||||
///
|
||||
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
|
||||
static CONCURRENT_BACKGROUND_TASKS: Lazy<Semaphore> = Lazy::new(|| {
|
||||
let total_threads = TOKIO_WORKER_THREADS.get();
|
||||
|
||||
/*BEGIN_HADRON*/
|
||||
// ideally we should run at least one compaction task per tenant in order to (1) maximize
|
||||
// compaction throughput (2) avoid head-of-line blocking of large compactions. However doing
|
||||
// that may create too many compaction tasks with significant memory overhead. So we limit the
|
||||
// number of compaction tasks based on the available CPU core count.
|
||||
// Need to revisit.
|
||||
// let tasks_per_thread = std::env::var("BG_TASKS_PER_THREAD")
|
||||
// .ok()
|
||||
// .and_then(|s| s.parse().ok())
|
||||
// .unwrap_or(4);
|
||||
// let permits = usize::max(1, total_threads * tasks_per_thread);
|
||||
// // assert!(permits < total_threads, "need threads for other work");
|
||||
/*END_HADRON*/
|
||||
|
||||
let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
|
||||
assert_ne!(permits, 0, "we will not be adding in permits later");
|
||||
assert!(permits < total_threads, "need threads for other work");
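A worked instance of the permit calculation above, assuming an 8-thread Tokio runtime (purely illustrative):

fn permit_math_sketch() {
    let total_threads: usize = 8; // assumed worker thread count
    let permits = std::cmp::max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
    // 8 * 3 / 4 = 6 background permits, leaving 2 threads free for other work.
    assert_eq!(permits, 6);
    assert!(permits < total_threads);
}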
|
||||
@@ -295,48 +307,12 @@ pub(crate) fn log_compaction_error(
|
||||
task_cancelled: bool,
|
||||
degrade_to_warning: bool,
|
||||
) {
|
||||
use CompactionError::*;
|
||||
let is_cancel = err.is_cancel();
|
||||
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::upload_queue::NotInitialized;
|
||||
|
||||
let level = match err {
|
||||
e if e.is_cancel() => return,
|
||||
ShuttingDown => return,
|
||||
Offload(_) => Level::ERROR,
|
||||
AlreadyRunning(_) => Level::ERROR,
|
||||
CollectKeySpaceError(_) => Level::ERROR,
|
||||
_ if task_cancelled => Level::INFO,
|
||||
Other(err) => {
|
||||
let root_cause = err.root_cause();
|
||||
|
||||
let upload_queue = root_cause
|
||||
.downcast_ref::<NotInitialized>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
let timeline = root_cause
|
||||
.downcast_ref::<PageReconstructError>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
let buffered_writer_flush_task_canelled = root_cause
|
||||
.downcast_ref::<FlushTaskError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
let write_blob_cancelled = root_cause
|
||||
.downcast_ref::<WriteBlobError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
let gate_closed = root_cause
|
||||
.downcast_ref::<GateError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
let is_stopping = upload_queue
|
||||
|| timeline
|
||||
|| buffered_writer_flush_task_canelled
|
||||
|| write_blob_cancelled
|
||||
|| gate_closed;
|
||||
|
||||
if is_stopping {
|
||||
Level::INFO
|
||||
} else {
|
||||
Level::ERROR
|
||||
}
|
||||
}
|
||||
let level = if is_cancel || task_cancelled {
|
||||
Level::INFO
|
||||
} else {
|
||||
Level::ERROR
|
||||
};
|
||||
|
||||
if let Some((error_count, sleep_duration)) = retry_info {
|
||||
|
||||
@@ -40,7 +40,6 @@ use layer_manager::{
|
||||
Shutdown,
|
||||
};
|
||||
|
||||
use offload::OffloadError;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
|
||||
use pageserver_api::key::{
|
||||
@@ -119,7 +118,6 @@ use crate::pgdatadir_mapping::{
|
||||
MAX_AUX_FILE_V2_DELTAS, MetricsUpdate,
|
||||
};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::AttachmentMode;
|
||||
use crate::tenant::gc_result::GcResult;
|
||||
use crate::tenant::layer_map::LayerMap;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
@@ -202,7 +200,7 @@ pub struct TimelineResources {
|
||||
pub l0_compaction_trigger: Arc<Notify>,
|
||||
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
|
||||
pub basebackup_cache: Arc<BasebackupCache>,
|
||||
pub feature_resolver: TenantFeatureResolver,
|
||||
pub feature_resolver: Arc<TenantFeatureResolver>,
|
||||
}
|
||||
|
||||
pub struct Timeline {
|
||||
@@ -353,6 +351,13 @@ pub struct Timeline {
|
||||
last_image_layer_creation_check_at: AtomicLsn,
|
||||
last_image_layer_creation_check_instant: std::sync::Mutex<Option<Instant>>,
|
||||
|
||||
// HADRON
|
||||
/// If a key range has writes with LSN > force_image_creation_lsn, then we should force image layer creation
|
||||
/// on this key range.
|
||||
force_image_creation_lsn: AtomicLsn,
|
||||
/// The last time instant when force_image_creation_lsn is computed.
|
||||
force_image_creation_lsn_computed_at: std::sync::Mutex<Option<Instant>>,
|
||||
|
||||
/// Current logical size of the "datadir", at the last LSN.
|
||||
current_logical_size: LogicalSize,
|
||||
|
||||
@@ -450,7 +455,7 @@ pub struct Timeline {
|
||||
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
|
||||
feature_resolver: TenantFeatureResolver,
|
||||
feature_resolver: Arc<TenantFeatureResolver>,
|
||||
}
|
||||
|
||||
pub(crate) enum PreviousHeatmap {
|
||||
@@ -587,6 +592,28 @@ pub(crate) enum PageReconstructError {
|
||||
MissingKey(Box<MissingKeyError>),
|
||||
}
|
||||
|
||||
impl PageReconstructError {
|
||||
pub(crate) fn is_cancel(&self) -> bool {
|
||||
match self {
|
||||
PageReconstructError::Other(_) => false,
|
||||
PageReconstructError::AncestorLsnTimeout(e) => e.is_cancel(),
|
||||
PageReconstructError::Cancelled => true,
|
||||
PageReconstructError::WalRedo(_) => false,
|
||||
PageReconstructError::MissingKey(_) => false,
|
||||
}
|
||||
}
|
||||
#[allow(dead_code)] // we use the is_cancel + into_anyhow pattern in quite a few places, this one will follow soon enough
|
||||
pub(crate) fn into_anyhow(self) -> anyhow::Error {
|
||||
match self {
|
||||
PageReconstructError::Other(e) => e,
|
||||
PageReconstructError::AncestorLsnTimeout(e) => e.into_anyhow(),
|
||||
PageReconstructError::Cancelled => anyhow::Error::new(self),
|
||||
PageReconstructError::WalRedo(e) => e,
|
||||
PageReconstructError::MissingKey(_) => anyhow::Error::new(self),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<anyhow::Error> for PageReconstructError {
|
||||
fn from(value: anyhow::Error) -> Self {
|
||||
// with walingest.rs many PageReconstructError are wrapped in as anyhow::Error
|
||||
@@ -740,17 +767,6 @@ impl std::fmt::Display for MissingKeyError {
|
||||
}
|
||||
}
|
||||
|
||||
impl PageReconstructError {
|
||||
/// Returns true if this error indicates a tenant/timeline shutdown alike situation
|
||||
pub(crate) fn is_stopping(&self) -> bool {
|
||||
use PageReconstructError::*;
|
||||
match self {
|
||||
Cancelled => true,
|
||||
Other(_) | AncestorLsnTimeout(_) | WalRedo(_) | MissingKey(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum CreateImageLayersError {
|
||||
#[error("timeline shutting down")]
|
||||
@@ -953,13 +969,35 @@ pub enum WaitLsnError {
|
||||
Timeout(String),
|
||||
}
|
||||
|
||||
impl WaitLsnError {
|
||||
pub(crate) fn is_cancel(&self) -> bool {
|
||||
match self {
|
||||
WaitLsnError::Shutdown => true,
|
||||
WaitLsnError::BadState(timeline_state) => match timeline_state {
|
||||
TimelineState::Loading => false,
|
||||
TimelineState::Active => false,
|
||||
TimelineState::Stopping => true,
|
||||
TimelineState::Broken { .. } => false,
|
||||
},
|
||||
WaitLsnError::Timeout(_) => false,
|
||||
}
|
||||
}
|
||||
pub(crate) fn into_anyhow(self) -> anyhow::Error {
|
||||
match self {
|
||||
WaitLsnError::Shutdown => anyhow::Error::new(self),
|
||||
WaitLsnError::BadState(_) => anyhow::Error::new(self),
|
||||
WaitLsnError::Timeout(_) => anyhow::Error::new(self),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<WaitLsnError> for tonic::Status {
|
||||
fn from(err: WaitLsnError) -> Self {
|
||||
use tonic::Code;
|
||||
let code = match &err {
|
||||
WaitLsnError::Timeout(_) => Code::Internal,
|
||||
WaitLsnError::BadState(_) => Code::Internal,
|
||||
WaitLsnError::Shutdown => Code::Unavailable,
|
||||
let code = if err.is_cancel() {
|
||||
Code::Unavailable
|
||||
} else {
|
||||
Code::Internal
|
||||
};
|
||||
tonic::Status::new(code, err.to_string())
|
||||
}
|
||||
@@ -971,7 +1009,7 @@ impl From<WaitLsnError> for tonic::Status {
|
||||
impl From<CreateImageLayersError> for CompactionError {
|
||||
fn from(e: CreateImageLayersError) -> Self {
|
||||
match e {
|
||||
CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
|
||||
CreateImageLayersError::Cancelled => CompactionError::new_cancelled(),
|
||||
CreateImageLayersError::Other(e) => {
|
||||
CompactionError::Other(e.context("create image layers"))
|
||||
}
|
||||
@@ -1086,6 +1124,26 @@ enum ImageLayerCreationOutcome {
|
||||
Skip,
|
||||
}
|
||||
|
||||
enum RepartitionError {
|
||||
Other(anyhow::Error),
|
||||
CollectKeyspace(CollectKeySpaceError),
|
||||
}
|
||||
|
||||
impl RepartitionError {
|
||||
fn is_cancel(&self) -> bool {
|
||||
match self {
|
||||
RepartitionError::Other(_) => false,
|
||||
RepartitionError::CollectKeyspace(e) => e.is_cancel(),
|
||||
}
|
||||
}
|
||||
fn into_anyhow(self) -> anyhow::Error {
|
||||
match self {
|
||||
RepartitionError::Other(e) => e,
|
||||
RepartitionError::CollectKeyspace(e) => e.into_anyhow(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Public interface functions
|
||||
impl Timeline {
|
||||
/// Get the LSN where this branch was created
|
||||
@@ -1772,30 +1830,31 @@ impl Timeline {
|
||||
existing_lease.clone()
|
||||
}
|
||||
Entry::Vacant(vacant) => {
|
||||
// Reject already GC-ed LSN if we are in AttachedSingle and
|
||||
// not blocked by the lsn lease deadline.
|
||||
// Never allow a lease to be requested for an LSN below the applied GC cutoff. The data could have been deleted.
|
||||
let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn();
|
||||
if lsn < *latest_gc_cutoff_lsn {
|
||||
bail!(
|
||||
"tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}",
|
||||
lsn,
|
||||
*latest_gc_cutoff_lsn
|
||||
);
|
||||
}
|
||||
|
||||
// We allow create lease for those below the planned gc cutoff if we are still within the grace period
|
||||
// of GC blocking.
|
||||
let validate = {
|
||||
let conf = self.tenant_conf.load();
|
||||
conf.location.attach_mode == AttachmentMode::Single
|
||||
&& !conf.is_gc_blocked_by_lsn_lease_deadline()
|
||||
!conf.is_gc_blocked_by_lsn_lease_deadline()
|
||||
};
|
||||
|
||||
if init || validate {
|
||||
let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn();
|
||||
if lsn < *latest_gc_cutoff_lsn {
|
||||
bail!(
|
||||
"tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}",
|
||||
lsn,
|
||||
*latest_gc_cutoff_lsn
|
||||
);
|
||||
}
|
||||
if lsn < planned_cutoff {
|
||||
bail!(
|
||||
"tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}",
|
||||
lsn,
|
||||
planned_cutoff
|
||||
);
|
||||
}
|
||||
// Do not allow initial lease creation to be below the planned gc cutoff. The client (compute_ctl) determines
|
||||
// whether it is a initial lease creation or a renewal.
|
||||
if (init || validate) && lsn < planned_cutoff {
|
||||
bail!(
|
||||
"tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}",
|
||||
lsn,
|
||||
planned_cutoff
|
||||
);
|
||||
}
|
||||
|
||||
let dt: DateTime<Utc> = valid_until.into();
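A condensed sketch of the admission rules encoded above (hypothetical helper over plain integers, not pageserver code): a lease request below the applied GC cutoff is always rejected, while the planned cutoff is only enforced for initial lease creation and for renewals that are subject to validation.

fn admit_lsn_lease(lsn: u64, applied_cutoff: u64, planned_cutoff: u64, init_or_validate: bool) -> bool {
    if lsn < applied_cutoff {
        // The data below the applied cutoff may already be gone.
        return false;
    }
    // The planned cutoff only applies to initial creation / validated renewals.
    !(init_or_validate && lsn < planned_cutoff)
}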
|
||||
@@ -2065,22 +2124,7 @@ impl Timeline {
|
||||
match &result {
|
||||
Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
|
||||
Err(e) if e.is_cancel() => {}
|
||||
Err(CompactionError::ShuttingDown) => {
|
||||
// Covered by the `Err(e) if e.is_cancel()` branch.
|
||||
}
|
||||
Err(CompactionError::AlreadyRunning(_)) => {
|
||||
// Covered by the `Err(e) if e.is_cancel()` branch.
|
||||
}
|
||||
Err(CompactionError::Other(_)) => {
|
||||
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
|
||||
}
|
||||
Err(CompactionError::CollectKeySpaceError(_)) => {
|
||||
// Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch.
|
||||
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
|
||||
}
|
||||
// Don't change the current value on offload failure or shutdown. We don't want to
|
||||
// abruptly stall nor resume L0 flushes in these cases.
|
||||
Err(CompactionError::Offload(_)) => {}
|
||||
Err(_) => self.compaction_failed.store(true, AtomicOrdering::Relaxed),
|
||||
};
|
||||
|
||||
result
|
||||
@@ -2144,14 +2188,31 @@ impl Timeline {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// Regardless of whether we're going to try_freeze_and_flush
|
||||
// or not, stop ingesting any more data.
|
||||
// cancel walreceiver to stop ingesting more data asap.
|
||||
//
|
||||
// Note that we're accepting a race condition here where we may
|
||||
// do the final flush below, before walreceiver observes the
|
||||
// cancellation and exits.
|
||||
// This means we may open a new InMemoryLayer after the final flush below.
|
||||
// Flush loop is also still running for a short while, so, in theory, it
|
||||
// could also make its way into the upload queue.
|
||||
//
|
||||
// If we wait for the shutdown of the walreceiver before moving on to the
|
||||
// flush, then that would be avoided. But we don't do it because the
|
||||
// walreceiver entertains reads internally, which means that it possibly
|
||||
// depends on the download of layers. Layer download is only sensitive to
|
||||
// the cancellation of the entire timeline, so cancelling the walreceiver
|
||||
// will have no effect on the individual get requests.
|
||||
// This would cause problems when there is a lot of ongoing downloads or
|
||||
// there is S3 unavailabilities, i.e. detach, deletion, etc would hang,
|
||||
// and we can't deallocate resources of the timeline, etc.
|
||||
let walreceiver = self.walreceiver.lock().unwrap().take();
|
||||
tracing::debug!(
|
||||
is_some = walreceiver.is_some(),
|
||||
"Waiting for WalReceiverManager..."
|
||||
);
|
||||
if let Some(walreceiver) = walreceiver {
|
||||
walreceiver.shutdown().await;
|
||||
walreceiver.cancel().await;
|
||||
}
|
||||
// ... and inform any waiters for newer LSNs that there won't be any.
|
||||
self.last_record_lsn.shutdown();
|
||||
@@ -2792,6 +2853,18 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
}

// HADRON
fn get_image_creation_timeout(&self) -> Option<Duration> {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.image_layer_force_creation_period
.or(self
.conf
.default_tenant_conf
.image_layer_force_creation_period)
}

fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings {
let tenant_conf = &self.tenant_conf.load();
tenant_conf
@@ -3061,7 +3134,9 @@ impl Timeline {
repartition_threshold: 0,
last_image_layer_creation_check_at: AtomicLsn::new(0),
last_image_layer_creation_check_instant: Mutex::new(None),

// HADRON
force_image_creation_lsn: AtomicLsn::new(0),
force_image_creation_lsn_computed_at: std::sync::Mutex::new(None),
last_received_wal: Mutex::new(None),
rel_size_latest_cache: RwLock::new(HashMap::new()),
rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)),
@@ -3112,7 +3187,7 @@ impl Timeline {

basebackup_cache: resources.basebackup_cache,

feature_resolver: resources.feature_resolver,
feature_resolver: resources.feature_resolver.clone(),
};

result.repartition_threshold =
@@ -4953,7 +5028,7 @@ impl Timeline {
ctx,
)
.await
.map_err(|e| FlushLayerError::from_anyhow(self, e.into()))?;
.map_err(|e| FlushLayerError::from_anyhow(self, e.into_anyhow()))?;

if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
@@ -4982,6 +5057,7 @@ impl Timeline {
.create_image_layers(
&partitions,
self.initdb_lsn,
None,
ImageLayerCreationMode::Initial,
ctx,
LastImageLayerCreationStatus::Initial,
@@ -5203,18 +5279,18 @@ impl Timeline {
partition_size: u64,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> {
) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), RepartitionError> {
let Ok(mut guard) = self.partitioning.try_write_guard() else {
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
// The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
// and hence before the compaction task starts.
return Err(CompactionError::Other(anyhow!(
return Err(RepartitionError::Other(anyhow!(
"repartition() called concurrently"
)));
};
let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read();
if lsn < *partition_lsn {
return Err(CompactionError::Other(anyhow!(
return Err(RepartitionError::Other(anyhow!(
"repartition() called with LSN going backwards, this should not happen"
)));
}
@@ -5235,7 +5311,10 @@ impl Timeline {
));
}

let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
let (dense_ks, sparse_ks) = self
.collect_keyspace(lsn, ctx)
.await
.map_err(RepartitionError::CollectKeyspace)?;
let dense_partitioning = dense_ks.partition(
&self.shard_identity,
partition_size,
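repartition() now returns a dedicated RepartitionError instead of CompactionError. Its definition is not shown in these hunks; a plausible minimal shape, assuming only what the hunks construct (a CollectKeyspace variant wrapping the keyspace-collection error and an Other variant for the anyhow! cases) and assuming thiserror/anyhow as dependencies, is sketched below. The stubbed CollectKeySpaceError is purely illustrative.

// Stub standing in for the pageserver's CollectKeySpaceError; the real type lives
// in pgdatadir_mapping and has more variants.
#[derive(Debug, thiserror::Error)]
#[error("collect keyspace failed")]
struct CollectKeySpaceError;

/// Assumed shape of the new repartition error, matching how the hunks construct it:
/// RepartitionError::Other(anyhow!(..)) and RepartitionError::CollectKeyspace(..).
#[derive(Debug, thiserror::Error)]
enum RepartitionError {
    #[error("collect keyspace: {0}")]
    CollectKeyspace(CollectKeySpaceError),
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

fn main() {
    let err = RepartitionError::Other(anyhow::anyhow!("repartition() called concurrently"));
    println!("{err}");
    let err = RepartitionError::CollectKeyspace(CollectKeySpaceError);
    println!("{err}");
}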
@@ -5250,14 +5329,19 @@ impl Timeline {
}

// Is it time to create a new image layer for the given partition? True if we want to generate.
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
async fn time_for_new_image_layer(
&self,
partition: &KeySpace,
lsn: Lsn,
force_image_creation_lsn: Option<Lsn>,
) -> bool {
let threshold = self.get_image_creation_threshold();

let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
let Ok(layers) = guard.layer_map() else {
return false;
};

let mut min_image_lsn: Lsn = Lsn::MAX;
let mut max_deltas = 0;
for part_range in &partition.ranges {
let image_coverage = layers.image_coverage(part_range, lsn);
@@ -5292,9 +5376,22 @@ impl Timeline {
return true;
}
}
min_image_lsn = min(min_image_lsn, img_lsn);
}
}

// HADRON
if min_image_lsn < force_image_creation_lsn.unwrap_or(Lsn(0)) && max_deltas > 0 {
info!(
"forcing image creation for partitioned range {}-{}. Min image LSN: {}, force image creation LSN: {}",
partition.ranges[0].start,
partition.ranges[0].end,
min_image_lsn,
force_image_creation_lsn.unwrap()
);
return true;
}

debug!(
max_deltas,
"none of the partitioned ranges had >= {threshold} deltas"
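The new force_image_creation_lsn argument makes time_for_new_image_layer return true when the oldest image layer covering the partition is older than the forced-creation LSN and there is at least one delta on top, even if the normal delta-count threshold has not been reached. A minimal pure-function sketch of that decision, with plain u64 standing in for Lsn, follows; it is an illustration of the rule, not the pageserver's code.

/// Decide whether to (re)create an image layer for a partition.
/// `min_image_lsn` is the oldest image LSN covering the partition (u64::MAX if none seen),
/// `max_deltas` the largest delta stack observed, `threshold` the normal delta threshold,
/// and `force_image_creation_lsn` the optional forcing cutoff.
fn should_create_image(
    min_image_lsn: u64,
    max_deltas: usize,
    threshold: usize,
    force_image_creation_lsn: Option<u64>,
) -> bool {
    if max_deltas >= threshold {
        return true;
    }
    // Forced path: the existing image coverage is older than the cutoff and
    // there is at least one delta on top of it.
    min_image_lsn < force_image_creation_lsn.unwrap_or(0) && max_deltas > 0
}

fn main() {
    assert!(should_create_image(100, 1, 3, Some(500))); // forced: image at 100 < cutoff 500
    assert!(!should_create_image(100, 0, 3, Some(500))); // no deltas, nothing to materialize
    assert!(!should_create_image(600, 1, 3, Some(500))); // image newer than the cutoff
    assert!(should_create_image(600, 3, 3, Some(500))); // normal threshold reached
}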
@@ -5520,7 +5617,7 @@ impl Timeline {
/// suffer from the lack of image layers
/// 2. For small tenants (that can mostly fit in RAM), we use a much longer interval
fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;
let large_timeline_threshold = self.conf.image_layer_generation_large_timeline_threshold;

let last_checks_at = self.last_image_layer_creation_check_at.load();
let distance = lsn
@@ -5534,12 +5631,12 @@ impl Timeline {
let mut time_based_decision = false;
let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap();
if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() {
let check_required_after = if Into::<u64>::into(&logical_size) >= LARGE_TENANT_THRESHOLD
{
self.get_checkpoint_timeout()
} else {
Duration::from_secs(3600 * 48)
};
let check_required_after =
if Some(Into::<u64>::into(&logical_size)) >= large_timeline_threshold {
self.get_checkpoint_timeout()
} else {
Duration::from_secs(3600 * 48)
};

time_based_decision = match *last_check_instant {
Some(last_check) => {
@@ -5567,10 +5664,12 @@ impl Timeline {
/// true = we have generate all image layers, false = we preempt the process for L0 compaction.
///
/// `partition_mode` is only for logging purpose and is not used anywhere in this function.
#[allow(clippy::too_many_arguments)]
async fn create_image_layers(
self: &Arc<Timeline>,
partitioning: &KeyPartitioning,
lsn: Lsn,
force_image_creation_lsn: Option<Lsn>,
mode: ImageLayerCreationMode,
ctx: &RequestContext,
last_status: LastImageLayerCreationStatus,
@@ -5674,7 +5773,11 @@ impl Timeline {
} else if let ImageLayerCreationMode::Try = mode {
// check_for_image_layers = false -> skip
// check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await {
if !check_for_image_layers
|| !self
.time_for_new_image_layer(partition, lsn, force_image_creation_lsn)
.await
{
start = img_range.end;
continue;
}
@@ -5995,57 +6098,88 @@ impl Drop for Timeline {
}
}

/// Top-level failure to compact.
#[derive(Debug, thiserror::Error)]
pub(crate) enum CompactionError {
#[error("The timeline or pageserver is shutting down")]
ShuttingDown,
/// Compaction tried to offload a timeline and failed
#[error("Failed to offload timeline: {0}")]
Offload(OffloadError),
/// Compaction cannot be done right now; page reconstruction and so on.
#[error("Failed to collect keyspace: {0}")]
CollectKeySpaceError(#[from] CollectKeySpaceError),
#[error(transparent)]
Other(anyhow::Error),
#[error("Compaction already running: {0}")]
AlreadyRunning(&'static str),
}
pub(crate) use compaction_error::CompactionError;
/// In a private mod to enforce that [`CompactionError::is_cancel`] is used
/// instead of `match`ing on [`CompactionError::ShuttingDown`].
mod compaction_error {
use utils::sync::gate::GateError;

impl CompactionError {
/// Errors that can be ignored, i.e., cancel and shutdown.
pub fn is_cancel(&self) -> bool {
matches!(
self,
Self::ShuttingDown
| Self::AlreadyRunning(_)
| Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled)
| Self::CollectKeySpaceError(CollectKeySpaceError::PageRead(
PageReconstructError::Cancelled
))
| Self::Offload(OffloadError::Cancelled)
)
use crate::{
pgdatadir_mapping::CollectKeySpaceError,
tenant::{PageReconstructError, blob_io::WriteBlobError, upload_queue::NotInitialized},
virtual_file::owned_buffers_io::write::FlushTaskError,
};

/// Top-level failure to compact. Use [`Self::is_cancel`].
#[derive(Debug, thiserror::Error)]
pub(crate) enum CompactionError {
/// Use [`Self::is_cancel`] instead of checking for this variant.
#[error("The timeline or pageserver is shutting down")]
#[allow(private_interfaces)]
ShuttingDown(ForbidMatching), // private ForbidMatching enforces use of [`Self::is_cancel`].
#[error(transparent)]
Other(anyhow::Error),
}

/// Critical errors that indicate data corruption.
pub fn is_critical(&self) -> bool {
matches!(
self,
Self::CollectKeySpaceError(
CollectKeySpaceError::Decode(_)
| CollectKeySpaceError::PageRead(
PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
)
)
)
}
}
#[derive(Debug)]
struct ForbidMatching;

impl From<OffloadError> for CompactionError {
fn from(e: OffloadError) -> Self {
match e {
OffloadError::Cancelled => Self::ShuttingDown,
_ => Self::Offload(e),
impl CompactionError {
pub fn new_cancelled() -> Self {
Self::ShuttingDown(ForbidMatching)
}
/// Errors that can be ignored, i.e., cancel and shutdown.
pub fn is_cancel(&self) -> bool {
let other = match self {
CompactionError::ShuttingDown(_) => return true,
CompactionError::Other(other) => other,
};

// The write path of compaction in particular often lacks differentiated
// handling errors stemming from cancellation from other errors.
// So, if requested, we also check the ::Other variant by downcasting.
// The list below has been found empirically from flaky tests and production logs.
// The process is simple: on ::Other(), compaction will print the enclosed
// anyhow::Error in debug mode, i.e., with backtrace. That backtrace contains the
// line where the write path / compaction code does undifferentiated error handling
// from a non-anyhow type to an anyhow type. Add the type to the list of downcasts
// below, following the same is_cancel() pattern.

let root_cause = other.root_cause();

let upload_queue = root_cause
.downcast_ref::<NotInitialized>()
.is_some_and(|e| e.is_stopping());
let timeline = root_cause
.downcast_ref::<PageReconstructError>()
.is_some_and(|e| e.is_cancel());
let buffered_writer_flush_task_canelled = root_cause
.downcast_ref::<FlushTaskError>()
.is_some_and(|e| e.is_cancel());
let write_blob_cancelled = root_cause
.downcast_ref::<WriteBlobError>()
.is_some_and(|e| e.is_cancel());
let gate_closed = root_cause
.downcast_ref::<GateError>()
.is_some_and(|e| e.is_cancel());
upload_queue
|| timeline
|| buffered_writer_flush_task_canelled
|| write_blob_cancelled
|| gate_closed
}
pub fn into_anyhow(self) -> anyhow::Error {
match self {
CompactionError::ShuttingDown(ForbidMatching) => anyhow::Error::new(self),
CompactionError::Other(e) => e,
}
}
pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self {
if err.is_cancel() {
Self::new_cancelled()
} else {
Self::Other(err.into_anyhow())
}
}
}
}
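The refactor above moves CompactionError into a private module and gives ShuttingDown a private ForbidMatching payload, so code elsewhere in the crate cannot construct that variant directly and is steered toward new_cancelled() and is_cancel(). A minimal standalone illustration of that technique, unrelated to the pageserver types and using only the standard library, is sketched here.

mod my_error {
    // Private unit struct: code outside this module cannot name or construct it,
    // so the Cancelled variant can only be built via new_cancelled().
    #[derive(Debug)]
    struct ForbidMatching;

    #[derive(Debug)]
    pub enum MyError {
        #[allow(private_interfaces)]
        Cancelled(ForbidMatching),
        Other(String),
    }

    impl MyError {
        pub fn new_cancelled() -> Self {
            MyError::Cancelled(ForbidMatching)
        }
        pub fn is_cancel(&self) -> bool {
            matches!(self, MyError::Cancelled(_))
        }
    }
}

use my_error::MyError;

fn main() {
    let e = MyError::new_cancelled();
    // Callers are expected to ask is_cancel() rather than matching on the variant.
    assert!(e.is_cancel());
    assert!(!MyError::Other("boom".into()).is_cancel());
}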
@@ -6057,7 +6191,7 @@ impl From<super::upload_queue::NotInitialized> for CompactionError {
CompactionError::Other(anyhow::anyhow!(value))
}
super::upload_queue::NotInitialized::ShuttingDown
| super::upload_queue::NotInitialized::Stopped => CompactionError::ShuttingDown,
| super::upload_queue::NotInitialized::Stopped => CompactionError::new_cancelled(),
}
}
}
@@ -6067,7 +6201,7 @@ impl From<super::storage_layer::layer::DownloadError> for CompactionError {
match e {
super::storage_layer::layer::DownloadError::TimelineShutdown
| super::storage_layer::layer::DownloadError::DownloadCancelled => {
CompactionError::ShuttingDown
CompactionError::new_cancelled()
}
super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads
| super::storage_layer::layer::DownloadError::DownloadRequired
@@ -6086,14 +6220,14 @@ impl From<super::storage_layer::layer::DownloadError> for CompactionError {

impl From<layer_manager::Shutdown> for CompactionError {
fn from(_: layer_manager::Shutdown) -> Self {
CompactionError::ShuttingDown
CompactionError::new_cancelled()
}
}

impl From<super::storage_layer::errors::PutError> for CompactionError {
fn from(e: super::storage_layer::errors::PutError) -> Self {
if e.is_cancel() {
CompactionError::ShuttingDown
CompactionError::new_cancelled()
} else {
CompactionError::Other(e.into_anyhow())
}
@@ -6192,7 +6326,7 @@ impl Timeline {
let mut guard = tokio::select! {
guard = self.layers.write(LayerManagerLockHolder::Compaction) => guard,
_ = self.cancel.cancelled() => {
return Err(CompactionError::ShuttingDown);
return Err(CompactionError::new_cancelled());
}
};

@@ -6748,7 +6882,7 @@ impl Timeline {
}

/// Reconstruct a value, using the given base image and WAL records in 'data'.
async fn reconstruct_value(
pub(crate) async fn reconstruct_value(
&self,
key: Key,
request_lsn: Lsn,

@@ -4,10 +4,11 @@
//!
//! The old legacy algorithm is implemented directly in `timeline.rs`.

use std::cmp::min;
use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
use std::ops::{Deref, Range};
use std::sync::Arc;
use std::time::{Duration, Instant};
use std::time::{Duration, Instant, SystemTime};

use super::layer_manager::LayerManagerLockHolder;
use super::{
@@ -16,7 +17,8 @@ use super::{
Timeline,
};

use crate::tenant::timeline::DeltaEntry;
use crate::pgdatadir_mapping::CollectKeySpaceError;
use crate::tenant::timeline::{DeltaEntry, RepartitionError};
use crate::walredo::RedoAttemptType;
use anyhow::{Context, anyhow};
use bytes::Bytes;
@@ -32,6 +34,7 @@ use pageserver_api::models::{CompactInfoResponse, CompactKeyRange};
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
use pageserver_compaction::helpers::{fully_contains, overlaps_with};
use pageserver_compaction::interface::*;
use postgres_ffi::to_pg_timestamp;
use serde::Serialize;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use tokio_util::sync::CancellationToken;
@@ -44,6 +47,7 @@ use wal_decoder::models::value::Value;

use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::statvfs::Statvfs;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::gc_block::GcBlock;
@@ -64,7 +68,7 @@ use crate::tenant::timeline::{
DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer,
ResidentLayer, drop_layer_manager_rlock,
};
use crate::tenant::{DeltaLayer, MaybeOffloaded};
use crate::tenant::{DeltaLayer, MaybeOffloaded, PageReconstructError};
use crate::virtual_file::{MaybeFatalIo, VirtualFile};

/// Maximum number of deltas before generating an image layer in bottom-most compaction.
@@ -571,7 +575,7 @@ impl GcCompactionQueue {
}
match res {
Ok(res) => Ok(res),
Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown),
Err(e) if e.is_cancel() => Err(e),
Err(_) => {
// There are some cases where traditional gc might collect some layer
// files causing gc-compaction cannot read the full history of the key.
@@ -591,9 +595,9 @@ impl GcCompactionQueue {
timeline: &Arc<Timeline>,
) -> Result<CompactionOutcome, CompactionError> {
let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else {
return Err(CompactionError::AlreadyRunning(
"cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.",
));
return Err(CompactionError::Other(anyhow::anyhow!(
"cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue."
)));
};
let has_pending_tasks;
let mut yield_for_l0 = false;
@@ -1259,13 +1263,19 @@ impl Timeline {
// Is the timeline being deleted?
if self.is_stopping() {
trace!("Dropping out of compaction on timeline shutdown");
return Err(CompactionError::ShuttingDown);
return Err(CompactionError::new_cancelled());
}

let target_file_size = self.get_checkpoint_distance();

// Define partitioning schema if needed

// HADRON
let force_image_creation_lsn = self
.get_or_compute_force_image_creation_lsn(cancel, ctx)
.await
.map_err(CompactionError::Other)?;

// 1. L0 Compact
let l0_outcome = {
let timer = self.metrics.compact_time_histo.start_timer();
@@ -1273,6 +1283,7 @@ impl Timeline {
.compact_level0(
target_file_size,
options.flags.contains(CompactFlags::ForceL0Compaction),
force_image_creation_lsn,
ctx,
)
.await?;
@@ -1375,6 +1386,7 @@ impl Timeline {
.create_image_layers(
&partitioning,
lsn,
force_image_creation_lsn,
mode,
&image_ctx,
self.last_image_layer_creation_status
@@ -1417,22 +1429,33 @@ impl Timeline {
}

// Suppress errors when cancelled.
Err(_) if self.cancel.is_cancelled() => {}
//
// Log other errors but continue. Failure to repartition is normal, if the timeline was just created
// as an empty timeline. Also in unit tests, when we use the timeline as a simple
// key-value store, ignoring the datadir layout. Log the error but continue.
//
// TODO:
// 1. shouldn't we return early here if we observe cancellation
// 2. Experiment: can we stop checking self.cancel here?
Err(_) if self.cancel.is_cancelled() => {} // TODO: try how we fare removing this branch
Err(err) if err.is_cancel() => {}

// Alert on critical errors that indicate data corruption.
Err(err) if err.is_critical() => {
Err(RepartitionError::CollectKeyspace(
e @ CollectKeySpaceError::Decode(_)
| e @ CollectKeySpaceError::PageRead(
PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
),
)) => {
// Alert on critical errors that indicate data corruption.
critical_timeline!(
self.tenant_shard_id,
self.timeline_id,
"could not compact, repartitioning keyspace failed: {err:?}"
"could not compact, repartitioning keyspace failed: {e:?}"
);
}

// Log other errors. No partitioning? This is normal, if the timeline was just created
// as an empty timeline. Also in unit tests, when we use the timeline as a simple
// key-value store, ignoring the datadir layout. Log the error but continue.
Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"),
Err(e) => error!(
"could not compact, repartitioning keyspace failed: {:?}",
e.into_anyhow()
),
};

let partition_count = self.partitioning.read().0.0.parts.len();
@@ -1460,6 +1483,63 @@ impl Timeline {
Ok(CompactionOutcome::Done)
}

/* BEGIN_HADRON */
// Get the force image creation LSN. Compute it if the last computed LSN is too old.
async fn get_or_compute_force_image_creation_lsn(
self: &Arc<Self>,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<Option<Lsn>> {
const FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL: Duration = Duration::from_secs(10 * 60); // 10 minutes
let image_layer_force_creation_period = self.get_image_creation_timeout();
if image_layer_force_creation_period.is_none() {
return Ok(None);
}

let image_layer_force_creation_period = image_layer_force_creation_period.unwrap();
let force_image_creation_lsn_computed_at =
*self.force_image_creation_lsn_computed_at.lock().unwrap();
if force_image_creation_lsn_computed_at.is_none()
|| force_image_creation_lsn_computed_at.unwrap().elapsed()
> FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL
{
let now: SystemTime = SystemTime::now();
let timestamp = now
.checked_sub(image_layer_force_creation_period)
.ok_or_else(|| {
anyhow::anyhow!(
"image creation timeout is too large: {image_layer_force_creation_period:?}"
)
})?;
let timestamp = to_pg_timestamp(timestamp);
let force_image_creation_lsn = match self
.find_lsn_for_timestamp(timestamp, cancel, ctx)
.await?
{
LsnForTimestamp::Present(lsn) | LsnForTimestamp::Future(lsn) => lsn,
_ => {
let gc_lsn = *self.get_applied_gc_cutoff_lsn();
tracing::info!(
"no LSN found for timestamp {timestamp:?}, using latest GC cutoff LSN {}",
gc_lsn
);
gc_lsn
}
};
self.force_image_creation_lsn
.store(force_image_creation_lsn);
*self.force_image_creation_lsn_computed_at.lock().unwrap() = Some(Instant::now());
tracing::info!(
"computed force image creation LSN: {}",
force_image_creation_lsn
);
Ok(Some(force_image_creation_lsn))
} else {
Ok(Some(self.force_image_creation_lsn.load()))
}
}
/* END_HADRON */
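get_or_compute_force_image_creation_lsn caches its result and only recomputes it every ten minutes: it subtracts image_layer_force_creation_period from the current time, converts that to a Postgres timestamp, and resolves it to an LSN, falling back to the GC cutoff. The caching/recompute decision can be sketched in isolation as below; the types are simplified stand-ins (u64 for Lsn), not the pageserver's.

use std::sync::Mutex;
use std::time::{Duration, Instant};

const RECOMPUTE_INTERVAL: Duration = Duration::from_secs(10 * 60);

/// Simplified stand-in for the cached force-image-creation LSN.
struct ForceLsnCache {
    value: Mutex<u64>,
    computed_at: Mutex<Option<Instant>>,
}

impl ForceLsnCache {
    fn get_or_compute(&self, compute: impl FnOnce() -> u64) -> u64 {
        let stale = match *self.computed_at.lock().unwrap() {
            None => true,
            Some(t) => t.elapsed() > RECOMPUTE_INTERVAL,
        };
        if stale {
            // In the real code this is where (now - force_creation_period) is
            // resolved to an LSN via find_lsn_for_timestamp.
            let lsn = compute();
            *self.value.lock().unwrap() = lsn;
            *self.computed_at.lock().unwrap() = Some(Instant::now());
            lsn
        } else {
            *self.value.lock().unwrap()
        }
    }
}

fn main() {
    let cache = ForceLsnCache {
        value: Mutex::new(0),
        computed_at: Mutex::new(None),
    };
    // First call computes; a second call within the interval reuses the cached value.
    assert_eq!(cache.get_or_compute(|| 42), 42);
    assert_eq!(cache.get_or_compute(|| 99), 42);
}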
/// Check for layers that are elegible to be rewritten:
/// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that
/// we don't indefinitely retain keys in this shard that aren't needed.
@@ -1612,7 +1692,7 @@ impl Timeline {

for (i, layer) in layers_to_rewrite.into_iter().enumerate() {
if self.cancel.is_cancelled() {
return Err(CompactionError::ShuttingDown);
return Err(CompactionError::new_cancelled());
}

info!(layer=%layer, "rewriting layer after shard split: {}/{}", i, total);
@@ -1710,7 +1790,7 @@ impl Timeline {
Ok(()) => {},
Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
return Err(CompactionError::ShuttingDown);
return Err(CompactionError::new_cancelled());
}
},
// Don't wait if there's L0 compaction to do. We don't need to update the outcome
@@ -1789,6 +1869,7 @@ impl Timeline {
self: &Arc<Self>,
target_file_size: u64,
force_compaction_ignore_threshold: bool,
force_compaction_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> Result<CompactionOutcome, CompactionError> {
let CompactLevel0Phase1Result {
@@ -1809,6 +1890,7 @@ impl Timeline {
stats,
target_file_size,
force_compaction_ignore_threshold,
force_compaction_lsn,
&ctx,
)
.instrument(phase1_span)
@@ -1831,6 +1913,7 @@ impl Timeline {
mut stats: CompactLevel0Phase1StatsBuilder,
target_file_size: u64,
force_compaction_ignore_threshold: bool,
force_compaction_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> Result<CompactLevel0Phase1Result, CompactionError> {
let begin = tokio::time::Instant::now();
@@ -1860,11 +1943,28 @@ impl Timeline {
return Ok(CompactLevel0Phase1Result::default());
}
} else {
debug!(
level0_deltas = level0_deltas.len(),
threshold, "too few deltas to compact"
);
return Ok(CompactLevel0Phase1Result::default());
// HADRON
let min_lsn = level0_deltas
.iter()
.map(|a| a.get_lsn_range().start)
.reduce(min);
if force_compaction_lsn.is_some()
&& min_lsn.is_some()
&& min_lsn.unwrap() < force_compaction_lsn.unwrap()
{
info!(
"forcing L0 compaction of {} L0 deltas. Min lsn: {}, force compaction lsn: {}",
level0_deltas.len(),
min_lsn.unwrap(),
force_compaction_lsn.unwrap()
);
} else {
debug!(
level0_deltas = level0_deltas.len(),
threshold, "too few deltas to compact"
);
return Ok(CompactLevel0Phase1Result::default());
}
}
}
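With force_compaction_lsn threaded through, compact_level0_phase1 now proceeds even below the normal L0 delta-count threshold when the oldest L0 delta starts before the forced-compaction LSN. That decision, sketched in isolation with u64 standing in for Lsn (illustrative only, not the pageserver's code):

/// Returns true if L0 compaction should run: either enough L0 deltas have piled up,
/// or the oldest L0 delta predates `force_compaction_lsn`.
fn should_compact_l0(
    l0_delta_start_lsns: &[u64],
    threshold: usize,
    force_compaction_lsn: Option<u64>,
) -> bool {
    if l0_delta_start_lsns.len() >= threshold {
        return true;
    }
    match (l0_delta_start_lsns.iter().min(), force_compaction_lsn) {
        (Some(&min_lsn), Some(force_lsn)) => min_lsn < force_lsn,
        _ => false,
    }
}

fn main() {
    assert!(should_compact_l0(&[10, 20], 10, Some(15))); // forced: oldest delta at 10 < 15
    assert!(!should_compact_l0(&[20, 30], 10, Some(15))); // too few deltas, none old enough
    assert!(should_compact_l0(&[20; 12], 10, None)); // normal threshold reached
}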
@@ -1973,7 +2073,7 @@ impl Timeline {
let mut all_keys = Vec::new();
for l in deltas_to_compact.iter() {
if self.cancel.is_cancelled() {
return Err(CompactionError::ShuttingDown);
return Err(CompactionError::new_cancelled());
}
let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
let keys = delta
@@ -2066,7 +2166,7 @@ impl Timeline {
stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();

if self.cancel.is_cancelled() {
return Err(CompactionError::ShuttingDown);
return Err(CompactionError::new_cancelled());
}

stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
@@ -2174,7 +2274,7 @@ impl Timeline {
// avoid hitting the cancellation token on every key. in benches, we end up
// shuffling an order of million keys per layer, this means we'll check it
// around tens of times per layer.
return Err(CompactionError::ShuttingDown);
return Err(CompactionError::new_cancelled());
}

let same_key = prev_key == Some(key);
@@ -2259,7 +2359,7 @@ impl Timeline {
if writer.is_none() {
if self.cancel.is_cancelled() {
// to be somewhat responsive to cancellation, check for each new layer
return Err(CompactionError::ShuttingDown);
return Err(CompactionError::new_cancelled());
}
// Create writer if not initiaized yet
writer = Some(
@@ -2515,10 +2615,13 @@ impl Timeline {
// Is the timeline being deleted?
if self.is_stopping() {
trace!("Dropping out of compaction on timeline shutdown");
return Err(CompactionError::ShuttingDown);
return Err(CompactionError::new_cancelled());
}

let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?;
let (dense_ks, _sparse_ks) = self
.collect_keyspace(end_lsn, ctx)
.await
.map_err(CompactionError::from_collect_keyspace)?;
// TODO(chi): ignore sparse_keyspace for now, compact it in the future.
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));

@@ -3174,7 +3277,7 @@ impl Timeline {
let gc_lock = async {
tokio::select! {
guard = self.gc_lock.lock() => Ok(guard),
_ = cancel.cancelled() => Err(CompactionError::ShuttingDown),
_ = cancel.cancelled() => Err(CompactionError::new_cancelled()),
}
};

@@ -3447,7 +3550,7 @@ impl Timeline {
}
total_layer_size += layer.layer_desc().file_size;
if cancel.is_cancelled() {
return Err(CompactionError::ShuttingDown);
return Err(CompactionError::new_cancelled());
}
let should_yield = yield_for_l0
&& self
@@ -3594,7 +3697,7 @@ impl Timeline {
}

if cancel.is_cancelled() {
return Err(CompactionError::ShuttingDown);
return Err(CompactionError::new_cancelled());
}

let should_yield = yield_for_l0

@@ -212,8 +212,12 @@
//! to the parent shard during a shard split. Eventually, the shard split task will
//! shut down the parent => case (1).

use std::collections::{HashMap, hash_map};
use std::sync::{Arc, Mutex, Weak};
use std::collections::HashMap;
use std::collections::hash_map;
use std::sync::Arc;
use std::sync::Mutex;
use std::sync::Weak;
use std::time::Duration;

use pageserver_api::shard::ShardIdentity;
use tracing::{instrument, trace};
@@ -333,6 +337,44 @@ enum RoutingResult<T: Types> {
}

impl<T: Types> Cache<T> {
/* BEGIN_HADRON */
/// A wrapper of do_get to resolve the tenant shard for a get page request.
#[instrument(level = "trace", skip_all)]
pub(crate) async fn get(
&mut self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
tenant_manager: &T::TenantManager,
) -> Result<Handle<T>, GetError<T>> {
const GET_MAX_RETRIES: usize = 10;
const RETRY_BACKOFF: Duration = Duration::from_millis(100);
let mut attempt = 0;
loop {
attempt += 1;
match self
.do_get(timeline_id, shard_selector, tenant_manager)
.await
{
Ok(handle) => return Ok(handle),
Err(e) => {
// Retry on tenant manager error to handle tenant split more gracefully
if attempt < GET_MAX_RETRIES {
tracing::warn!(
"Fail to resolve tenant shard in attempt {}: {:?}. Retrying...",
attempt,
e
);
tokio::time::sleep(RETRY_BACKOFF).await;
continue;
} else {
return Err(e);
}
}
}
}
}
/* END_HADRON */
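The new Cache::get wrapper retries do_get up to ten times with a fixed 100 ms backoff so that lookups racing with a tenant shard split can resolve instead of failing immediately. A generic version of that retry loop, assuming nothing about the pageserver types and using tokio for sleeping, could look like this:

use std::future::Future;
use std::time::Duration;

/// Retry an async fallible operation a fixed number of times with a constant backoff.
async fn retry_with_backoff<T, E, F, Fut>(
    max_retries: usize,
    backoff: Duration,
    mut op: F,
) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, E>>,
    E: std::fmt::Debug,
{
    let mut attempt = 0;
    loop {
        attempt += 1;
        match op().await {
            Ok(v) => return Ok(v),
            Err(e) if attempt < max_retries => {
                // Transient failure: wait a fixed interval and try again.
                eprintln!("attempt {attempt} failed: {e:?}, retrying...");
                tokio::time::sleep(backoff).await;
            }
            Err(e) => return Err(e),
        }
    }
}

#[tokio::main]
async fn main() {
    let mut calls = 0;
    let res: Result<&str, &str> = retry_with_backoff(10, Duration::from_millis(100), || {
        calls += 1;
        let current = calls;
        async move {
            if current < 3 { Err("not ready") } else { Ok("resolved") }
        }
    })
    .await;
    assert_eq!(res, Ok("resolved"));
}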
/// See module-level comment for details.
///
/// Does NOT check for the shutdown state of [`Types::Timeline`].
@@ -341,7 +383,7 @@ impl<T: Types> Cache<T> {
/// and if so, return an error that causes the page service to
/// close the connection.
#[instrument(level = "trace", skip_all)]
pub(crate) async fn get(
async fn do_get(
&mut self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
@@ -879,6 +921,7 @@ mod tests {
.await
.err()
.expect("documented behavior: can't get new handle after shutdown");

assert_eq!(cache.map.len(), 1, "next access cleans up the cache");

cache

@@ -17,8 +17,6 @@ pub(crate) enum OffloadError {
Cancelled,
#[error("Timeline is not archived")]
NotArchived,
#[error(transparent)]
RemoteStorage(anyhow::Error),
#[error("Offload or deletion already in progress")]
AlreadyInProgress,
#[error("Unexpected offload error: {0}")]
@@ -29,7 +27,7 @@ impl From<TenantManifestError> for OffloadError {
fn from(e: TenantManifestError) -> Self {
match e {
TenantManifestError::Cancelled => Self::Cancelled,
TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e),
TenantManifestError::RemoteStorage(e) => Self::Other(e),
}
}
}

@@ -63,7 +63,6 @@ pub struct WalReceiver {
/// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token.
/// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`.
cancel: CancellationToken,
task: tokio::task::JoinHandle<()>,
}

impl WalReceiver {
@@ -80,7 +79,7 @@ impl WalReceiver {
let loop_status = Arc::new(std::sync::RwLock::new(None));
let manager_status = Arc::clone(&loop_status);
let cancel = timeline.cancel.child_token();
let task = WALRECEIVER_RUNTIME.spawn({
let _task = WALRECEIVER_RUNTIME.spawn({
let cancel = cancel.clone();
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -121,25 +120,14 @@ impl WalReceiver {
Self {
manager_status,
cancel,
task,
}
}

#[instrument(skip_all, level = tracing::Level::DEBUG)]
pub async fn shutdown(self) {
pub async fn cancel(self) {
debug_assert_current_span_has_tenant_and_timeline_id();
debug!("cancelling walreceiver tasks");
self.cancel.cancel();
match self.task.await {
Ok(()) => debug!("Shutdown success"),
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => {
// already logged by panic hook
}
Err(je) => {
error!("shutdown walreceiver task join error: {je}")
}
}
}

pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {

@@ -100,6 +100,7 @@ pub(super) async fn connection_manager_loop_step(
// with other streams on this client (other connection managers). When
// object goes out of scope, stream finishes in drop() automatically.
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
let mut broker_reset_interval = tokio::time::interval(tokio::time::Duration::from_secs(30));
debug!("Subscribed for broker timeline updates");

loop {
@@ -156,7 +157,10 @@ pub(super) async fn connection_manager_loop_step(
// Got a new update from the broker
broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
match broker_update {
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
Ok(Some(broker_update)) => {
broker_reset_interval.reset();
connection_manager_state.register_timeline_update(broker_update);
},
Err(status) => {
match status.code() {
Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => {
@@ -178,6 +182,21 @@ pub(super) async fn connection_manager_loop_step(
}
},

// If we've not received any updates from the broker from a while, are waiting for WAL
// and have no safekeeper connection or connection candidates, then it might be that
// the broker subscription is wedged. Drop the currrent subscription and re-subscribe
// with the goal of unblocking it.
_ = broker_reset_interval.tick() => {
let awaiting_lsn = wait_lsn_status.borrow().is_some();
let no_candidates = connection_manager_state.wal_stream_candidates.is_empty();
let no_connection = connection_manager_state.wal_connection.is_none();

if awaiting_lsn && no_candidates && no_connection {
tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...");
broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
}
},

new_event = async {
// Reminder: this match arm needs to be cancellation-safe.
loop {
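The connection-manager hunk adds a 30-second tokio interval that is reset on every broker update; if it fires while the timeline is waiting for WAL with no safekeeper candidates or connection, the broker subscription is assumed wedged and recreated. A stripped-down sketch of that "reset on activity, act on silence" pattern, using a toy mpsc channel in place of the broker stream and much shorter durations, is shown here; it assumes tokio as a dependency and is not the pageserver's code.

use std::time::Duration;
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    let (tx, mut updates) = mpsc::channel::<u32>(8);
    // Keep one sender alive in main so the channel does not close when the
    // producer below goes quiet.
    let _tx_keepalive = tx.clone();

    // Simulated update source that goes quiet after a couple of messages.
    tokio::spawn(async move {
        for i in 0..2u32 {
            tx.send(i).await.unwrap();
            tokio::time::sleep(Duration::from_millis(20)).await;
        }
    });

    // Short interval for the example; the diff uses 30 seconds.
    let mut silence = tokio::time::interval(Duration::from_millis(100));
    silence.reset(); // don't fire immediately

    loop {
        tokio::select! {
            maybe_update = updates.recv() => {
                match maybe_update {
                    Some(update) => {
                        // Activity: push the silence deadline out again.
                        silence.reset();
                        println!("got update {update}");
                    }
                    None => break,
                }
            }
            _ = silence.tick() => {
                // No updates for a full interval: in the real code this is where
                // the broker subscription would be dropped and re-created.
                println!("no updates for a while, re-subscribing...");
                break;
            }
        }
    }
}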