mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-21 15:10:44 +00:00
Compare commits
40 Commits
release-pr
...
ruslan/sub
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ee7d8e4512 | ||
|
|
6549708b44 | ||
|
|
45631bf2e5 | ||
|
|
5dbca8c756 | ||
|
|
9f46ca5eb1 | ||
|
|
54da030a2d | ||
|
|
afa4e48071 | ||
|
|
b54872a4dc | ||
|
|
486829f875 | ||
|
|
4775aa3e01 | ||
|
|
1785f856b6 | ||
|
|
69b22b05da | ||
|
|
bf0007fa96 | ||
|
|
a9bbe7b00b | ||
|
|
7e3f64b309 | ||
|
|
9480d17de7 | ||
|
|
424004ec95 | ||
|
|
88d1a78260 | ||
|
|
8e544c7f99 | ||
|
|
4f49fc5b79 | ||
|
|
5461039c3f | ||
|
|
d6c36d103e | ||
|
|
fbb2416685 | ||
|
|
8072fae2fe | ||
|
|
3869d680f9 | ||
|
|
d3fa228d92 | ||
|
|
be6a259b85 | ||
|
|
af3ca24a5e | ||
|
|
8b44f5b479 | ||
|
|
d1445cf3eb | ||
|
|
67d3026fc4 | ||
|
|
09e62e9b98 | ||
|
|
e121da4bfc | ||
|
|
4a948c9781 | ||
|
|
b39f04ab99 | ||
|
|
6bd15908fb | ||
|
|
3e36d516c2 | ||
|
|
cc3af6f7dd | ||
|
|
5badc7a3fb | ||
|
|
3a73644308 |
@@ -33,7 +33,6 @@ workspace-members = [
|
||||
"compute_api",
|
||||
"consumption_metrics",
|
||||
"desim",
|
||||
"json",
|
||||
"metrics",
|
||||
"pageserver_api",
|
||||
"postgres_backend",
|
||||
|
||||
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -7,7 +7,6 @@ self-hosted-runner:
|
||||
- small-metal
|
||||
- small-arm64
|
||||
- unit-perf
|
||||
- unit-perf-aws-arm
|
||||
- us-east-2
|
||||
config-variables:
|
||||
- AWS_ECR_REGION
|
||||
|
||||
239
.github/workflows/build-macos.yml
vendored
239
.github/workflows/build-macos.yml
vendored
@@ -32,14 +32,162 @@ permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
make-all:
|
||||
build-pgxn:
|
||||
if: |
|
||||
inputs.pg_versions != '[]' || inputs.rebuild_everything ||
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
||||
github.ref_name == 'main'
|
||||
timeout-minutes: 30
|
||||
runs-on: macos-15
|
||||
strategy:
|
||||
matrix:
|
||||
postgres-version: ${{ inputs.rebuild_everything && fromJSON('["v14", "v15", "v16", "v17"]') || fromJSON(inputs.pg_versions) }}
|
||||
env:
|
||||
# Use release build only, to have less debug info around
|
||||
# Hence keeping target/ (and general cache size) smaller
|
||||
BUILD_TYPE: release
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- name: Checkout main repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Set pg ${{ matrix.postgres-version }} for caching
|
||||
id: pg_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-${{ matrix.postgres-version }}) | tee -a "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Cache postgres ${{ matrix.postgres-version }} build
|
||||
id: cache_pg
|
||||
uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
|
||||
with:
|
||||
path: pg_install/${{ matrix.postgres-version }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ matrix.postgres-version }}-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Checkout submodule vendor/postgres-${{ matrix.postgres-version }}
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
git submodule init vendor/postgres-${{ matrix.postgres-version }}
|
||||
git submodule update --depth 1 --recursive
|
||||
|
||||
- name: Install build dependencies
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
brew install flex bison openssl protobuf icu4c
|
||||
|
||||
- name: Set extra env for macOS
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
|
||||
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
|
||||
|
||||
- name: Build Postgres ${{ matrix.postgres-version }}
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
make postgres-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Build Neon Pg Ext ${{ matrix.postgres-version }}
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
make "neon-pg-ext-${{ matrix.postgres-version }}" -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Upload "pg_install/${{ matrix.postgres-version }}" artifact
|
||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: pg_install--${{ matrix.postgres-version }}
|
||||
path: pg_install/${{ matrix.postgres-version }}
|
||||
# The artifact is supposed to be used by the next job in the same workflow,
|
||||
# so there’s no need to store it for too long.
|
||||
retention-days: 1
|
||||
|
||||
build-walproposer-lib:
|
||||
if: |
|
||||
contains(inputs.pg_versions, 'v17') || inputs.rebuild_everything ||
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
||||
github.ref_name == 'main'
|
||||
timeout-minutes: 30
|
||||
runs-on: macos-15
|
||||
needs: [build-pgxn]
|
||||
env:
|
||||
# Use release build only, to have less debug info around
|
||||
# Hence keeping target/ (and general cache size) smaller
|
||||
BUILD_TYPE: release
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- name: Checkout main repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Set pg v17 for caching
|
||||
id: pg_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Download "pg_install/v17" artifact
|
||||
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
|
||||
with:
|
||||
name: pg_install--v17
|
||||
path: pg_install/v17
|
||||
|
||||
# `actions/download-artifact` doesn't preserve permissions:
|
||||
# https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss
|
||||
- name: Make pg_install/v*/bin/* executable
|
||||
run: |
|
||||
chmod +x pg_install/v*/bin/*
|
||||
|
||||
- name: Cache walproposer-lib
|
||||
id: cache_walproposer_lib
|
||||
uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
|
||||
with:
|
||||
path: build/walproposer-lib
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Checkout submodule vendor/postgres-v17
|
||||
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
git submodule init vendor/postgres-v17
|
||||
git submodule update --depth 1 --recursive
|
||||
|
||||
- name: Install build dependencies
|
||||
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
brew install flex bison openssl protobuf icu4c
|
||||
|
||||
- name: Set extra env for macOS
|
||||
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
|
||||
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
|
||||
|
||||
- name: Build walproposer-lib (only for v17)
|
||||
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
|
||||
run:
|
||||
make walproposer-lib -j$(sysctl -n hw.ncpu) PG_INSTALL_CACHED=1
|
||||
|
||||
- name: Upload "build/walproposer-lib" artifact
|
||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: build--walproposer-lib
|
||||
path: build/walproposer-lib
|
||||
# The artifact is supposed to be used by the next job in the same workflow,
|
||||
# so there’s no need to store it for too long.
|
||||
retention-days: 1
|
||||
|
||||
cargo-build:
|
||||
if: |
|
||||
inputs.pg_versions != '[]' || inputs.rebuild_rust_code || inputs.rebuild_everything ||
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
||||
github.ref_name == 'main'
|
||||
timeout-minutes: 60
|
||||
timeout-minutes: 30
|
||||
runs-on: macos-15
|
||||
needs: [build-pgxn, build-walproposer-lib]
|
||||
env:
|
||||
# Use release build only, to have less debug info around
|
||||
# Hence keeping target/ (and general cache size) smaller
|
||||
@@ -55,53 +203,41 @@ jobs:
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
brew install flex bison openssl protobuf icu4c
|
||||
|
||||
- name: Set extra env for macOS
|
||||
run: |
|
||||
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
|
||||
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
|
||||
|
||||
- name: Restore "pg_install/" cache
|
||||
id: cache_pg
|
||||
uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
|
||||
- name: Download "pg_install/v14" artifact
|
||||
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
|
||||
with:
|
||||
path: pg_install
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-install-v14-${{ hashFiles('Makefile', 'postgres.mk', 'vendor/revisions.json') }}
|
||||
name: pg_install--v14
|
||||
path: pg_install/v14
|
||||
|
||||
- name: Checkout vendor/postgres submodules
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
git submodule init
|
||||
git submodule update --depth 1 --recursive
|
||||
- name: Download "pg_install/v15" artifact
|
||||
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
|
||||
with:
|
||||
name: pg_install--v15
|
||||
path: pg_install/v15
|
||||
|
||||
- name: Build Postgres
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
make postgres -j$(sysctl -n hw.ncpu)
|
||||
- name: Download "pg_install/v16" artifact
|
||||
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
|
||||
with:
|
||||
name: pg_install--v16
|
||||
path: pg_install/v16
|
||||
|
||||
# This isn't strictly necessary, but it makes the cached and non-cached builds more similar,
|
||||
# When pg_install is restored from cache, there is no 'build/' directory. By removing it
|
||||
# in a non-cached build too, we enforce that the rest of the steps don't depend on it,
|
||||
# so that we notice any build caching bugs earlier.
|
||||
- name: Remove build artifacts
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
rm -rf build
|
||||
- name: Download "pg_install/v17" artifact
|
||||
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
|
||||
with:
|
||||
name: pg_install--v17
|
||||
path: pg_install/v17
|
||||
|
||||
# Explicitly update the rust toolchain before running 'make'. The parallel make build can
|
||||
# invoke 'cargo build' more than once in parallel, for different crates. That's OK, 'cargo'
|
||||
# does its own locking to prevent concurrent builds from stepping on each other's
|
||||
# toes. However, it will first try to update the toolchain, and that step is not locked the
|
||||
# same way. To avoid two toolchain updates running in parallel and stepping on each other's
|
||||
# toes, ensure that the toolchain is up-to-date beforehand.
|
||||
- name: Update rust toolchain
|
||||
- name: Download "build/walproposer-lib" artifact
|
||||
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
|
||||
with:
|
||||
name: build--walproposer-lib
|
||||
path: build/walproposer-lib
|
||||
|
||||
# `actions/download-artifact` doesn't preserve permissions:
|
||||
# https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss
|
||||
- name: Make pg_install/v*/bin/* executable
|
||||
run: |
|
||||
rustup --version &&
|
||||
rustup update &&
|
||||
rustup show
|
||||
chmod +x pg_install/v*/bin/*
|
||||
|
||||
- name: Cache cargo deps
|
||||
uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
|
||||
@@ -113,12 +249,17 @@ jobs:
|
||||
target
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
|
||||
|
||||
# Build the neon-specific postgres extensions, and all the Rust bits.
|
||||
#
|
||||
# Pass PG_INSTALL_CACHED=1 because PostgreSQL was already built and cached
|
||||
# separately.
|
||||
- name: Build all
|
||||
run: PG_INSTALL_CACHED=1 BUILD_TYPE=release make -j$(sysctl -n hw.ncpu) all
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
brew install flex bison openssl protobuf icu4c
|
||||
|
||||
- name: Set extra env for macOS
|
||||
run: |
|
||||
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
|
||||
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
|
||||
|
||||
- name: Run cargo build
|
||||
run: cargo build --all --release -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Check that no warnings are produced
|
||||
run: ./run_clippy.sh
|
||||
|
||||
4
.github/workflows/build_and_test.yml
vendored
4
.github/workflows/build_and_test.yml
vendored
@@ -306,14 +306,14 @@ jobs:
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
runs-on: [ self-hosted, unit-perf-aws-arm ]
|
||||
runs-on: [ self-hosted, unit-perf ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
credentials:
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
# for changed limits, see comments on `options:` earlier in this file
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 --ulimit nofile=65536:65536 --security-opt seccomp=unconfined
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
|
||||
4
.github/workflows/periodic_pagebench.yml
vendored
4
.github/workflows/periodic_pagebench.yml
vendored
@@ -1,4 +1,4 @@
|
||||
name: Periodic pagebench performance test on unit-perf-aws-arm runners
|
||||
name: Periodic pagebench performance test on unit-perf hetzner runner
|
||||
|
||||
on:
|
||||
schedule:
|
||||
@@ -40,7 +40,7 @@ jobs:
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
runs-on: [ self-hosted, unit-perf-aws-arm ]
|
||||
runs-on: [ self-hosted, unit-perf ]
|
||||
container:
|
||||
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
|
||||
credentials:
|
||||
|
||||
4
.github/workflows/proxy-benchmark.yml
vendored
4
.github/workflows/proxy-benchmark.yml
vendored
@@ -1,4 +1,4 @@
|
||||
name: Periodic proxy performance test on unit-perf-aws-arm runners
|
||||
name: Periodic proxy performance test on unit-perf hetzner runner
|
||||
|
||||
on:
|
||||
push: # TODO: remove after testing
|
||||
@@ -32,7 +32,7 @@ jobs:
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
runs-on: [self-hosted, unit-perf-aws-arm]
|
||||
runs-on: [self-hosted, unit-perf]
|
||||
timeout-minutes: 60 # 1h timeout
|
||||
container:
|
||||
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
|
||||
|
||||
5
.gitignore
vendored
5
.gitignore
vendored
@@ -25,6 +25,11 @@ compaction-suite-results.*
|
||||
*.o
|
||||
*.so
|
||||
*.Po
|
||||
*.pid
|
||||
|
||||
# pgindent typedef lists
|
||||
*.list
|
||||
|
||||
# various files for local testing
|
||||
/proxy/.subzero
|
||||
local_proxy.json
|
||||
|
||||
3281
Cargo.lock
generated
3281
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -42,12 +42,10 @@ members = [
|
||||
"libs/walproposer",
|
||||
"libs/wal_decoder",
|
||||
"libs/postgres_initdb",
|
||||
"libs/proxy/json",
|
||||
"libs/proxy/postgres-protocol2",
|
||||
"libs/proxy/postgres-types2",
|
||||
"libs/proxy/tokio-postgres2",
|
||||
"endpoint_storage",
|
||||
"pgxn/neon/communicator",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -257,7 +255,6 @@ desim = { version = "0.1", path = "./libs/desim" }
|
||||
endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
|
||||
http-utils = { version = "0.1", path = "./libs/http-utils/" }
|
||||
metrics = { version = "0.1", path = "./libs/metrics/" }
|
||||
neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
|
||||
pageserver = { path = "./pageserver" }
|
||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
||||
pageserver_client = { path = "./pageserver/client" }
|
||||
@@ -287,7 +284,6 @@ walproposer = { version = "0.1", path = "./libs/walproposer/" }
|
||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
|
||||
## Build dependencies
|
||||
cbindgen = "0.29.0"
|
||||
criterion = "0.5.1"
|
||||
rcgen = "0.13"
|
||||
rstest = "0.18"
|
||||
|
||||
53
Dockerfile
53
Dockerfile
@@ -30,18 +30,7 @@ ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR}
|
||||
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA}
|
||||
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA}
|
||||
|
||||
# Naive way:
|
||||
#
|
||||
# 1. COPY . .
|
||||
# 1. make neon-pg-ext
|
||||
# 2. cargo build <storage binaries>
|
||||
#
|
||||
# But to enable docker to cache intermediate layers, we perform a few preparatory steps:
|
||||
#
|
||||
# - Build all postgres versions, depending on just the contents of vendor/
|
||||
# - Use cargo chef to build all rust dependencies
|
||||
|
||||
# 1. Build all postgres versions
|
||||
# Build Postgres
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
@@ -49,15 +38,17 @@ COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
|
||||
COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
|
||||
COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
|
||||
COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17
|
||||
COPY --chown=nonroot pgxn pgxn
|
||||
COPY --chown=nonroot Makefile Makefile
|
||||
COPY --chown=nonroot postgres.mk postgres.mk
|
||||
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
|
||||
|
||||
ENV BUILD_TYPE=release
|
||||
RUN set -e \
|
||||
&& mold -run make -j $(nproc) -s postgres
|
||||
&& mold -run make -j $(nproc) -s neon-pg-ext \
|
||||
&& tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz .
|
||||
|
||||
# 2. Prepare cargo-chef recipe
|
||||
# Prepare cargo-chef recipe
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS plan
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
@@ -65,22 +56,23 @@ COPY --chown=nonroot . .
|
||||
|
||||
RUN cargo chef prepare --recipe-path recipe.json
|
||||
|
||||
# Main build image
|
||||
# Build neon binaries
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS build
|
||||
WORKDIR /home/nonroot
|
||||
ARG GIT_VERSION=local
|
||||
ARG BUILD_TAG
|
||||
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server
|
||||
COPY --from=plan /home/nonroot/recipe.json recipe.json
|
||||
|
||||
ARG ADDITIONAL_RUSTFLAGS=""
|
||||
|
||||
# 3. Build cargo dependencies. Note that this step doesn't depend on anything else than
|
||||
# `recipe.json`, so the layer can be reused as long as none of the dependencies change.
|
||||
COPY --from=plan /home/nonroot/recipe.json recipe.json
|
||||
RUN set -e \
|
||||
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo chef cook --locked --release --recipe-path recipe.json
|
||||
|
||||
# Perform the main build. We reuse the Postgres build artifacts from the intermediate 'pg-build'
|
||||
# layer, and the cargo dependencies built in the previous step.
|
||||
COPY --chown=nonroot --from=pg-build /home/nonroot/pg_install/ pg_install
|
||||
COPY --chown=nonroot . .
|
||||
|
||||
RUN set -e \
|
||||
@@ -95,10 +87,10 @@ RUN set -e \
|
||||
--bin endpoint_storage \
|
||||
--bin neon_local \
|
||||
--bin storage_scrubber \
|
||||
--locked --release \
|
||||
&& mold -run make -j $(nproc) -s neon-pg-ext
|
||||
--locked --release
|
||||
|
||||
# Assemble the final image
|
||||
# Build final image
|
||||
#
|
||||
FROM $BASE_IMAGE_SHA
|
||||
WORKDIR /data
|
||||
|
||||
@@ -138,15 +130,12 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/endpoint_storage /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin
|
||||
COPY --from=build /home/nonroot/pg_install/v14 /usr/local/v14/
|
||||
COPY --from=build /home/nonroot/pg_install/v15 /usr/local/v15/
|
||||
COPY --from=build /home/nonroot/pg_install/v16 /usr/local/v16/
|
||||
COPY --from=build /home/nonroot/pg_install/v17 /usr/local/v17/
|
||||
|
||||
# Deprecated: Old deployment scripts use this tarball which contains all the Postgres binaries.
|
||||
# That's obsolete, since all the same files are also present under /usr/local/v*. But to keep the
|
||||
# old scripts working for now, create the tarball.
|
||||
RUN tar -C /usr/local -cvzf /data/postgres_install.tar.gz v14 v15 v16 v17
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v17 /usr/local/v17/
|
||||
COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
|
||||
|
||||
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
|
||||
# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
|
||||
|
||||
16
Makefile
16
Makefile
@@ -30,18 +30,11 @@ ifeq ($(BUILD_TYPE),release)
|
||||
PG_CFLAGS += -O2 -g3 $(CFLAGS)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
CARGO_PROFILE ?= --profile=release
|
||||
# NEON_CARGO_ARTIFACT_TARGET_DIR is the directory where `cargo build` places
|
||||
# the final build artifacts. There is unfortunately no easy way of changing
|
||||
# it to a fully predictable path, nor to extract the path with a simple
|
||||
# command. See https://github.com/rust-lang/cargo/issues/9661 and
|
||||
# https://github.com/rust-lang/cargo/issues/6790.
|
||||
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
|
||||
else ifeq ($(BUILD_TYPE),debug)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
|
||||
PG_CFLAGS += -O0 -g3 $(CFLAGS)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
CARGO_PROFILE ?= --profile=dev
|
||||
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
|
||||
else
|
||||
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
|
||||
endif
|
||||
@@ -109,7 +102,7 @@ all: neon postgres-install neon-pg-ext
|
||||
|
||||
### Neon Rust bits
|
||||
#
|
||||
# The 'postgres_ffi' crate depends on the Postgres headers.
|
||||
# The 'postgres_ffi' depends on the Postgres headers.
|
||||
.PHONY: neon
|
||||
neon: postgres-headers-install walproposer-lib cargo-target-dir
|
||||
+@echo "Compiling Neon"
|
||||
@@ -122,13 +115,10 @@ cargo-target-dir:
|
||||
test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG
|
||||
|
||||
.PHONY: neon-pg-ext-%
|
||||
neon-pg-ext-%: postgres-install-% cargo-target-dir
|
||||
neon-pg-ext-%: postgres-install-%
|
||||
+@echo "Compiling neon-specific Postgres extensions for $*"
|
||||
mkdir -p $(BUILD_DIR)/pgxn-$*
|
||||
$(MAKE) PG_CONFIG="$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config" COPT='$(COPT)' \
|
||||
NEON_CARGO_ARTIFACT_TARGET_DIR="$(NEON_CARGO_ARTIFACT_TARGET_DIR)" \
|
||||
CARGO_BUILD_FLAGS="$(CARGO_BUILD_FLAGS)" \
|
||||
CARGO_PROFILE="$(CARGO_PROFILE)" \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
-C $(BUILD_DIR)/pgxn-$*\
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/Makefile install
|
||||
|
||||
|
||||
@@ -1636,14 +1636,11 @@ RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN)
|
||||
# compile neon extensions
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build-with-cargo AS neon-ext-build
|
||||
FROM pg-build AS neon-ext-build
|
||||
ARG PG_VERSION
|
||||
|
||||
USER root
|
||||
COPY . .
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) -C pgxn -s install-compute \
|
||||
BUILD_TYPE=release CARGO_BUILD_FLAGS="--locked --release" NEON_CARGO_ARTIFACT_TARGET_DIR="$(pwd)/target/release"
|
||||
COPY pgxn/ pgxn/
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) -C pgxn -s install-compute
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
|
||||
@@ -29,8 +29,7 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
|
||||
use std::sync::{Arc, Condvar, Mutex, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{env, fs};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::{spawn, time};
|
||||
use tokio::spawn;
|
||||
use tracing::{Instrument, debug, error, info, instrument, warn};
|
||||
use url::Url;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -108,8 +107,6 @@ pub struct ComputeNodeParams {
|
||||
pub installed_extensions_collection_interval: Arc<AtomicU64>,
|
||||
}
|
||||
|
||||
type TaskHandle = Mutex<Option<JoinHandle<()>>>;
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
pub struct ComputeNode {
|
||||
pub params: ComputeNodeParams,
|
||||
@@ -132,8 +129,7 @@ pub struct ComputeNode {
|
||||
pub compute_ctl_config: ComputeCtlConfig,
|
||||
|
||||
/// Handle to the extension stats collection task
|
||||
extension_stats_task: TaskHandle,
|
||||
lfc_offload_task: TaskHandle,
|
||||
extension_stats_task: Mutex<Option<tokio::task::JoinHandle<()>>>,
|
||||
}
|
||||
|
||||
// store some metrics about download size that might impact startup time
|
||||
@@ -372,7 +368,7 @@ fn maybe_cgexec(cmd: &str) -> Command {
|
||||
|
||||
struct PostgresHandle {
|
||||
postgres: std::process::Child,
|
||||
log_collector: JoinHandle<Result<()>>,
|
||||
log_collector: tokio::task::JoinHandle<Result<()>>,
|
||||
}
|
||||
|
||||
impl PostgresHandle {
|
||||
@@ -386,7 +382,7 @@ struct StartVmMonitorResult {
|
||||
#[cfg(target_os = "linux")]
|
||||
token: tokio_util::sync::CancellationToken,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor: Option<JoinHandle<Result<()>>>,
|
||||
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
|
||||
}
|
||||
|
||||
impl ComputeNode {
|
||||
@@ -437,7 +433,6 @@ impl ComputeNode {
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
compute_ctl_config: config.compute_ctl_config,
|
||||
extension_stats_task: Mutex::new(None),
|
||||
lfc_offload_task: Mutex::new(None),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -525,8 +520,8 @@ impl ComputeNode {
|
||||
None
|
||||
};
|
||||
|
||||
// Terminate the extension stats collection task
|
||||
this.terminate_extension_stats_task();
|
||||
this.terminate_lfc_offload_task();
|
||||
|
||||
// Terminate the vm_monitor so it releases the file watcher on
|
||||
// /sys/fs/cgroup/neon-postgres.
|
||||
@@ -856,15 +851,12 @@ impl ComputeNode {
|
||||
// Log metrics so that we can search for slow operations in logs
|
||||
info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished");
|
||||
|
||||
// Spawn the extension stats background task
|
||||
self.spawn_extension_stats_task();
|
||||
|
||||
if pspec.spec.autoprewarm {
|
||||
info!("autoprewarming on startup as requested");
|
||||
self.prewarm_lfc(None);
|
||||
}
|
||||
if let Some(seconds) = pspec.spec.offload_lfc_interval_seconds {
|
||||
self.spawn_lfc_offload_task(Duration::from_secs(seconds.into()));
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -2365,7 +2357,10 @@ LIMIT 100",
|
||||
}
|
||||
|
||||
pub fn spawn_extension_stats_task(&self) {
|
||||
self.terminate_extension_stats_task();
|
||||
// Cancel any existing task
|
||||
if let Some(handle) = self.extension_stats_task.lock().unwrap().take() {
|
||||
handle.abort();
|
||||
}
|
||||
|
||||
let conf = self.tokio_conn_conf.clone();
|
||||
let atomic_interval = self.params.installed_extensions_collection_interval.clone();
|
||||
@@ -2401,30 +2396,8 @@ LIMIT 100",
|
||||
}
|
||||
|
||||
fn terminate_extension_stats_task(&self) {
|
||||
if let Some(h) = self.extension_stats_task.lock().unwrap().take() {
|
||||
h.abort()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn spawn_lfc_offload_task(self: &Arc<Self>, interval: Duration) {
|
||||
self.terminate_lfc_offload_task();
|
||||
let secs = interval.as_secs();
|
||||
info!("spawning lfc offload worker with {secs}s interval");
|
||||
let this = self.clone();
|
||||
let handle = spawn(async move {
|
||||
let mut interval = time::interval(interval);
|
||||
interval.tick().await; // returns immediately
|
||||
loop {
|
||||
interval.tick().await;
|
||||
this.offload_lfc_async().await;
|
||||
}
|
||||
});
|
||||
*self.lfc_offload_task.lock().unwrap() = Some(handle);
|
||||
}
|
||||
|
||||
fn terminate_lfc_offload_task(&self) {
|
||||
if let Some(h) = self.lfc_offload_task.lock().unwrap().take() {
|
||||
h.abort()
|
||||
if let Some(handle) = self.extension_stats_task.lock().unwrap().take() {
|
||||
handle.abort();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ use compute_api::responses::LfcOffloadState;
|
||||
use compute_api::responses::LfcPrewarmState;
|
||||
use http::StatusCode;
|
||||
use reqwest::Client;
|
||||
use std::mem::replace;
|
||||
use std::sync::Arc;
|
||||
use tokio::{io::AsyncReadExt, spawn};
|
||||
use tracing::{error, info};
|
||||
@@ -89,15 +88,17 @@ impl ComputeNode {
|
||||
self.state.lock().unwrap().lfc_offload_state.clone()
|
||||
}
|
||||
|
||||
/// If there is a prewarm request ongoing, return false, true otherwise
|
||||
/// Returns false if there is a prewarm request ongoing, true otherwise
|
||||
pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
|
||||
crate::metrics::LFC_PREWARM_REQUESTS.inc();
|
||||
{
|
||||
let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
|
||||
if let LfcPrewarmState::Prewarming = replace(state, LfcPrewarmState::Prewarming) {
|
||||
if let LfcPrewarmState::Prewarming =
|
||||
std::mem::replace(state, LfcPrewarmState::Prewarming)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
crate::metrics::LFC_PREWARMS.inc();
|
||||
|
||||
let cloned = self.clone();
|
||||
spawn(async move {
|
||||
@@ -151,39 +152,30 @@ impl ComputeNode {
|
||||
.map(|_| ())
|
||||
}
|
||||
|
||||
/// If offload request is ongoing, return false, true otherwise
|
||||
/// Returns false if there is an offload request ongoing, true otherwise
|
||||
pub fn offload_lfc(self: &Arc<Self>) -> bool {
|
||||
crate::metrics::LFC_OFFLOAD_REQUESTS.inc();
|
||||
{
|
||||
let state = &mut self.state.lock().unwrap().lfc_offload_state;
|
||||
if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading {
|
||||
if let LfcOffloadState::Offloading =
|
||||
std::mem::replace(state, LfcOffloadState::Offloading)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
let cloned = self.clone();
|
||||
spawn(async move { cloned.offload_lfc_with_state_update().await });
|
||||
true
|
||||
}
|
||||
|
||||
pub async fn offload_lfc_async(self: &Arc<Self>) {
|
||||
{
|
||||
let state = &mut self.state.lock().unwrap().lfc_offload_state;
|
||||
if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading {
|
||||
spawn(async move {
|
||||
let Err(err) = cloned.offload_lfc_impl().await else {
|
||||
cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
|
||||
return;
|
||||
}
|
||||
}
|
||||
self.offload_lfc_with_state_update().await
|
||||
}
|
||||
|
||||
async fn offload_lfc_with_state_update(&self) {
|
||||
crate::metrics::LFC_OFFLOADS.inc();
|
||||
let Err(err) = self.offload_lfc_impl().await else {
|
||||
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
|
||||
return;
|
||||
};
|
||||
error!(%err);
|
||||
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
|
||||
error: err.to_string(),
|
||||
};
|
||||
};
|
||||
error!(%err);
|
||||
cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
|
||||
error: err.to_string(),
|
||||
};
|
||||
});
|
||||
true
|
||||
}
|
||||
|
||||
async fn offload_lfc_impl(&self) -> Result<()> {
|
||||
|
||||
@@ -97,18 +97,20 @@ pub(crate) static PG_TOTAL_DOWNTIME_MS: Lazy<GenericCounter<AtomicU64>> = Lazy::
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static LFC_PREWARMS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
/// Needed as neon.file_cache_prewarm_batch == 0 doesn't mean we never tried to prewarm.
|
||||
/// On the other hand, LFC_PREWARMED_PAGES is excessive as we can GET /lfc/prewarm
|
||||
pub(crate) static LFC_PREWARM_REQUESTS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"compute_ctl_lfc_prewarms_total",
|
||||
"Total number of LFC prewarms requested by compute_ctl or autoprewarm option",
|
||||
"compute_ctl_lfc_prewarm_requests_total",
|
||||
"Total number of LFC prewarm requests made by compute_ctl",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static LFC_OFFLOAD_REQUESTS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"compute_ctl_lfc_offloads_total",
|
||||
"Total number of LFC offloads requested by compute_ctl or lfc_offload_period_seconds option",
|
||||
"compute_ctl_lfc_offload_requests_total",
|
||||
"Total number of LFC offload requests made by compute_ctl",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -122,7 +124,7 @@ pub fn collect() -> Vec<MetricFamily> {
|
||||
metrics.extend(AUDIT_LOG_DIR_SIZE.collect());
|
||||
metrics.extend(PG_CURR_DOWNTIME_MS.collect());
|
||||
metrics.extend(PG_TOTAL_DOWNTIME_MS.collect());
|
||||
metrics.extend(LFC_PREWARMS.collect());
|
||||
metrics.extend(LFC_OFFLOADS.collect());
|
||||
metrics.extend(LFC_PREWARM_REQUESTS.collect());
|
||||
metrics.extend(LFC_OFFLOAD_REQUESTS.collect());
|
||||
metrics
|
||||
}
|
||||
|
||||
@@ -31,7 +31,6 @@ mod pg_helpers_tests {
|
||||
wal_level = logical
|
||||
hot_standby = on
|
||||
autoprewarm = off
|
||||
offload_lfc_interval_seconds = 20
|
||||
neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
|
||||
wal_log_hints = on
|
||||
log_connections = on
|
||||
|
||||
@@ -675,16 +675,6 @@ struct EndpointStartCmdArgs {
|
||||
#[arg(default_value = "90s")]
|
||||
start_timeout: Duration,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Download LFC cache from endpoint storage on endpoint startup",
|
||||
default_value = "false"
|
||||
)]
|
||||
autoprewarm: bool,
|
||||
|
||||
#[clap(long, help = "Upload LFC cache to endpoint storage periodically")]
|
||||
offload_lfc_interval_seconds: Option<std::num::NonZeroU64>,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Run in development mode, skipping VM-specific operations like process termination",
|
||||
@@ -1595,24 +1585,22 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
let endpoint_storage_token = env.generate_auth_token(&claims)?;
|
||||
let endpoint_storage_addr = env.endpoint_storage.listen_addr.to_string();
|
||||
|
||||
let args = control_plane::endpoint::EndpointStartArgs {
|
||||
auth_token,
|
||||
endpoint_storage_token,
|
||||
endpoint_storage_addr,
|
||||
safekeepers_generation,
|
||||
safekeepers,
|
||||
pageservers,
|
||||
remote_ext_base_url: remote_ext_base_url.clone(),
|
||||
shard_stripe_size: stripe_size.0 as usize,
|
||||
create_test_user: args.create_test_user,
|
||||
start_timeout: args.start_timeout,
|
||||
autoprewarm: args.autoprewarm,
|
||||
offload_lfc_interval_seconds: args.offload_lfc_interval_seconds,
|
||||
dev: args.dev,
|
||||
};
|
||||
|
||||
println!("Starting existing endpoint {endpoint_id}...");
|
||||
endpoint.start(args).await?;
|
||||
endpoint
|
||||
.start(
|
||||
&auth_token,
|
||||
endpoint_storage_token,
|
||||
endpoint_storage_addr,
|
||||
safekeepers_generation,
|
||||
safekeepers,
|
||||
pageservers,
|
||||
remote_ext_base_url.as_ref(),
|
||||
stripe_size.0 as usize,
|
||||
args.create_test_user,
|
||||
args.start_timeout,
|
||||
args.dev,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
EndpointCmd::Reconfigure(args) => {
|
||||
let endpoint_id = &args.endpoint_id;
|
||||
|
||||
@@ -373,22 +373,6 @@ impl std::fmt::Display for EndpointTerminateMode {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct EndpointStartArgs {
|
||||
pub auth_token: Option<String>,
|
||||
pub endpoint_storage_token: String,
|
||||
pub endpoint_storage_addr: String,
|
||||
pub safekeepers_generation: Option<SafekeeperGeneration>,
|
||||
pub safekeepers: Vec<NodeId>,
|
||||
pub pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
pub remote_ext_base_url: Option<String>,
|
||||
pub shard_stripe_size: usize,
|
||||
pub create_test_user: bool,
|
||||
pub start_timeout: Duration,
|
||||
pub autoprewarm: bool,
|
||||
pub offload_lfc_interval_seconds: Option<std::num::NonZeroU64>,
|
||||
pub dev: bool,
|
||||
}
|
||||
|
||||
impl Endpoint {
|
||||
fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
|
||||
if !entry.file_type()?.is_dir() {
|
||||
@@ -693,7 +677,21 @@ impl Endpoint {
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn start(&self, args: EndpointStartArgs) -> Result<()> {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
endpoint_storage_token: String,
|
||||
endpoint_storage_addr: String,
|
||||
safekeepers_generation: Option<SafekeeperGeneration>,
|
||||
safekeepers: Vec<NodeId>,
|
||||
pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
remote_ext_base_url: Option<&String>,
|
||||
shard_stripe_size: usize,
|
||||
create_test_user: bool,
|
||||
start_timeout: Duration,
|
||||
dev: bool,
|
||||
) -> Result<()> {
|
||||
if self.status() == EndpointStatus::Running {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
}
|
||||
@@ -706,10 +704,10 @@ impl Endpoint {
|
||||
std::fs::remove_dir_all(self.pgdata())?;
|
||||
}
|
||||
|
||||
let pageserver_connstring = Self::build_pageserver_connstr(&args.pageservers);
|
||||
let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
|
||||
assert!(!pageserver_connstring.is_empty());
|
||||
|
||||
let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?;
|
||||
let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
|
||||
|
||||
// check for file remote_extensions_spec.json
|
||||
// if it is present, read it and pass to compute_ctl
|
||||
@@ -737,7 +735,7 @@ impl Endpoint {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
state: None,
|
||||
roles: if args.create_test_user {
|
||||
roles: if create_test_user {
|
||||
vec![Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
@@ -746,7 +744,7 @@ impl Endpoint {
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
databases: if args.create_test_user {
|
||||
databases: if create_test_user {
|
||||
vec![Database {
|
||||
name: PgIdent::from_str("neondb").unwrap(),
|
||||
owner: PgIdent::from_str("test").unwrap(),
|
||||
@@ -768,21 +766,20 @@ impl Endpoint {
|
||||
endpoint_id: Some(self.endpoint_id.clone()),
|
||||
mode: self.mode,
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()),
|
||||
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: args.auth_token.clone(),
|
||||
storage_auth_token: auth_token.clone(),
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(args.shard_stripe_size),
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
local_proxy_config: None,
|
||||
reconfigure_concurrency: self.reconfigure_concurrency,
|
||||
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
|
||||
audit_log_level: ComputeAudit::Disabled,
|
||||
logs_export_host: None::<String>,
|
||||
endpoint_storage_addr: Some(args.endpoint_storage_addr),
|
||||
endpoint_storage_token: Some(args.endpoint_storage_token),
|
||||
autoprewarm: args.autoprewarm,
|
||||
offload_lfc_interval_seconds: args.offload_lfc_interval_seconds,
|
||||
endpoint_storage_addr: Some(endpoint_storage_addr),
|
||||
endpoint_storage_token: Some(endpoint_storage_token),
|
||||
autoprewarm: false,
|
||||
suspend_timeout_seconds: -1, // Only used in neon_local.
|
||||
};
|
||||
|
||||
@@ -794,7 +791,7 @@ impl Endpoint {
|
||||
debug!("spec.cluster {:?}", spec.cluster);
|
||||
|
||||
// fill missing fields again
|
||||
if args.create_test_user {
|
||||
if create_test_user {
|
||||
spec.cluster.roles.push(Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
@@ -829,7 +826,7 @@ impl Endpoint {
|
||||
// Launch compute_ctl
|
||||
let conn_str = self.connstr("cloud_admin", "postgres");
|
||||
println!("Starting postgres node at '{conn_str}'");
|
||||
if args.create_test_user {
|
||||
if create_test_user {
|
||||
let conn_str = self.connstr("test", "neondb");
|
||||
println!("Also at '{conn_str}'");
|
||||
}
|
||||
@@ -861,11 +858,11 @@ impl Endpoint {
|
||||
.stderr(logfile.try_clone()?)
|
||||
.stdout(logfile);
|
||||
|
||||
if let Some(remote_ext_base_url) = args.remote_ext_base_url {
|
||||
cmd.args(["--remote-ext-base-url", &remote_ext_base_url]);
|
||||
if let Some(remote_ext_base_url) = remote_ext_base_url {
|
||||
cmd.args(["--remote-ext-base-url", remote_ext_base_url]);
|
||||
}
|
||||
|
||||
if args.dev {
|
||||
if dev {
|
||||
cmd.arg("--dev");
|
||||
}
|
||||
|
||||
@@ -897,11 +894,10 @@ impl Endpoint {
|
||||
Ok(state) => {
|
||||
match state.status {
|
||||
ComputeStatus::Init => {
|
||||
let timeout = args.start_timeout;
|
||||
if Instant::now().duration_since(start_at) > timeout {
|
||||
if Instant::now().duration_since(start_at) > start_timeout {
|
||||
bail!(
|
||||
"compute startup timed out {:?}; still in Init state",
|
||||
timeout
|
||||
start_timeout
|
||||
);
|
||||
}
|
||||
// keep retrying
|
||||
@@ -929,10 +925,9 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
if Instant::now().duration_since(start_at) > args.start_timeout {
|
||||
if Instant::now().duration_since(start_at) > start_timeout {
|
||||
return Err(e).context(format!(
|
||||
"timed out {:?} waiting to connect to compute_ctl HTTP",
|
||||
args.start_timeout
|
||||
"timed out {start_timeout:?} waiting to connect to compute_ctl HTTP",
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -65,27 +65,12 @@ enum Command {
|
||||
#[arg(long)]
|
||||
scheduling: Option<NodeSchedulingPolicy>,
|
||||
},
|
||||
/// Exists for backup usage and will be removed in future.
|
||||
/// Use [`Command::NodeStartDelete`] instead, if possible.
|
||||
// Set a node status as deleted.
|
||||
NodeDelete {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Start deletion of the specified pageserver.
|
||||
NodeStartDelete {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Cancel deletion of the specified pageserver and wait for `timeout`
|
||||
/// for the operation to be canceled. May be retried.
|
||||
NodeCancelDelete {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
#[arg(long)]
|
||||
timeout: humantime::Duration,
|
||||
},
|
||||
/// Delete a tombstone of node from the storage controller.
|
||||
/// This is used when we want to allow the node to be re-registered.
|
||||
NodeDeleteTombstone {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
@@ -927,43 +912,10 @@ async fn main() -> anyhow::Result<()> {
|
||||
.await?;
|
||||
}
|
||||
Command::NodeDelete { node_id } => {
|
||||
eprintln!("Warning: This command is obsolete and will be removed in a future version");
|
||||
eprintln!("Use `NodeStartDelete` instead, if possible");
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
|
||||
.await?;
|
||||
}
|
||||
Command::NodeStartDelete { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::PUT,
|
||||
format!("control/v1/node/{node_id}/delete"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
println!("Delete started for {node_id}");
|
||||
}
|
||||
Command::NodeCancelDelete { node_id, timeout } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::DELETE,
|
||||
format!("control/v1/node/{node_id}/delete"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
|
||||
|
||||
let final_policy =
|
||||
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
|
||||
!matches!(sched, NodeSchedulingPolicy::Deleting)
|
||||
})
|
||||
.await?;
|
||||
|
||||
println!(
|
||||
"Delete was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
|
||||
);
|
||||
}
|
||||
Command::NodeDeleteTombstone { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
|
||||
@@ -20,7 +20,7 @@ In our case consensus leader is compute (walproposer), and we don't want to wake
|
||||
up all computes for the change. Neither we want to fully reimplement the leader
|
||||
logic second time outside compute. Because of that the proposed algorithm relies
|
||||
for issuing configurations on the external fault tolerant (distributed) strongly
|
||||
consistent storage with simple API: CAS (compare-and-swap) on the single key.
|
||||
consisent storage with simple API: CAS (compare-and-swap) on the single key.
|
||||
Properly configured postgres suits this.
|
||||
|
||||
In the system consensus is implemented at the timeline level, so algorithm below
|
||||
@@ -34,7 +34,7 @@ A configuration is
|
||||
|
||||
```
|
||||
struct Configuration {
|
||||
generation: SafekeeperGeneration, // a number uniquely identifying configuration
|
||||
generation: Generation, // a number uniquely identifying configuration
|
||||
sk_set: Vec<NodeId>, // current safekeeper set
|
||||
new_sk_set: Optional<Vec<NodeId>>,
|
||||
}
|
||||
@@ -81,11 +81,11 @@ configuration generation in them is less than its current one. Namely, it
|
||||
refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
|
||||
response it sends its current configuration generation to let walproposer know.
|
||||
|
||||
Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/membership`
|
||||
accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its
|
||||
Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
|
||||
accepting `Configuration`. Safekeeper switches to the given conf it is higher than its
|
||||
current one and ignores it otherwise. In any case it replies with
|
||||
```
|
||||
struct TimelineMembershipSwitchResponse {
|
||||
struct ConfigurationSwitchResponse {
|
||||
conf: Configuration,
|
||||
term: Term,
|
||||
last_log_term: Term,
|
||||
@@ -108,7 +108,7 @@ establishes this configuration as its own and moves to voting.
|
||||
It should stop talking to safekeepers not listed in the configuration at this
|
||||
point, though it is not unsafe to continue doing so.
|
||||
|
||||
To be elected it must receive votes from both majorities if `new_sk_set` is present.
|
||||
To be elected it must receive votes from both majorites if `new_sk_set` is present.
|
||||
Similarly, to commit WAL it must receive flush acknowledge from both majorities.
|
||||
|
||||
If walproposer hears from safekeeper configuration higher than his own (i.e.
|
||||
@@ -130,7 +130,7 @@ storage are reachable.
|
||||
1) Fetch current timeline configuration from the configuration storage.
|
||||
2) If it is already joint one and `new_set` is different from `desired_set`
|
||||
refuse to change. However, assign join conf to (in memory) var
|
||||
`joint_conf` and proceed to step 4 to finish the ongoing change.
|
||||
`join_conf` and proceed to step 4 to finish the ongoing change.
|
||||
3) Else, create joint `joint_conf: Configuration`: increment current conf number
|
||||
`n` and put `desired_set` to `new_sk_set`. Persist it in the configuration
|
||||
storage by doing CAS on the current generation: change happens only if
|
||||
@@ -161,11 +161,11 @@ storage are reachable.
|
||||
because `pull_timeline` already includes it and plus additionally would be
|
||||
broadcast by compute. More importantly, we may proceed to the next step
|
||||
only when `<last_log_term, flush_lsn>` on the majority of the new set reached
|
||||
`sync_position`. Similarly, on the happy path no waiting is needed because
|
||||
`sync_position`. Similarly, on the happy path no waiting is not needed because
|
||||
`pull_timeline` already includes it. However, we should double
|
||||
check to be safe. For example, timeline could have been created earlier e.g.
|
||||
manually or after try-to-migrate, abort, try-to-migrate-again sequence.
|
||||
7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
|
||||
7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new
|
||||
safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
|
||||
storage under one more CAS.
|
||||
8) Call `PUT` `configuration` on safekeepers from the new set,
|
||||
@@ -178,12 +178,12 @@ spec of it.
|
||||
|
||||
Description above focuses on safety. To make the flow practical and live, here a few more
|
||||
considerations.
|
||||
1) It makes sense to ping new set to ensure we are migrating to live node(s) before
|
||||
1) It makes sense to ping new set to ensure it we are migrating to live node(s) before
|
||||
step 3.
|
||||
2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed
|
||||
it is safe to rollback to the old conf with one more CAS.
|
||||
3) On step 4 timeline might be already created on members of the new set for various reasons;
|
||||
the simplest is the procedure restart. There are more complicated scenarios like mentioned
|
||||
the simplest is the procedure restart. There are more complicated scenarious like mentioned
|
||||
in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
|
||||
generations, so seems simpler to treat existing timeline as success. However, this also
|
||||
has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in
|
||||
@@ -192,7 +192,7 @@ considerations.
|
||||
4) In the end timeline should be locally deleted on the safekeeper(s) which are
|
||||
in the old set but not in the new one, unless they are unreachable. To be
|
||||
safe this also should be done under generation number (deletion proceeds only if
|
||||
current configuration is <= than one in request and safekeeper is not member of it).
|
||||
current configuration is <= than one in request and safekeeper is not memeber of it).
|
||||
5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
|
||||
jump to step 7, using it as `new_conf`.
|
||||
|
||||
@@ -261,14 +261,14 @@ Timeline (branch) creation in cplane should call storage_controller POST
|
||||
Response should be augmented with `safekeepers_generation` and `safekeepers`
|
||||
fields like described in `/notify-safekeepers` above. Initially (currently)
|
||||
these fields may be absent; in this case cplane chooses safekeepers on its own
|
||||
like it currently does. The call should be retried until it succeeds.
|
||||
like it currently does. The call should be retried until succeeds.
|
||||
|
||||
Timeline deletion and tenant deletion in cplane should call appropriate
|
||||
storage_controller endpoints like it currently does for sharded tenants. The
|
||||
calls should be retried until they succeed.
|
||||
|
||||
When compute receives safekeeper list from control plane it needs to know the
|
||||
generation to check whether it should be updated (note that compute may get
|
||||
When compute receives safekeepers list from control plane it needs to know the
|
||||
generation to checked whether it should be updated (note that compute may get
|
||||
safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers`
|
||||
GUC is just a comma separates list of `host:port`. Let's prefix it with
|
||||
`g#<generation>:` to this end, so it will look like
|
||||
@@ -305,8 +305,8 @@ enum MigrationRequest {
|
||||
```
|
||||
|
||||
`FinishPending` requests to run the procedure to ensure state is clean: current
|
||||
configuration is not joint and the majority of safekeepers are aware of it, but do
|
||||
not attempt to migrate anywhere. If the current configuration fetched on step 1 is
|
||||
configuration is not joint and majority of safekeepers are aware of it, but do
|
||||
not attempt to migrate anywhere. If current configuration fetched on step 1 is
|
||||
not joint it jumps to step 7. It should be run at startup for all timelines (but
|
||||
similarly, in the first version it is ok to trigger it manually).
|
||||
|
||||
@@ -315,7 +315,7 @@ similarly, in the first version it is ok to trigger it manually).
|
||||
`safekeepers` table mirroring current `nodes` should be added, except that for
|
||||
`scheduling_policy`: it is enough to have at least in the beginning only 3
|
||||
fields: 1) `active` 2) `paused` (initially means only not assign new tlis there
|
||||
3) `decommissioned` (node is removed).
|
||||
3) `decomissioned` (node is removed).
|
||||
|
||||
`timelines` table:
|
||||
```
|
||||
@@ -326,10 +326,9 @@ table! {
|
||||
tenant_id -> Varchar,
|
||||
start_lsn -> pg_lsn,
|
||||
generation -> Int4,
|
||||
sk_set -> Array<Int8>, // list of safekeeper ids
|
||||
sk_set -> Array<Int4>, // list of safekeeper ids
|
||||
new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf
|
||||
cplane_notified_generation -> Int4,
|
||||
sk_set_notified_generation -> Int4, // the generation a quorum of sk_set knows about
|
||||
deleted_at -> Nullable<Timestamptz>,
|
||||
}
|
||||
}
|
||||
@@ -339,23 +338,13 @@ table! {
|
||||
might also want to add ancestor_timeline_id to preserve the hierarchy, but for
|
||||
this RFC it is not needed.
|
||||
|
||||
`cplane_notified_generation` and `sk_set_notified_generation` fields are used to
|
||||
track the last stage of the algorithm, when we need to notify safekeeper set and cplane
|
||||
with the final configuration after it's already committed to DB.
|
||||
|
||||
The timeline is up-to-date (no migration in progress) if `new_sk_set` is null and
|
||||
`*_notified_generation` fields are up to date with `generation`.
|
||||
|
||||
It's possible to replace `*_notified_generation` with one boolean field `migration_completed`,
|
||||
but for better observability it's nice to have them separately.
|
||||
|
||||
#### API
|
||||
|
||||
Node management is similar to pageserver:
|
||||
1) POST `/control/v1/safekeeper` inserts safekeeper.
|
||||
2) GET `/control/v1/safekeeper` lists safekeepers.
|
||||
3) GET `/control/v1/safekeeper/:node_id` gets safekeeper.
|
||||
4) PUT `/control/v1/safekeper/:node_id/scheduling_policy` changes status to e.g.
|
||||
1) POST `/control/v1/safekeepers` inserts safekeeper.
|
||||
2) GET `/control/v1/safekeepers` lists safekeepers.
|
||||
3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
|
||||
4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g.
|
||||
`offline` or `decomissioned`. Initially it is simpler not to schedule any
|
||||
migrations here.
|
||||
|
||||
@@ -379,8 +368,8 @@ Migration API: the first version is the simplest and the most imperative:
|
||||
all timelines from one safekeeper to another. It accepts json
|
||||
```
|
||||
{
|
||||
"src_sk": NodeId,
|
||||
"dst_sk": NodeId,
|
||||
"src_sk": u32,
|
||||
"dst_sk": u32,
|
||||
"limit": Optional<u32>,
|
||||
}
|
||||
```
|
||||
@@ -390,15 +379,12 @@ Returns list of scheduled requests.
|
||||
2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest`
|
||||
to move single timeline to given set of safekeepers:
|
||||
```
|
||||
struct TimelineSafekeeperMigrateRequest {
|
||||
"new_sk_set": Vec<NodeId>,
|
||||
{
|
||||
"desired_set": Vec<u32>,
|
||||
}
|
||||
```
|
||||
|
||||
In the first version the handler migrates the timeline to `new_sk_set` synchronously.
|
||||
Should be retried until success.
|
||||
|
||||
In the future we might change it to asynchronous API and return scheduled request.
|
||||
Returns scheduled request.
|
||||
|
||||
Similar call should be added for the tenant.
|
||||
|
||||
@@ -448,9 +434,6 @@ table! {
|
||||
}
|
||||
```
|
||||
|
||||
We load all pending ops from the table on startup into the memory.
|
||||
The table is needed only to preserve the state between restarts.
|
||||
|
||||
`op_type` can be `include` (seed from peers and ensure generation is up to
|
||||
date), `exclude` (remove locally) and `delete`. Field is actually not strictly
|
||||
needed as it can be computed from current configuration, but gives more explicit
|
||||
@@ -491,7 +474,7 @@ actions must be idempotent. Now, a tricky point here is timeline start LSN. For
|
||||
the initial (tenant creation) call cplane doesn't know it. However, setting
|
||||
start_lsn on safekeepers during creation is a good thing -- it provides a
|
||||
guarantee that walproposer can always find a common point in WAL histories of
|
||||
safekeeper and its own, and so absence of it would be a clear sign of
|
||||
safekeeper and its own, and so absense of it would be a clear sign of
|
||||
corruption. The following sequence works:
|
||||
1) Create timeline (or observe that it exists) on pageserver,
|
||||
figuring out last_record_lsn in response.
|
||||
@@ -514,9 +497,11 @@ corruption. The following sequence works:
|
||||
retries the call until 200 response.
|
||||
|
||||
There is a small question how request handler (timeline creation in this
|
||||
case) would interact with per sk reconciler. In the current implementation
|
||||
we first persist the request in the DB, and then send an in-memory request
|
||||
to each safekeeper reconciler to process it.
|
||||
case) would interact with per sk reconciler. As always I prefer to do the
|
||||
simplest possible thing and here it seems to be just waking it up so it
|
||||
re-reads the db for work to do. Passing work in memory is faster, but
|
||||
that shouldn't matter, and path to scan db for work will exist anyway,
|
||||
simpler to reuse it.
|
||||
|
||||
For pg version / wal segment size: while we may persist them in `timelines`
|
||||
table, it is not necessary as initial creation at step 3 can take them from
|
||||
@@ -524,40 +509,30 @@ pageserver or cplane creation call and later pull_timeline will carry them
|
||||
around.
|
||||
|
||||
Timeline migration.
|
||||
1) CAS to the db to create joint conf. Since this moment the migration is considered to be
|
||||
"in progress". We can detect all "in-progress" migrations looking into the database.
|
||||
2) Do steps 4-6 from the algorithm, including `pull_timeline` onto `new_sk_set`, update membership
|
||||
configuration on all safekeepers, notify cplane, etc. All operations are idempotent,
|
||||
so we don't need to persist anything in the database at this stage. If any errors occur,
|
||||
it's safe to retry or abort the migration.
|
||||
3) Once it becomes possible per alg description above, get out of joint conf
|
||||
with another CAS. Also should insert `exclude` entries into `safekeeper_timeline_pending_ops`
|
||||
in the same DB transaction. Adding `exclude` entries atomically is nesessary because after
|
||||
CAS we don't have the list of excluded safekeepers in the `timelines` table anymore, but we
|
||||
need to have them persisted somewhere in case the migration is interrupted right after the CAS.
|
||||
4) Finish the migration. The final membership configuration is committed to the DB at this stage.
|
||||
So, the migration can not be aborted anymore. But it can still be retried if the migration fails
|
||||
past stage 3. To finish the migration we need to send the new membership configuration to
|
||||
a new quorum of safekeepers, notify cplane with the new safekeeper list and schedule the `exclude`
|
||||
requests to in-memory queue for safekeeper reconciler. If the algrorithm is retried, it's
|
||||
possible that we have already committed `exclude` requests to DB, but didn't send them to
|
||||
the in-memory queue. In this case we need to read them from `safekeeper_timeline_pending_ops`
|
||||
because it's the only place where they are persistent. The fields `sk_set_notified_generation`
|
||||
and `cplane_notified_generation` are updated after each step. The migration is considered
|
||||
fully completed when they match the `generation` field.
|
||||
|
||||
In practice, we can report "success" after stage 3 and do the "finish" step in per-timeline
|
||||
reconciler (if we implement it). But it's wise to at least try to finish them synchronously,
|
||||
so the timeline is always in a "good state" and doesn't require an old quorum to commit
|
||||
WAL after the migration reported "success".
|
||||
1) CAS to the db to create joint conf, and in the same transaction create
|
||||
`safekeeper_timeline_pending_ops` `include` entries to initialize new members
|
||||
as well as deliver this conf to current ones; poke per sk reconcilers to work
|
||||
on it. Also any conf change should also poke cplane notifier task(s).
|
||||
2) Once it becomes possible per alg description above, get out of joint conf
|
||||
with another CAS. Task should get wakeups from per sk reconcilers because
|
||||
conf switch is required for advancement; however retries should be sleep
|
||||
based as well as LSN advancement might be needed, though in happy path
|
||||
it isn't. To see whether further transition is possible on wakup migration
|
||||
executor polls safekeepers per the algorithm. CAS creating new conf with only
|
||||
new members should again insert entries to `safekeeper_timeline_pending_ops`
|
||||
to switch them there, as well as `exclude` rows to remove timeline from
|
||||
old members.
|
||||
|
||||
Timeline deletion: just set `deleted_at` on the timeline row and insert
|
||||
`safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by
|
||||
per sk reconcilers.
|
||||
|
||||
When node is removed (set to `decommissioned`), `safekeeper_timeline_pending_ops`
|
||||
When node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops`
|
||||
for it must be cleared in the same transaction.
|
||||
|
||||
One more task pool should infinitely retry notifying control plane about changed
|
||||
safekeeper sets (trying making `cplane_notified_generation` equal `generation`).
|
||||
|
||||
#### Dealing with multiple instances of storage_controller
|
||||
|
||||
Operations described above executed concurrently might create some errors but do
|
||||
@@ -566,7 +541,7 @@ of storage_controller it is fine to have it temporarily, e.g. during redeploy.
|
||||
|
||||
To harden against some controller instance creating some work in
|
||||
`safekeeper_timeline_pending_ops` and then disappearing without anyone pickup up
|
||||
the job per sk reconcilers apart from explicit wakeups should scan for work
|
||||
the job per sk reconcilers apart from explicit wakups should scan for work
|
||||
periodically. It is possible to remove that though if all db updates are
|
||||
protected with leadership token/term -- then such scans are needed only after
|
||||
leadership is acquired.
|
||||
@@ -588,7 +563,7 @@ There should be following layers of tests:
|
||||
safekeeper communication and pull_timeline need to be mocked and main switch
|
||||
procedure wrapped to as a node (thread) in simulation tests, using these
|
||||
mocks. Test would inject migrations like it currently injects
|
||||
safekeeper/walproposer restarts. Main assert is the same -- committed WAL must
|
||||
safekeeper/walproposer restars. Main assert is the same -- committed WAL must
|
||||
not be lost.
|
||||
|
||||
3) Since simulation testing injects at relatively high level points (not
|
||||
@@ -638,7 +613,7 @@ Let's have the following implementation bits for gradual rollout:
|
||||
`notify-safekeepers`.
|
||||
|
||||
Then the rollout for a region would be:
|
||||
- Current situation: safekeepers are chosen by control_plane.
|
||||
- Current situation: safekeepers are choosen by control_plane.
|
||||
- We manually migrate some timelines, test moving them around.
|
||||
- Then we enable `--set-safekeepers` so that all new timelines
|
||||
are on storage controller.
|
||||
|
||||
@@ -58,7 +58,7 @@ pub enum LfcPrewarmState {
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
|
||||
#[derive(Serialize, Default, Debug, Clone)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
pub enum LfcOffloadState {
|
||||
#[default]
|
||||
|
||||
@@ -181,14 +181,10 @@ pub struct ComputeSpec {
|
||||
/// JWT for authorizing requests to endpoint storage service
|
||||
pub endpoint_storage_token: Option<String>,
|
||||
|
||||
/// Download LFC state from endpoint_storage and pass it to Postgres on startup
|
||||
#[serde(default)]
|
||||
/// Download LFC state from endpoint storage and pass it to Postgres on compute startup
|
||||
pub autoprewarm: bool,
|
||||
|
||||
#[serde(default)]
|
||||
/// Upload LFC state to endpoint storage periodically. Default value (None) means "don't upload"
|
||||
pub offload_lfc_interval_seconds: Option<std::num::NonZeroU64>,
|
||||
|
||||
/// Suspend timeout in seconds.
|
||||
///
|
||||
/// We use this value to derive other values, such as the installed extensions metric.
|
||||
|
||||
@@ -90,11 +90,6 @@
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "offload_lfc_interval_seconds",
|
||||
"value": "20",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "neon.safekeepers",
|
||||
"value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501",
|
||||
|
||||
@@ -386,7 +386,6 @@ pub enum NodeSchedulingPolicy {
|
||||
Pause,
|
||||
PauseForRestart,
|
||||
Draining,
|
||||
Deleting,
|
||||
}
|
||||
|
||||
impl FromStr for NodeSchedulingPolicy {
|
||||
@@ -399,7 +398,6 @@ impl FromStr for NodeSchedulingPolicy {
|
||||
"pause" => Ok(Self::Pause),
|
||||
"pause_for_restart" => Ok(Self::PauseForRestart),
|
||||
"draining" => Ok(Self::Draining),
|
||||
"deleting" => Ok(Self::Deleting),
|
||||
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
|
||||
}
|
||||
}
|
||||
@@ -414,7 +412,6 @@ impl From<NodeSchedulingPolicy> for String {
|
||||
Pause => "pause",
|
||||
PauseForRestart => "pause_for_restart",
|
||||
Draining => "draining",
|
||||
Deleting => "deleting",
|
||||
}
|
||||
.to_string()
|
||||
}
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
[package]
|
||||
name = "json"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
ryu = "1"
|
||||
itoa = "1"
|
||||
|
||||
[dev-dependencies]
|
||||
futures = "0.3"
|
||||
@@ -1,412 +0,0 @@
|
||||
//! A JSON serialization lib, designed for more flexibility than `serde_json` offers.
|
||||
//!
|
||||
//! Features:
|
||||
//!
|
||||
//! ## Dynamic construction
|
||||
//!
|
||||
//! Sometimes you have dynamic values you want to serialize, that are not already in a serde-aware model like a struct or a Vec etc.
|
||||
//! To achieve this with serde, you need to implement a lot of different traits on a lot of different new-types.
|
||||
//! Because of this, it's often easier to give-in and pull all the data into a serde-aware model (`serde_json::Value` or some intermediate struct),
|
||||
//! but that is often not very efficient.
|
||||
//!
|
||||
//! This crate allows full control over the JSON encoding without needing to implement any extra traits. Just call the
|
||||
//! relevant functions, and it will guarantee a correctly encoded JSON value.
|
||||
//!
|
||||
//! ## Async construction
|
||||
//!
|
||||
//! Similar to the above, sometimes the values arrive asynchronously. Often collecting those values in memory
|
||||
//! is more expensive than writing them as JSON, since the overheads of `Vec` and `String` is much higher, however
|
||||
//! there are exceptions.
|
||||
//!
|
||||
//! Serializing to JSON all in one go is also more CPU intensive and can cause lag spikes,
|
||||
//! whereas serializing values incrementally spreads out the CPU load and reduces lag.
|
||||
//!
|
||||
//! ## Examples
|
||||
//!
|
||||
//! To represent the following JSON as a compact string
|
||||
//!
|
||||
//! ```json
|
||||
//! {
|
||||
//! "results": {
|
||||
//! "rows": [
|
||||
//! {
|
||||
//! "id": 1,
|
||||
//! "value": null
|
||||
//! },
|
||||
//! {
|
||||
//! "id": 2,
|
||||
//! "value": "hello"
|
||||
//! }
|
||||
//! ]
|
||||
//! }
|
||||
//! }
|
||||
//! ```
|
||||
//!
|
||||
//! We can use the following code:
|
||||
//!
|
||||
//! ```
|
||||
//! // create the outer object
|
||||
//! let s = json::value_to_string!(|v| json::value_as_object!(|v| {
|
||||
//! // create an entry with key "results" and start an object value associated with it.
|
||||
//! let results = v.key("results");
|
||||
//! json::value_as_object!(|results| {
|
||||
//! // create an entry with key "rows" and start an list value associated with it.
|
||||
//! let rows = results.key("rows");
|
||||
//! json::value_as_list!(|rows| {
|
||||
//! // create a list entry and start an object value associated with it.
|
||||
//! let row = rows.entry();
|
||||
//! json::value_as_object!(|row| {
|
||||
//! // add entry "id": 1
|
||||
//! row.entry("id", 1);
|
||||
//! // add entry "value": null
|
||||
//! row.entry("value", json::Null);
|
||||
//! });
|
||||
//!
|
||||
//! // create a list entry and start an object value associated with it.
|
||||
//! let row = rows.entry();
|
||||
//! json::value_as_object!(|row| {
|
||||
//! // add entry "id": 2
|
||||
//! row.entry("id", 2);
|
||||
//! // add entry "value": "hello"
|
||||
//! row.entry("value", "hello");
|
||||
//! });
|
||||
//! });
|
||||
//! });
|
||||
//! }));
|
||||
//!
|
||||
//! assert_eq!(s, r#"{"results":{"rows":[{"id":1,"value":null},{"id":2,"value":"hello"}]}}"#);
|
||||
//! ```
|
||||
|
||||
mod macros;
|
||||
mod str;
|
||||
mod value;
|
||||
|
||||
pub use value::{Null, ValueEncoder};
|
||||
|
||||
#[must_use]
|
||||
/// Serialize a single json value.
|
||||
pub struct ValueSer<'buf> {
|
||||
buf: &'buf mut Vec<u8>,
|
||||
start: usize,
|
||||
}
|
||||
|
||||
impl<'buf> ValueSer<'buf> {
|
||||
/// Create a new json value serializer.
|
||||
pub fn new(buf: &'buf mut Vec<u8>) -> Self {
|
||||
Self { buf, start: 0 }
|
||||
}
|
||||
|
||||
/// Borrow the underlying buffer
|
||||
pub fn as_buffer(&self) -> &[u8] {
|
||||
self.buf
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn value(self, e: impl ValueEncoder) {
|
||||
e.encode(self);
|
||||
}
|
||||
|
||||
/// Write raw bytes to the buf. This must be already JSON encoded.
|
||||
#[inline]
|
||||
pub fn write_raw_json(self, data: &[u8]) {
|
||||
self.buf.extend_from_slice(data);
|
||||
self.finish();
|
||||
}
|
||||
|
||||
/// Start a new object serializer.
|
||||
#[inline]
|
||||
pub fn object(self) -> ObjectSer<'buf> {
|
||||
ObjectSer::new(self)
|
||||
}
|
||||
|
||||
/// Start a new list serializer.
|
||||
#[inline]
|
||||
pub fn list(self) -> ListSer<'buf> {
|
||||
ListSer::new(self)
|
||||
}
|
||||
|
||||
/// Finish the value ser.
|
||||
#[inline]
|
||||
fn finish(self) {
|
||||
// don't trigger the drop handler which triggers a rollback.
|
||||
// this won't cause memory leaks because `ValueSet` owns no allocations.
|
||||
std::mem::forget(self);
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ValueSer<'_> {
|
||||
fn drop(&mut self) {
|
||||
self.buf.truncate(self.start);
|
||||
}
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
/// Serialize a json object.
|
||||
pub struct ObjectSer<'buf> {
|
||||
value: ValueSer<'buf>,
|
||||
start: usize,
|
||||
}
|
||||
|
||||
impl<'buf> ObjectSer<'buf> {
|
||||
/// Start a new object serializer.
|
||||
#[inline]
|
||||
pub fn new(value: ValueSer<'buf>) -> Self {
|
||||
value.buf.push(b'{');
|
||||
let start = value.buf.len();
|
||||
Self { value, start }
|
||||
}
|
||||
|
||||
/// Borrow the underlying buffer
|
||||
pub fn as_buffer(&self) -> &[u8] {
|
||||
self.value.as_buffer()
|
||||
}
|
||||
|
||||
/// Start a new object entry with the given string key, returning a [`ValueSer`] for the associated value.
|
||||
#[inline]
|
||||
pub fn key(&mut self, key: impl KeyEncoder) -> ValueSer<'_> {
|
||||
key.write_key(self)
|
||||
}
|
||||
|
||||
/// Write an entry (key-value pair) to the object.
|
||||
#[inline]
|
||||
pub fn entry(&mut self, key: impl KeyEncoder, val: impl ValueEncoder) {
|
||||
self.key(key).value(val);
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn entry_inner(&mut self, f: impl FnOnce(&mut Vec<u8>)) -> ValueSer<'_> {
|
||||
// track before the separator so we the value is rolled back it also removes the separator.
|
||||
let start = self.value.buf.len();
|
||||
|
||||
// push separator if necessary
|
||||
if self.value.buf.len() > self.start {
|
||||
self.value.buf.push(b',');
|
||||
}
|
||||
// push key
|
||||
f(self.value.buf);
|
||||
// push value separator
|
||||
self.value.buf.push(b':');
|
||||
|
||||
// return value writer.
|
||||
ValueSer {
|
||||
buf: self.value.buf,
|
||||
start,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reset the buffer back to before this object was started.
|
||||
#[inline]
|
||||
pub fn rollback(self) -> ValueSer<'buf> {
|
||||
// Do not fully reset the value, only reset it to before the `{`.
|
||||
// This ensures any `,` before this value are not clobbered.
|
||||
self.value.buf.truncate(self.start - 1);
|
||||
self.value
|
||||
}
|
||||
|
||||
/// Finish the object ser.
|
||||
#[inline]
|
||||
pub fn finish(self) {
|
||||
self.value.buf.push(b'}');
|
||||
self.value.finish();
|
||||
}
|
||||
}
|
||||
|
||||
pub trait KeyEncoder {
|
||||
fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a>;
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
/// Serialize a json object.
|
||||
pub struct ListSer<'buf> {
|
||||
value: ValueSer<'buf>,
|
||||
start: usize,
|
||||
}
|
||||
|
||||
impl<'buf> ListSer<'buf> {
|
||||
/// Start a new list serializer.
|
||||
#[inline]
|
||||
pub fn new(value: ValueSer<'buf>) -> Self {
|
||||
value.buf.push(b'[');
|
||||
let start = value.buf.len();
|
||||
Self { value, start }
|
||||
}
|
||||
|
||||
/// Borrow the underlying buffer
|
||||
pub fn as_buffer(&self) -> &[u8] {
|
||||
self.value.as_buffer()
|
||||
}
|
||||
|
||||
/// Write an value to the list.
|
||||
#[inline]
|
||||
pub fn push(&mut self, val: impl ValueEncoder) {
|
||||
self.entry().value(val);
|
||||
}
|
||||
|
||||
/// Start a new value entry in this list.
|
||||
#[inline]
|
||||
pub fn entry(&mut self) -> ValueSer<'_> {
|
||||
// track before the separator so we the value is rolled back it also removes the separator.
|
||||
let start = self.value.buf.len();
|
||||
|
||||
// push separator if necessary
|
||||
if self.value.buf.len() > self.start {
|
||||
self.value.buf.push(b',');
|
||||
}
|
||||
|
||||
// return value writer.
|
||||
ValueSer {
|
||||
buf: self.value.buf,
|
||||
start,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reset the buffer back to before this object was started.
|
||||
#[inline]
|
||||
pub fn rollback(self) -> ValueSer<'buf> {
|
||||
// Do not fully reset the value, only reset it to before the `[`.
|
||||
// This ensures any `,` before this value are not clobbered.
|
||||
self.value.buf.truncate(self.start - 1);
|
||||
self.value
|
||||
}
|
||||
|
||||
/// Finish the object ser.
|
||||
#[inline]
|
||||
pub fn finish(self) {
|
||||
self.value.buf.push(b']');
|
||||
self.value.finish();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{Null, ValueSer};
|
||||
|
||||
#[test]
|
||||
fn object() {
|
||||
let mut buf = vec![];
|
||||
let mut object = ValueSer::new(&mut buf).object();
|
||||
object.entry("foo", "bar");
|
||||
object.entry("baz", Null);
|
||||
object.finish();
|
||||
|
||||
assert_eq!(buf, br#"{"foo":"bar","baz":null}"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn list() {
|
||||
let mut buf = vec![];
|
||||
let mut list = ValueSer::new(&mut buf).list();
|
||||
list.entry().value("bar");
|
||||
list.entry().value(Null);
|
||||
list.finish();
|
||||
|
||||
assert_eq!(buf, br#"["bar",null]"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn object_macro() {
|
||||
let res = crate::value_to_string!(|obj| {
|
||||
crate::value_as_object!(|obj| {
|
||||
obj.entry("foo", "bar");
|
||||
obj.entry("baz", Null);
|
||||
})
|
||||
});
|
||||
|
||||
assert_eq!(res, r#"{"foo":"bar","baz":null}"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn list_macro() {
|
||||
let res = crate::value_to_string!(|list| {
|
||||
crate::value_as_list!(|list| {
|
||||
list.entry().value("bar");
|
||||
list.entry().value(Null);
|
||||
})
|
||||
});
|
||||
|
||||
assert_eq!(res, r#"["bar",null]"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rollback_on_drop() {
|
||||
let res = crate::value_to_string!(|list| {
|
||||
crate::value_as_list!(|list| {
|
||||
list.entry().value("bar");
|
||||
|
||||
'cancel: {
|
||||
let nested_list = list.entry();
|
||||
crate::value_as_list!(|nested_list| {
|
||||
nested_list.entry().value(1);
|
||||
|
||||
assert_eq!(nested_list.as_buffer(), br#"["bar",[1"#);
|
||||
if true {
|
||||
break 'cancel;
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
assert_eq!(list.as_buffer(), br#"["bar""#);
|
||||
|
||||
list.entry().value(Null);
|
||||
})
|
||||
});
|
||||
|
||||
assert_eq!(res, r#"["bar",null]"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rollback_object() {
|
||||
let res = crate::value_to_string!(|obj| {
|
||||
crate::value_as_object!(|obj| {
|
||||
let entry = obj.key("1");
|
||||
entry.value(1_i32);
|
||||
|
||||
let entry = obj.key("2");
|
||||
let entry = {
|
||||
let mut nested_obj = entry.object();
|
||||
nested_obj.entry("foo", "bar");
|
||||
nested_obj.rollback()
|
||||
};
|
||||
|
||||
entry.value(2_i32);
|
||||
})
|
||||
});
|
||||
|
||||
assert_eq!(res, r#"{"1":1,"2":2}"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rollback_list() {
|
||||
let res = crate::value_to_string!(|list| {
|
||||
crate::value_as_list!(|list| {
|
||||
let entry = list.entry();
|
||||
entry.value(1_i32);
|
||||
|
||||
let entry = list.entry();
|
||||
let entry = {
|
||||
let mut nested_list = entry.list();
|
||||
nested_list.push("foo");
|
||||
nested_list.rollback()
|
||||
};
|
||||
|
||||
entry.value(2_i32);
|
||||
})
|
||||
});
|
||||
|
||||
assert_eq!(res, r#"[1,2]"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn string_escaping() {
|
||||
let mut buf = vec![];
|
||||
let mut object = ValueSer::new(&mut buf).object();
|
||||
|
||||
let key = "hello";
|
||||
let value = "\n world";
|
||||
|
||||
object.entry(format_args!("{key:?}"), value);
|
||||
object.finish();
|
||||
|
||||
assert_eq!(buf, br#"{"\"hello\"":"\n world"}"#);
|
||||
}
|
||||
}
|
||||
@@ -1,86 +0,0 @@
|
||||
//! # Examples
|
||||
//!
|
||||
//! ```
|
||||
//! use futures::{StreamExt, TryStream, TryStreamExt};
|
||||
//!
|
||||
//! async fn stream_to_json_list<S, T, E>(mut s: S) -> Result<String, E>
|
||||
//! where
|
||||
//! S: TryStream<Ok = T, Error = E> + Unpin,
|
||||
//! T: json::ValueEncoder
|
||||
//! {
|
||||
//! Ok(json::value_to_string!(|val| json::value_as_list!(|val| {
|
||||
//! // note how we can use `.await` and `?` in here.
|
||||
//! while let Some(value) = s.try_next().await? {
|
||||
//! val.push(value);
|
||||
//! }
|
||||
//! })))
|
||||
//! }
|
||||
//!
|
||||
//! let stream = futures::stream::iter([1, 2, 3]).map(Ok::<i32, ()>);
|
||||
//! let json_string = futures::executor::block_on(stream_to_json_list(stream)).unwrap();
|
||||
//! assert_eq!(json_string, "[1,2,3]");
|
||||
//! ```
|
||||
|
||||
/// A helper to create a new JSON vec.
|
||||
///
|
||||
/// Implemented as a macro to preserve all control flow.
|
||||
#[macro_export]
|
||||
macro_rules! value_to_vec {
|
||||
(|$val:ident| $body:expr) => {{
|
||||
let mut buf = vec![];
|
||||
let $val = $crate::ValueSer::new(&mut buf);
|
||||
let _: () = $body;
|
||||
buf
|
||||
}};
|
||||
}
|
||||
|
||||
/// A helper to create a new JSON string.
|
||||
///
|
||||
/// Implemented as a macro to preserve all control flow.
|
||||
#[macro_export]
|
||||
macro_rules! value_to_string {
|
||||
(|$val:ident| $body:expr) => {{
|
||||
::std::string::String::from_utf8($crate::value_to_vec!(|$val| $body))
|
||||
.expect("json should be valid utf8")
|
||||
}};
|
||||
}
|
||||
|
||||
/// A helper that ensures the [`ObjectSer::finish`](crate::ObjectSer::finish) method is called on completion.
|
||||
///
|
||||
/// Consumes `$val` and assigns it as an [`ObjectSer`](crate::ObjectSer) serializer.
|
||||
/// The serializer is only 'finished' if the body completes.
|
||||
/// The serializer is rolled back if `break`/`return` escapes the body.
|
||||
///
|
||||
/// Implemented as a macro to preserve all control flow.
|
||||
#[macro_export]
|
||||
macro_rules! value_as_object {
|
||||
(|$val:ident| $body:expr) => {{
|
||||
let mut obj = $crate::ObjectSer::new($val);
|
||||
|
||||
let $val = &mut obj;
|
||||
let res = $body;
|
||||
|
||||
obj.finish();
|
||||
res
|
||||
}};
|
||||
}
|
||||
|
||||
/// A helper that ensures the [`ListSer::finish`](crate::ListSer::finish) method is called on completion.
|
||||
///
|
||||
/// Consumes `$val` and assigns it as an [`ListSer`](crate::ListSer) serializer.
|
||||
/// The serializer is only 'finished' if the body completes.
|
||||
/// The serializer is rolled back if `break`/`return` escapes the body.
|
||||
///
|
||||
/// Implemented as a macro to preserve all control flow.
|
||||
#[macro_export]
|
||||
macro_rules! value_as_list {
|
||||
(|$val:ident| $body:expr) => {{
|
||||
let mut list = $crate::ListSer::new($val);
|
||||
|
||||
let $val = &mut list;
|
||||
let res = $body;
|
||||
|
||||
list.finish();
|
||||
res
|
||||
}};
|
||||
}
|
||||
@@ -1,166 +0,0 @@
|
||||
//! Helpers for serializing escaped strings.
|
||||
//!
|
||||
//! ## License
|
||||
//!
|
||||
//! <https://github.com/serde-rs/json/blob/c1826ebcccb1a520389c6b78ad3da15db279220d/src/ser.rs#L1514-L1552>
|
||||
//! <https://github.com/serde-rs/json/blob/c1826ebcccb1a520389c6b78ad3da15db279220d/src/ser.rs#L2081-L2157>
|
||||
//! Licensed by David Tolnay under MIT or Apache-2.0.
|
||||
//!
|
||||
//! With modifications by Conrad Ludgate on behalf of Databricks.
|
||||
|
||||
use std::fmt::{self, Write};
|
||||
|
||||
/// Represents a character escape code in a type-safe manner.
|
||||
pub enum CharEscape {
|
||||
/// An escaped quote `"`
|
||||
Quote,
|
||||
/// An escaped reverse solidus `\`
|
||||
ReverseSolidus,
|
||||
// /// An escaped solidus `/`
|
||||
// Solidus,
|
||||
/// An escaped backspace character (usually escaped as `\b`)
|
||||
Backspace,
|
||||
/// An escaped form feed character (usually escaped as `\f`)
|
||||
FormFeed,
|
||||
/// An escaped line feed character (usually escaped as `\n`)
|
||||
LineFeed,
|
||||
/// An escaped carriage return character (usually escaped as `\r`)
|
||||
CarriageReturn,
|
||||
/// An escaped tab character (usually escaped as `\t`)
|
||||
Tab,
|
||||
/// An escaped ASCII plane control character (usually escaped as
|
||||
/// `\u00XX` where `XX` are two hex characters)
|
||||
AsciiControl(u8),
|
||||
}
|
||||
|
||||
impl CharEscape {
|
||||
#[inline]
|
||||
fn from_escape_table(escape: u8, byte: u8) -> CharEscape {
|
||||
match escape {
|
||||
self::BB => CharEscape::Backspace,
|
||||
self::TT => CharEscape::Tab,
|
||||
self::NN => CharEscape::LineFeed,
|
||||
self::FF => CharEscape::FormFeed,
|
||||
self::RR => CharEscape::CarriageReturn,
|
||||
self::QU => CharEscape::Quote,
|
||||
self::BS => CharEscape::ReverseSolidus,
|
||||
self::UU => CharEscape::AsciiControl(byte),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn format_escaped_str(writer: &mut Vec<u8>, value: &str) {
|
||||
writer.reserve(2 + value.len());
|
||||
|
||||
writer.push(b'"');
|
||||
|
||||
let rest = format_escaped_str_contents(writer, value);
|
||||
writer.extend_from_slice(rest);
|
||||
|
||||
writer.push(b'"');
|
||||
}
|
||||
|
||||
pub(crate) fn format_escaped_fmt(writer: &mut Vec<u8>, args: fmt::Arguments) {
|
||||
writer.push(b'"');
|
||||
|
||||
Collect { buf: writer }
|
||||
.write_fmt(args)
|
||||
.expect("formatting should not error");
|
||||
|
||||
writer.push(b'"');
|
||||
}
|
||||
|
||||
struct Collect<'buf> {
|
||||
buf: &'buf mut Vec<u8>,
|
||||
}
|
||||
|
||||
impl fmt::Write for Collect<'_> {
|
||||
fn write_str(&mut self, s: &str) -> fmt::Result {
|
||||
let last = format_escaped_str_contents(self.buf, s);
|
||||
self.buf.extend(last);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// writes any escape sequences, and returns the suffix still needed to be written.
|
||||
fn format_escaped_str_contents<'a>(writer: &mut Vec<u8>, value: &'a str) -> &'a [u8] {
|
||||
let bytes = value.as_bytes();
|
||||
|
||||
let mut start = 0;
|
||||
|
||||
for (i, &byte) in bytes.iter().enumerate() {
|
||||
let escape = ESCAPE[byte as usize];
|
||||
if escape == 0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
writer.extend_from_slice(&bytes[start..i]);
|
||||
|
||||
let char_escape = CharEscape::from_escape_table(escape, byte);
|
||||
write_char_escape(writer, char_escape);
|
||||
|
||||
start = i + 1;
|
||||
}
|
||||
|
||||
&bytes[start..]
|
||||
}
|
||||
|
||||
const BB: u8 = b'b'; // \x08
|
||||
const TT: u8 = b't'; // \x09
|
||||
const NN: u8 = b'n'; // \x0A
|
||||
const FF: u8 = b'f'; // \x0C
|
||||
const RR: u8 = b'r'; // \x0D
|
||||
const QU: u8 = b'"'; // \x22
|
||||
const BS: u8 = b'\\'; // \x5C
|
||||
const UU: u8 = b'u'; // \x00...\x1F except the ones above
|
||||
const __: u8 = 0;
|
||||
|
||||
// Lookup table of escape sequences. A value of b'x' at index i means that byte
|
||||
// i is escaped as "\x" in JSON. A value of 0 means that byte i is not escaped.
|
||||
static ESCAPE: [u8; 256] = [
|
||||
// 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0
|
||||
UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1
|
||||
__, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
|
||||
];
|
||||
|
||||
fn write_char_escape(writer: &mut Vec<u8>, char_escape: CharEscape) {
|
||||
let s = match char_escape {
|
||||
CharEscape::Quote => b"\\\"",
|
||||
CharEscape::ReverseSolidus => b"\\\\",
|
||||
// CharEscape::Solidus => b"\\/",
|
||||
CharEscape::Backspace => b"\\b",
|
||||
CharEscape::FormFeed => b"\\f",
|
||||
CharEscape::LineFeed => b"\\n",
|
||||
CharEscape::CarriageReturn => b"\\r",
|
||||
CharEscape::Tab => b"\\t",
|
||||
CharEscape::AsciiControl(byte) => {
|
||||
static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef";
|
||||
let bytes = &[
|
||||
b'\\',
|
||||
b'u',
|
||||
b'0',
|
||||
b'0',
|
||||
HEX_DIGITS[(byte >> 4) as usize],
|
||||
HEX_DIGITS[(byte & 0xF) as usize],
|
||||
];
|
||||
return writer.extend_from_slice(bytes);
|
||||
}
|
||||
};
|
||||
|
||||
writer.extend_from_slice(s);
|
||||
}
|
||||
@@ -1,168 +0,0 @@
|
||||
use core::fmt;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
|
||||
use crate::str::{format_escaped_fmt, format_escaped_str};
|
||||
use crate::{KeyEncoder, ObjectSer, ValueSer, value_as_list, value_as_object};
|
||||
|
||||
/// Write a value to the underlying json representation.
|
||||
pub trait ValueEncoder {
|
||||
fn encode(self, v: ValueSer<'_>);
|
||||
}
|
||||
|
||||
pub(crate) fn write_int(x: impl itoa::Integer, b: &mut Vec<u8>) {
|
||||
b.extend_from_slice(itoa::Buffer::new().format(x).as_bytes());
|
||||
}
|
||||
|
||||
pub(crate) fn write_float(x: impl ryu::Float, b: &mut Vec<u8>) {
|
||||
b.extend_from_slice(ryu::Buffer::new().format(x).as_bytes());
|
||||
}
|
||||
|
||||
impl<T: Copy + ValueEncoder> ValueEncoder for &T {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
T::encode(*self, v);
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueEncoder for &str {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
format_escaped_str(v.buf, self);
|
||||
v.finish();
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueEncoder for fmt::Arguments<'_> {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
if let Some(s) = self.as_str() {
|
||||
format_escaped_str(v.buf, s);
|
||||
} else {
|
||||
format_escaped_fmt(v.buf, self);
|
||||
}
|
||||
v.finish();
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! int {
|
||||
[$($t:ty),*] => {
|
||||
$(
|
||||
impl ValueEncoder for $t {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
write_int(self, v.buf);
|
||||
v.finish();
|
||||
}
|
||||
}
|
||||
)*
|
||||
};
|
||||
}
|
||||
|
||||
int![u8, u16, u32, u64, usize, u128];
|
||||
int![i8, i16, i32, i64, isize, i128];
|
||||
|
||||
macro_rules! float {
|
||||
[$($t:ty),*] => {
|
||||
$(
|
||||
impl ValueEncoder for $t {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
write_float(self, v.buf);
|
||||
v.finish();
|
||||
}
|
||||
}
|
||||
)*
|
||||
};
|
||||
}
|
||||
|
||||
float![f32, f64];
|
||||
|
||||
impl ValueEncoder for bool {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
v.write_raw_json(if self { b"true" } else { b"false" });
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: ValueEncoder> ValueEncoder for Option<T> {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
match self {
|
||||
Some(value) => value.encode(v),
|
||||
None => Null.encode(v),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl KeyEncoder for &str {
|
||||
#[inline]
|
||||
fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> {
|
||||
let obj = &mut *obj;
|
||||
obj.entry_inner(|b| format_escaped_str(b, self))
|
||||
}
|
||||
}
|
||||
|
||||
impl KeyEncoder for fmt::Arguments<'_> {
|
||||
#[inline]
|
||||
fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> {
|
||||
if let Some(key) = self.as_str() {
|
||||
obj.entry_inner(|b| format_escaped_str(b, key))
|
||||
} else {
|
||||
obj.entry_inner(|b| format_escaped_fmt(b, self))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents the JSON null value.
|
||||
pub struct Null;
|
||||
|
||||
impl ValueEncoder for Null {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
v.write_raw_json(b"null");
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: ValueEncoder> ValueEncoder for Vec<T> {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
value_as_list!(|v| {
|
||||
for t in self {
|
||||
v.entry().value(t);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Copy + ValueEncoder> ValueEncoder for &[T] {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
value_as_list!(|v| {
|
||||
for t in self {
|
||||
v.entry().value(t);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: KeyEncoder, V: ValueEncoder, S> ValueEncoder for HashMap<K, V, S> {
|
||||
#[inline]
|
||||
fn encode(self, o: ValueSer<'_>) {
|
||||
value_as_object!(|o| {
|
||||
for (k, v) in self {
|
||||
o.entry(k, v);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: KeyEncoder, V: ValueEncoder> ValueEncoder for BTreeMap<K, V> {
|
||||
#[inline]
|
||||
fn encode(self, o: ValueSer<'_>) {
|
||||
value_as_object!(|o| {
|
||||
for (k, v) in self {
|
||||
o.entry(k, v);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -221,7 +221,7 @@ pub struct TimelineMembershipSwitchRequest {
|
||||
pub struct TimelineMembershipSwitchResponse {
|
||||
pub previous_conf: Configuration,
|
||||
pub current_conf: Configuration,
|
||||
pub last_log_term: Term,
|
||||
pub term: Term,
|
||||
pub flush_lsn: Lsn,
|
||||
}
|
||||
|
||||
|
||||
@@ -28,7 +28,6 @@ use reqwest::Url;
|
||||
use storage_broker::Uri;
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
use utils::logging::{LogFormat, SecretString};
|
||||
use utils::serde_percent::Percent;
|
||||
|
||||
use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
|
||||
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
@@ -460,16 +459,7 @@ impl PageServerConf {
|
||||
metric_collection_endpoint,
|
||||
metric_collection_bucket,
|
||||
synthetic_size_calculation_interval,
|
||||
disk_usage_based_eviction: Some(disk_usage_based_eviction.unwrap_or(
|
||||
DiskUsageEvictionTaskConfig {
|
||||
max_usage_pct: Percent::new(80).unwrap(),
|
||||
min_avail_bytes: 2_000_000_000,
|
||||
period: Duration::from_secs(60),
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: Default::default(),
|
||||
},
|
||||
)),
|
||||
disk_usage_based_eviction,
|
||||
test_remote_failures,
|
||||
ondemand_download_behavior_treat_error_as_warn,
|
||||
background_task_maximum_delay,
|
||||
@@ -707,8 +697,6 @@ impl ConfigurableSemaphore {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use rstest::rstest;
|
||||
use utils::id::NodeId;
|
||||
@@ -810,20 +798,4 @@ mod tests {
|
||||
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
|
||||
.expect("parse_and_validate");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_config_disk_usage_based_eviction_is_valid() {
|
||||
let input = r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
"#;
|
||||
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
|
||||
.expect("disk_usage_based_eviction is valid");
|
||||
let workdir = Utf8PathBuf::from("/nonexistent");
|
||||
let config = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir).unwrap();
|
||||
let disk_usage_based_eviction = config.disk_usage_based_eviction.unwrap();
|
||||
assert_eq!(disk_usage_based_eviction.max_usage_pct.get(), 80);
|
||||
assert_eq!(disk_usage_based_eviction.min_avail_bytes, 2_000_000_000);
|
||||
assert_eq!(disk_usage_based_eviction.period, Duration::from_secs(60));
|
||||
assert_eq!(disk_usage_based_eviction.eviction_order, Default::default());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,7 +50,6 @@ use tokio::io::{AsyncRead, AsyncReadExt as _, AsyncWrite, AsyncWriteExt as _, Bu
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tonic::service::Interceptor as _;
|
||||
use tonic::transport::server::TcpConnectInfo;
|
||||
use tracing::*;
|
||||
use utils::auth::{Claims, Scope, SwappableJwtAuth};
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
@@ -3686,15 +3685,8 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
yield match result {
|
||||
Ok(resp) => resp,
|
||||
// Convert per-request errors to GetPageResponses as appropriate, or terminate
|
||||
// the stream with a tonic::Status. Log the error regardless, since
|
||||
// ObservabilityLayer can't automatically log stream errors.
|
||||
Err(status) => {
|
||||
// TODO: it would be nice if we could propagate the get_page() fields here.
|
||||
span.in_scope(|| {
|
||||
warn!("request failed with {:?}: {}", status.code(), status.message());
|
||||
});
|
||||
page_api::GetPageResponse::try_from_status(status, req_id)?.into()
|
||||
}
|
||||
// the stream with a tonic::Status.
|
||||
Err(err) => page_api::GetPageResponse::try_from_status(err, req_id)?.into(),
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -3832,85 +3824,40 @@ impl<S: tonic::server::NamedService> tonic::server::NamedService for Observabili
|
||||
const NAME: &'static str = S::NAME; // propagate inner service name
|
||||
}
|
||||
|
||||
impl<S, Req, Resp> tower::Service<http::Request<Req>> for ObservabilityLayerService<S>
|
||||
impl<S, B> tower::Service<http::Request<B>> for ObservabilityLayerService<S>
|
||||
where
|
||||
S: tower::Service<http::Request<Req>, Response = http::Response<Resp>> + Send,
|
||||
S: tower::Service<http::Request<B>>,
|
||||
S::Future: Send + 'static,
|
||||
{
|
||||
type Response = S::Response;
|
||||
type Error = S::Error;
|
||||
type Future = BoxFuture<'static, Result<Self::Response, Self::Error>>;
|
||||
|
||||
fn call(&mut self, mut req: http::Request<Req>) -> Self::Future {
|
||||
fn call(&mut self, mut req: http::Request<B>) -> Self::Future {
|
||||
// Record the request start time as a request extension.
|
||||
//
|
||||
// TODO: we should start a timer here instead, but it currently requires a timeline handle
|
||||
// and SmgrQueryType, which we don't have yet. Refactor it to provide it later.
|
||||
req.extensions_mut().insert(ReceivedAt(Instant::now()));
|
||||
|
||||
// Extract the peer address and gRPC method.
|
||||
let peer = req
|
||||
.extensions()
|
||||
.get::<TcpConnectInfo>()
|
||||
.and_then(|info| info.remote_addr())
|
||||
.map(|addr| addr.to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
let method = req
|
||||
.uri()
|
||||
.path()
|
||||
.split('/')
|
||||
.nth(2)
|
||||
.unwrap_or(req.uri().path())
|
||||
.to_string();
|
||||
|
||||
// Create a basic tracing span.
|
||||
// Create a basic tracing span. Enter the span for the current thread (to use it for inner
|
||||
// sync code like interceptors), and instrument the future (to use it for inner async code
|
||||
// like the page service itself).
|
||||
//
|
||||
// Enter the span for the current thread and instrument the future. It is not sufficient to
|
||||
// only instrument the future, since it only takes effect after the future is returned and
|
||||
// polled, not when the inner service is called below (e.g. during interceptor execution).
|
||||
// The instrument() call below is not sufficient. It only affects the returned future, and
|
||||
// only takes effect when the caller polls it. Any sync code executed when we call
|
||||
// self.inner.call() below (such as interceptors) runs outside of the returned future, and
|
||||
// is not affected by it. We therefore have to enter the span on the current thread too.
|
||||
let span = info_span!(
|
||||
"grpc:pageservice",
|
||||
// These will be populated by TenantMetadataInterceptor.
|
||||
// Set by TenantMetadataInterceptor.
|
||||
tenant_id = field::Empty,
|
||||
timeline_id = field::Empty,
|
||||
shard_id = field::Empty,
|
||||
// NB: empty fields must be listed first above. Otherwise, the field names will be
|
||||
// clobbered when the empty fields are populated. They will be output last regardless.
|
||||
%peer,
|
||||
%method,
|
||||
);
|
||||
let _guard = span.enter();
|
||||
|
||||
// Construct a future for calling the inner service, but don't await it. This avoids having
|
||||
// to clone the inner service into the future below.
|
||||
let call = self.inner.call(req);
|
||||
|
||||
async move {
|
||||
// Await the inner service call.
|
||||
let result = call.await;
|
||||
|
||||
// Log gRPC error statuses. This won't include request info from handler spans, but it
|
||||
// will catch all errors (even those emitted before handler spans are constructed). Only
|
||||
// unary request errors are logged here, not streaming response errors.
|
||||
if let Ok(ref resp) = result
|
||||
&& let Some(status) = tonic::Status::from_header_map(resp.headers())
|
||||
&& status.code() != tonic::Code::Ok
|
||||
{
|
||||
// TODO: it would be nice if we could propagate the handler span's request fields
|
||||
// here. This could e.g. be done by attaching the request fields to
|
||||
// tonic::Status::metadata via a proc macro.
|
||||
warn!(
|
||||
"request failed with {:?}: {}",
|
||||
status.code(),
|
||||
status.message()
|
||||
);
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
.instrument(span.clone())
|
||||
.boxed()
|
||||
Box::pin(self.inner.call(req).instrument(span.clone()))
|
||||
}
|
||||
|
||||
fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
|
||||
|
||||
@@ -2144,31 +2144,14 @@ impl Timeline {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// Regardless of whether we're going to try_freeze_and_flush
|
||||
// cancel walreceiver to stop ingesting more data asap.
|
||||
//
|
||||
// Note that we're accepting a race condition here where we may
|
||||
// do the final flush below, before walreceiver observes the
|
||||
// cancellation and exits.
|
||||
// This means we may open a new InMemoryLayer after the final flush below.
|
||||
// Flush loop is also still running for a short while, so, in theory, it
|
||||
// could also make its way into the upload queue.
|
||||
//
|
||||
// If we wait for the shutdown of the walreceiver before moving on to the
|
||||
// flush, then that would be avoided. But we don't do it because the
|
||||
// walreceiver entertains reads internally, which means that it possibly
|
||||
// depends on the download of layers. Layer download is only sensitive to
|
||||
// the cancellation of the entire timeline, so cancelling the walreceiver
|
||||
// will have no effect on the individual get requests.
|
||||
// This would cause problems when there is a lot of ongoing downloads or
|
||||
// there is S3 unavailabilities, i.e. detach, deletion, etc would hang,
|
||||
// and we can't deallocate resources of the timeline, etc.
|
||||
// or not, stop ingesting any more data.
|
||||
let walreceiver = self.walreceiver.lock().unwrap().take();
|
||||
tracing::debug!(
|
||||
is_some = walreceiver.is_some(),
|
||||
"Waiting for WalReceiverManager..."
|
||||
);
|
||||
if let Some(walreceiver) = walreceiver {
|
||||
walreceiver.cancel().await;
|
||||
walreceiver.shutdown().await;
|
||||
}
|
||||
// ... and inform any waiters for newer LSNs that there won't be any.
|
||||
self.last_record_lsn.shutdown();
|
||||
|
||||
@@ -63,6 +63,7 @@ pub struct WalReceiver {
|
||||
/// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token.
|
||||
/// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`.
|
||||
cancel: CancellationToken,
|
||||
task: tokio::task::JoinHandle<()>,
|
||||
}
|
||||
|
||||
impl WalReceiver {
|
||||
@@ -79,7 +80,7 @@ impl WalReceiver {
|
||||
let loop_status = Arc::new(std::sync::RwLock::new(None));
|
||||
let manager_status = Arc::clone(&loop_status);
|
||||
let cancel = timeline.cancel.child_token();
|
||||
let _task = WALRECEIVER_RUNTIME.spawn({
|
||||
let task = WALRECEIVER_RUNTIME.spawn({
|
||||
let cancel = cancel.clone();
|
||||
async move {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
@@ -120,14 +121,25 @@ impl WalReceiver {
|
||||
Self {
|
||||
manager_status,
|
||||
cancel,
|
||||
task,
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all, level = tracing::Level::DEBUG)]
|
||||
pub async fn cancel(self) {
|
||||
pub async fn shutdown(self) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
debug!("cancelling walreceiver tasks");
|
||||
self.cancel.cancel();
|
||||
match self.task.await {
|
||||
Ok(()) => debug!("Shutdown success"),
|
||||
Err(je) if je.is_cancelled() => unreachable!("not used"),
|
||||
Err(je) if je.is_panic() => {
|
||||
// already logged by panic hook
|
||||
}
|
||||
Err(je) => {
|
||||
error!("shutdown walreceiver task join error: {je}")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {
|
||||
|
||||
@@ -100,7 +100,6 @@ pub(super) async fn connection_manager_loop_step(
|
||||
// with other streams on this client (other connection managers). When
|
||||
// object goes out of scope, stream finishes in drop() automatically.
|
||||
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
|
||||
let mut broker_reset_interval = tokio::time::interval(tokio::time::Duration::from_secs(30));
|
||||
debug!("Subscribed for broker timeline updates");
|
||||
|
||||
loop {
|
||||
@@ -157,10 +156,7 @@ pub(super) async fn connection_manager_loop_step(
|
||||
// Got a new update from the broker
|
||||
broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
|
||||
match broker_update {
|
||||
Ok(Some(broker_update)) => {
|
||||
broker_reset_interval.reset();
|
||||
connection_manager_state.register_timeline_update(broker_update);
|
||||
},
|
||||
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
|
||||
Err(status) => {
|
||||
match status.code() {
|
||||
Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => {
|
||||
@@ -182,14 +178,6 @@ pub(super) async fn connection_manager_loop_step(
|
||||
}
|
||||
},
|
||||
|
||||
_ = broker_reset_interval.tick() => {
|
||||
if wait_lsn_status.borrow().is_some() {
|
||||
tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...")
|
||||
}
|
||||
|
||||
broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
|
||||
},
|
||||
|
||||
new_event = async {
|
||||
// Reminder: this match arm needs to be cancellation-safe.
|
||||
loop {
|
||||
|
||||
@@ -275,12 +275,20 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
let copy_stream = replication_client.copy_both_simple(&query).await?;
|
||||
let mut physical_stream = pin!(ReplicationStream::new(copy_stream));
|
||||
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
|
||||
.await
|
||||
.map_err(|e| match e.kind {
|
||||
crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
|
||||
_ => WalReceiverError::Other(e.into()),
|
||||
})?;
|
||||
let walingest_future = WalIngest::new(timeline.as_ref(), startpoint, &ctx);
|
||||
let walingest_res = select! {
|
||||
walingest_res = walingest_future => walingest_res,
|
||||
_ = cancellation.cancelled() => {
|
||||
// We are doing reads in WalIngest::new, and those can hang as they come from the network.
|
||||
// Timeline cancellation hits the walreceiver cancellation token before it hits the timeline global one.
|
||||
debug!("Connection cancelled");
|
||||
return Err(WalReceiverError::Cancelled);
|
||||
},
|
||||
};
|
||||
let mut walingest = walingest_res.map_err(|e| match e.kind {
|
||||
crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
|
||||
_ => WalReceiverError::Other(e.into()),
|
||||
})?;
|
||||
|
||||
let (format, compression) = match protocol {
|
||||
PostgresClientProtocol::Interpreted {
|
||||
|
||||
@@ -22,8 +22,7 @@ OBJS = \
|
||||
walproposer.o \
|
||||
walproposer_pg.o \
|
||||
neon_ddl_handler.o \
|
||||
walsender_hooks.o \
|
||||
$(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a
|
||||
walsender_hooks.o
|
||||
|
||||
PG_CPPFLAGS = -I$(libpq_srcdir)
|
||||
SHLIB_LINK_INTERNAL = $(libpq)
|
||||
@@ -55,17 +54,6 @@ WALPROP_OBJS = \
|
||||
neon_utils.o \
|
||||
walproposer_compat.o
|
||||
|
||||
# libcommunicator.a is built by cargo from the Rust sources under communicator/
|
||||
# subdirectory. `cargo build` also generates communicator_bindings.h.
|
||||
neon.o: communicator/communicator_bindings.h
|
||||
|
||||
$(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h &:
|
||||
(cd $(srcdir)/communicator && cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE))
|
||||
|
||||
# Force `cargo build` every time. Some of the Rust sources might have
|
||||
# changed.
|
||||
.PHONY: $(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h
|
||||
|
||||
.PHONY: walproposer-lib
|
||||
walproposer-lib: CPPFLAGS += -DWALPROPOSER_LIB
|
||||
walproposer-lib: libwalproposer.a;
|
||||
|
||||
2
pgxn/neon/communicator/.gitignore
vendored
2
pgxn/neon/communicator/.gitignore
vendored
@@ -1,2 +0,0 @@
|
||||
# generated file (with cbindgen, see build.rs)
|
||||
communicator_bindings.h
|
||||
@@ -1,20 +0,0 @@
|
||||
[package]
|
||||
name = "communicator"
|
||||
version = "0.1.0"
|
||||
license.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
[lib]
|
||||
crate-type = ["staticlib"]
|
||||
|
||||
[features]
|
||||
# 'testing' feature is currently unused in the communicator, but we accept it for convenience of
|
||||
# calling build scripts, so that you can pass the same feature to all packages.
|
||||
testing = []
|
||||
|
||||
[dependencies]
|
||||
neon-shmem.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
|
||||
|
||||
[build-dependencies]
|
||||
cbindgen.workspace = true
|
||||
@@ -1,8 +0,0 @@
|
||||
This package will evolve into a "compute-pageserver communicator"
|
||||
process and machinery. For now, it's just a dummy that doesn't do
|
||||
anything interesting, but it allows us to test the compilation and
|
||||
linking of Rust code into the Postgres extensions.
|
||||
|
||||
At compilation time, pgxn/neon/communicator/ produces a static
|
||||
library, libcommunicator.a. It is linked to the neon.so extension
|
||||
library.
|
||||
@@ -1,20 +0,0 @@
|
||||
use std::env;
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
|
||||
|
||||
match cbindgen::generate(crate_dir) {
|
||||
Ok(bindings) => {
|
||||
bindings.write_to_file("communicator_bindings.h");
|
||||
}
|
||||
Err(cbindgen::Error::ParseSyntaxError { .. }) => {
|
||||
// This means there was a syntax error in the Rust sources. Don't panic, because
|
||||
// we want the build to continue and the Rust compiler to hit the error. The
|
||||
// Rust compiler produces a better error message than cbindgen.
|
||||
eprintln!("Generating C bindings failed because of a Rust syntax error");
|
||||
}
|
||||
Err(err) => panic!("Unable to generate C bindings: {err:?}"),
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,4 +0,0 @@
|
||||
language = "C"
|
||||
|
||||
[enum]
|
||||
prefix_with_name = true
|
||||
@@ -1,6 +0,0 @@
|
||||
/// dummy function, just to test linking Rust functions into the C
|
||||
/// extension
|
||||
#[unsafe(no_mangle)]
|
||||
pub extern "C" fn communicator_dummy(arg: u32) -> u32 {
|
||||
arg + 1
|
||||
}
|
||||
@@ -43,9 +43,6 @@
|
||||
#include "storage/ipc.h"
|
||||
#endif
|
||||
|
||||
/* the rust bindings, generated by cbindgen */
|
||||
#include "communicator/communicator_bindings.h"
|
||||
|
||||
PG_MODULE_MAGIC;
|
||||
void _PG_init(void);
|
||||
|
||||
@@ -455,9 +452,6 @@ _PG_init(void)
|
||||
shmem_startup_hook = neon_shmem_startup_hook;
|
||||
#endif
|
||||
|
||||
/* dummy call to a Rust function in the communicator library, to check that it works */
|
||||
(void) communicator_dummy(123);
|
||||
|
||||
pg_init_libpagestore();
|
||||
lfc_init();
|
||||
pg_init_walproposer();
|
||||
|
||||
@@ -98,14 +98,12 @@ typedef struct
|
||||
typedef struct DdlHashTable
|
||||
{
|
||||
struct DdlHashTable *prev_table;
|
||||
size_t subtrans_level;
|
||||
HTAB *db_table;
|
||||
HTAB *role_table;
|
||||
} DdlHashTable;
|
||||
|
||||
static DdlHashTable RootTable;
|
||||
static DdlHashTable *CurrentDdlTable = &RootTable;
|
||||
static int SubtransLevel; /* current nesting level of subtransactions */
|
||||
|
||||
static void
|
||||
PushKeyValue(JsonbParseState **state, char *key, char *value)
|
||||
@@ -334,25 +332,9 @@ SendDeltasToControlPlane()
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
InitCurrentDdlTableIfNeeded()
|
||||
{
|
||||
/* Lazy construction of DllHashTable chain */
|
||||
if (SubtransLevel > CurrentDdlTable->subtrans_level)
|
||||
{
|
||||
DdlHashTable *new_table = MemoryContextAlloc(CurTransactionContext, sizeof(DdlHashTable));
|
||||
new_table->prev_table = CurrentDdlTable;
|
||||
new_table->subtrans_level = SubtransLevel;
|
||||
new_table->role_table = NULL;
|
||||
new_table->db_table = NULL;
|
||||
CurrentDdlTable = new_table;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
InitDbTableIfNeeded()
|
||||
{
|
||||
InitCurrentDdlTableIfNeeded();
|
||||
if (!CurrentDdlTable->db_table)
|
||||
{
|
||||
HASHCTL db_ctl = {};
|
||||
@@ -371,7 +353,6 @@ InitDbTableIfNeeded()
|
||||
static void
|
||||
InitRoleTableIfNeeded()
|
||||
{
|
||||
InitCurrentDdlTableIfNeeded();
|
||||
if (!CurrentDdlTable->role_table)
|
||||
{
|
||||
HASHCTL role_ctl = {};
|
||||
@@ -390,21 +371,19 @@ InitRoleTableIfNeeded()
|
||||
static void
|
||||
PushTable()
|
||||
{
|
||||
SubtransLevel += 1;
|
||||
DdlHashTable *new_table = MemoryContextAlloc(CurTransactionContext, sizeof(DdlHashTable));
|
||||
|
||||
new_table->prev_table = CurrentDdlTable;
|
||||
new_table->role_table = NULL;
|
||||
new_table->db_table = NULL;
|
||||
CurrentDdlTable = new_table;
|
||||
}
|
||||
|
||||
static void
|
||||
MergeTable()
|
||||
{
|
||||
DdlHashTable *old_table;
|
||||
DdlHashTable *old_table = CurrentDdlTable;
|
||||
|
||||
Assert(SubtransLevel >= CurrentDdlTable->subtrans_level);
|
||||
if (--SubtransLevel >= CurrentDdlTable->subtrans_level)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
old_table = CurrentDdlTable;
|
||||
CurrentDdlTable = old_table->prev_table;
|
||||
|
||||
if (old_table->db_table)
|
||||
@@ -497,15 +476,11 @@ MergeTable()
|
||||
static void
|
||||
PopTable()
|
||||
{
|
||||
Assert(SubtransLevel >= CurrentDdlTable->subtrans_level);
|
||||
if (--SubtransLevel < CurrentDdlTable->subtrans_level)
|
||||
{
|
||||
/*
|
||||
* Current table gets freed because it is allocated in aborted
|
||||
* subtransaction's memory context.
|
||||
*/
|
||||
CurrentDdlTable = CurrentDdlTable->prev_table;
|
||||
}
|
||||
/*
|
||||
* Current table gets freed because it is allocated in aborted
|
||||
* subtransaction's memory context.
|
||||
*/
|
||||
CurrentDdlTable = CurrentDdlTable->prev_table;
|
||||
}
|
||||
|
||||
static void
|
||||
|
||||
@@ -5,8 +5,9 @@ edition = "2024"
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
default = ["rest_broker"]
|
||||
testing = ["dep:tokio-postgres"]
|
||||
rest_broker = ["subzero-core", "jsonpath_lib", "ouroboros"]
|
||||
|
||||
[dependencies]
|
||||
ahash.workspace = true
|
||||
@@ -103,6 +104,9 @@ uuid.workspace = true
|
||||
x509-cert.workspace = true
|
||||
redis.workspace = true
|
||||
zerocopy.workspace = true
|
||||
subzero-core = { git = "https://github.com/neondatabase-labs/subzero", rev = "0b3d3278f5f9ac9311a7280cb1676de80e021f06", features = ["postgresql"], optional = true }
|
||||
jsonpath_lib = { version = "0.3.0", optional = true }
|
||||
ouroboros = { version = "0.18", optional = true }
|
||||
|
||||
# jwt stuff
|
||||
jose-jwa = "0.1.2"
|
||||
|
||||
@@ -170,8 +170,8 @@ Create a configuration file called `local_proxy.json` in the root of the repo (u
|
||||
|
||||
Start the local proxy:
|
||||
```sh
|
||||
cargo run --bin local_proxy -- \
|
||||
--disable_pg_session_jwt true \
|
||||
cargo run --bin local_proxy --features testing -- \
|
||||
--disable-pg-session-jwt \
|
||||
--http 0.0.0.0:7432
|
||||
```
|
||||
|
||||
@@ -180,6 +180,7 @@ Start the auth broker:
|
||||
LOGFMT=text OTEL_SDK_DISABLED=true cargo run --bin proxy --features testing -- \
|
||||
-c server.crt -k server.key \
|
||||
--is-auth-broker true \
|
||||
--is-rest-broker true \
|
||||
--wss 0.0.0.0:8080 \
|
||||
--http 0.0.0.0:7002 \
|
||||
--auth-backend local
|
||||
@@ -197,3 +198,9 @@ curl -k "https://foo.local.neon.build:8080/sql" \
|
||||
-H "neon-connection-string: postgresql://authenticator@foo.local.neon.build/database" \
|
||||
-d '{"query":"select 1","params":[]}'
|
||||
```
|
||||
|
||||
Make a rest request against the auth broker (rest broker):
|
||||
```sh
|
||||
curl -k "https://foo.local.neon.build:8080/database/rest/v1/items?select=id,name&id=eq.1" \
|
||||
-H "Authorization: Bearer $NEON_JWT"
|
||||
```
|
||||
|
||||
@@ -20,6 +20,9 @@ use crate::auth::backend::jwt::JwkCache;
|
||||
use crate::auth::backend::local::LocalBackend;
|
||||
use crate::auth::{self};
|
||||
use crate::cancellation::CancellationHandler;
|
||||
|
||||
#[cfg(feature = "rest_broker")]
|
||||
use crate::config::RestConfig;
|
||||
use crate::config::{
|
||||
self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig,
|
||||
refresh_config_loop,
|
||||
@@ -276,6 +279,11 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
|
||||
accept_jwts: true,
|
||||
console_redirect_confirmation_timeout: Duration::ZERO,
|
||||
},
|
||||
#[cfg(feature = "rest_broker")]
|
||||
rest_config: RestConfig {
|
||||
is_rest_broker: false,
|
||||
db_schema_cache: None,
|
||||
},
|
||||
proxy_protocol_v2: config::ProxyProtocolV2::Rejected,
|
||||
handshake_timeout: Duration::from_secs(10),
|
||||
wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?,
|
||||
|
||||
@@ -21,7 +21,7 @@ use tokio::net::TcpListener;
|
||||
use tokio::sync::Notify;
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, warn};
|
||||
use tracing::{Instrument, error, info, warn};
|
||||
use utils::sentry_init::init_sentry;
|
||||
use utils::{project_build_tag, project_git_version};
|
||||
|
||||
@@ -31,6 +31,8 @@ use crate::auth::backend::local::LocalBackend;
|
||||
use crate::auth::backend::{ConsoleRedirectBackend, MaybeOwned};
|
||||
use crate::batch::BatchQueue;
|
||||
use crate::cancellation::{CancellationHandler, CancellationProcessor};
|
||||
#[cfg(feature = "rest_broker")]
|
||||
use crate::config::RestConfig;
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
use crate::config::refresh_config_loop;
|
||||
use crate::config::{
|
||||
@@ -47,6 +49,8 @@ use crate::redis::{elasticache, notifications};
|
||||
use crate::scram::threadpool::ThreadPool;
|
||||
use crate::serverless::GlobalConnPoolOptions;
|
||||
use crate::serverless::cancel_set::CancelSet;
|
||||
#[cfg(feature = "rest_broker")]
|
||||
use crate::serverless::rest::DbSchemaCache;
|
||||
use crate::tls::client_config::compute_client_config_with_root_certs;
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
use crate::url::ApiUrl;
|
||||
@@ -195,9 +199,7 @@ struct ProxyCliArgs {
|
||||
#[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
|
||||
project_info_cache: String,
|
||||
/// cache for all valid endpoints
|
||||
// TODO: remove after a couple of releases.
|
||||
#[clap(long, default_value_t = String::new())]
|
||||
#[deprecated]
|
||||
#[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)]
|
||||
endpoint_cache_config: String,
|
||||
#[clap(flatten)]
|
||||
parquet_upload: ParquetUploadArgs,
|
||||
@@ -246,10 +248,12 @@ struct ProxyCliArgs {
|
||||
|
||||
/// if this is not local proxy, this toggles whether we accept Postgres REST requests
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
#[cfg(feature = "rest_broker")]
|
||||
is_rest_broker: bool,
|
||||
|
||||
/// cache for `db_schema_cache` introspection (use `size=0` to disable)
|
||||
#[clap(long, default_value = "size=1000,ttl=1h")]
|
||||
#[cfg(feature = "rest_broker")]
|
||||
db_schema_cache: String,
|
||||
}
|
||||
|
||||
@@ -517,6 +521,17 @@ pub async fn run() -> anyhow::Result<()> {
|
||||
));
|
||||
maintenance_tasks.spawn(control_plane::mgmt::task_main(mgmt_listener));
|
||||
|
||||
// add a task to flush the db_schema cache every 10 minutes
|
||||
#[cfg(feature = "rest_broker")]
|
||||
if let Some(db_schema_cache) = &config.rest_config.db_schema_cache {
|
||||
maintenance_tasks.spawn(async move {
|
||||
loop {
|
||||
tokio::time::sleep(Duration::from_secs(600)).await;
|
||||
db_schema_cache.flush();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(metrics_config) = &config.metric_collection {
|
||||
// TODO: Add gc regardles of the metric collection being enabled.
|
||||
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
|
||||
@@ -560,6 +575,13 @@ pub async fn run() -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// listen for notifications of new projects/endpoints/branches
|
||||
let cache = api.caches.endpoints_cache.clone();
|
||||
let span = tracing::info_span!("endpoints_cache");
|
||||
maintenance_tasks.spawn(
|
||||
async move { cache.do_read(client, cancellation_token.clone()).await }.instrument(span),
|
||||
);
|
||||
}
|
||||
|
||||
let maintenance = loop {
|
||||
@@ -677,6 +699,28 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
timeout: Duration::from_secs(2),
|
||||
};
|
||||
|
||||
#[cfg(feature = "rest_broker")]
|
||||
let rest_config = {
|
||||
let db_schema_cache_config: CacheOptions = args.db_schema_cache.parse()?;
|
||||
info!("Using DbSchemaCache with options={db_schema_cache_config:?}");
|
||||
|
||||
let db_schema_cache = if args.is_rest_broker {
|
||||
Some(DbSchemaCache::new(
|
||||
"db_schema_cache",
|
||||
db_schema_cache_config.size,
|
||||
db_schema_cache_config.ttl,
|
||||
true,
|
||||
))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
RestConfig {
|
||||
is_rest_broker: args.is_rest_broker,
|
||||
db_schema_cache,
|
||||
}
|
||||
};
|
||||
|
||||
let config = ProxyConfig {
|
||||
tls_config,
|
||||
metric_collection,
|
||||
@@ -689,6 +733,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
connect_to_compute: compute_config,
|
||||
#[cfg(feature = "testing")]
|
||||
disable_pg_session_jwt: false,
|
||||
#[cfg(feature = "rest_broker")]
|
||||
rest_config,
|
||||
};
|
||||
|
||||
let config = Box::leak(Box::new(config));
|
||||
@@ -707,15 +753,18 @@ fn build_auth_backend(
|
||||
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
|
||||
let project_info_cache_config: ProjectInfoCacheOptions =
|
||||
args.project_info_cache.parse()?;
|
||||
let endpoint_cache_config: config::EndpointCacheConfig =
|
||||
args.endpoint_cache_config.parse()?;
|
||||
|
||||
info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
|
||||
info!(
|
||||
"Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
|
||||
);
|
||||
|
||||
info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
|
||||
let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
|
||||
wake_compute_cache_config,
|
||||
project_info_cache_config,
|
||||
endpoint_cache_config,
|
||||
)));
|
||||
|
||||
let config::ConcurrencyLockOptions {
|
||||
@@ -785,15 +834,18 @@ fn build_auth_backend(
|
||||
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
|
||||
let project_info_cache_config: ProjectInfoCacheOptions =
|
||||
args.project_info_cache.parse()?;
|
||||
let endpoint_cache_config: config::EndpointCacheConfig =
|
||||
args.endpoint_cache_config.parse()?;
|
||||
|
||||
info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
|
||||
info!(
|
||||
"Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
|
||||
);
|
||||
|
||||
info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
|
||||
let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
|
||||
wake_compute_cache_config,
|
||||
project_info_cache_config,
|
||||
endpoint_cache_config,
|
||||
)));
|
||||
|
||||
let config::ConcurrencyLockOptions {
|
||||
|
||||
283
proxy/src/cache/endpoints.rs
vendored
Normal file
283
proxy/src/cache/endpoints.rs
vendored
Normal file
@@ -0,0 +1,283 @@
|
||||
use std::convert::Infallible;
|
||||
use std::future::pending;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use clashmap::ClashSet;
|
||||
use redis::streams::{StreamReadOptions, StreamReadReply};
|
||||
use redis::{AsyncCommands, FromRedisValue, Value};
|
||||
use serde::Deserialize;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
|
||||
use crate::config::EndpointCacheConfig;
|
||||
use crate::context::RequestContext;
|
||||
use crate::ext::LockExt;
|
||||
use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
|
||||
use crate::metrics::{Metrics, RedisErrors, RedisEventsCount};
|
||||
use crate::rate_limiter::GlobalRateLimiter;
|
||||
use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
|
||||
use crate::types::EndpointId;
|
||||
|
||||
// TODO: this could be an enum, but events in Redis need to be fixed first.
|
||||
// ProjectCreated was sent with type:branch_created. So we ignore type.
|
||||
#[derive(Deserialize, Debug, Clone, PartialEq)]
|
||||
struct ControlPlaneEvent {
|
||||
endpoint_created: Option<EndpointCreated>,
|
||||
branch_created: Option<BranchCreated>,
|
||||
project_created: Option<ProjectCreated>,
|
||||
#[serde(rename = "type")]
|
||||
_type: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug, Clone, PartialEq)]
|
||||
struct EndpointCreated {
|
||||
endpoint_id: EndpointIdInt,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug, Clone, PartialEq)]
|
||||
struct BranchCreated {
|
||||
branch_id: BranchIdInt,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug, Clone, PartialEq)]
|
||||
struct ProjectCreated {
|
||||
project_id: ProjectIdInt,
|
||||
}
|
||||
|
||||
impl TryFrom<&Value> for ControlPlaneEvent {
|
||||
type Error = anyhow::Error;
|
||||
fn try_from(value: &Value) -> Result<Self, Self::Error> {
|
||||
let json = String::from_redis_value(value)?;
|
||||
Ok(serde_json::from_str(&json)?)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct EndpointsCache {
|
||||
config: EndpointCacheConfig,
|
||||
endpoints: ClashSet<EndpointIdInt>,
|
||||
branches: ClashSet<BranchIdInt>,
|
||||
projects: ClashSet<ProjectIdInt>,
|
||||
ready: AtomicBool,
|
||||
limiter: Arc<Mutex<GlobalRateLimiter>>,
|
||||
}
|
||||
|
||||
impl EndpointsCache {
|
||||
pub(crate) fn new(config: EndpointCacheConfig) -> Self {
|
||||
Self {
|
||||
limiter: Arc::new(Mutex::new(GlobalRateLimiter::new(
|
||||
config.limiter_info.clone(),
|
||||
))),
|
||||
config,
|
||||
endpoints: ClashSet::new(),
|
||||
branches: ClashSet::new(),
|
||||
projects: ClashSet::new(),
|
||||
ready: AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn is_valid(&self, ctx: &RequestContext, endpoint: &EndpointId) -> bool {
|
||||
if !self.ready.load(Ordering::Acquire) {
|
||||
// the endpoint cache is not yet fully initialised.
|
||||
return true;
|
||||
}
|
||||
|
||||
if !self.should_reject(endpoint) {
|
||||
ctx.set_rejected(false);
|
||||
return true;
|
||||
}
|
||||
|
||||
// report that we might want to reject this endpoint
|
||||
ctx.set_rejected(true);
|
||||
|
||||
// If cache is disabled, just collect the metrics and return.
|
||||
if self.config.disable_cache {
|
||||
return true;
|
||||
}
|
||||
|
||||
// If the limiter allows, we can pretend like it's valid
|
||||
// (incase it is, due to redis channel lag).
|
||||
if self.limiter.lock_propagate_poison().check() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// endpoint not found, and there's too much load.
|
||||
false
|
||||
}
|
||||
|
||||
fn should_reject(&self, endpoint: &EndpointId) -> bool {
|
||||
if endpoint.is_endpoint() {
|
||||
let Some(endpoint) = EndpointIdInt::get(endpoint) else {
|
||||
// if we haven't interned this endpoint, it's not in the cache.
|
||||
return true;
|
||||
};
|
||||
!self.endpoints.contains(&endpoint)
|
||||
} else if endpoint.is_branch() {
|
||||
let Some(branch) = BranchIdInt::get(endpoint) else {
|
||||
// if we haven't interned this branch, it's not in the cache.
|
||||
return true;
|
||||
};
|
||||
!self.branches.contains(&branch)
|
||||
} else {
|
||||
let Some(project) = ProjectIdInt::get(endpoint) else {
|
||||
// if we haven't interned this project, it's not in the cache.
|
||||
return true;
|
||||
};
|
||||
!self.projects.contains(&project)
|
||||
}
|
||||
}
|
||||
|
||||
fn insert_event(&self, event: ControlPlaneEvent) {
|
||||
if let Some(endpoint_created) = event.endpoint_created {
|
||||
self.endpoints.insert(endpoint_created.endpoint_id);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.redis_events_count
|
||||
.inc(RedisEventsCount::EndpointCreated);
|
||||
} else if let Some(branch_created) = event.branch_created {
|
||||
self.branches.insert(branch_created.branch_id);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.redis_events_count
|
||||
.inc(RedisEventsCount::BranchCreated);
|
||||
} else if let Some(project_created) = event.project_created {
|
||||
self.projects.insert(project_created.project_id);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.redis_events_count
|
||||
.inc(RedisEventsCount::ProjectCreated);
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn do_read(
|
||||
&self,
|
||||
mut con: ConnectionWithCredentialsProvider,
|
||||
cancellation_token: CancellationToken,
|
||||
) -> anyhow::Result<Infallible> {
|
||||
let mut last_id = "0-0".to_string();
|
||||
loop {
|
||||
if let Err(e) = con.connect().await {
|
||||
tracing::error!("error connecting to redis: {:?}", e);
|
||||
self.ready.store(false, Ordering::Release);
|
||||
}
|
||||
if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await {
|
||||
tracing::error!("error reading from redis: {:?}", e);
|
||||
self.ready.store(false, Ordering::Release);
|
||||
}
|
||||
if cancellation_token.is_cancelled() {
|
||||
info!("cancellation token is cancelled, exiting");
|
||||
// Maintenance tasks run forever. Sleep forever when canceled.
|
||||
pending::<()>().await;
|
||||
}
|
||||
tokio::time::sleep(self.config.retry_interval).await;
|
||||
}
|
||||
}
|
||||
|
||||
async fn read_from_stream(
|
||||
&self,
|
||||
con: &mut ConnectionWithCredentialsProvider,
|
||||
last_id: &mut String,
|
||||
) -> anyhow::Result<()> {
|
||||
tracing::info!("reading endpoints/branches/projects from redis");
|
||||
self.batch_read(
|
||||
con,
|
||||
StreamReadOptions::default().count(self.config.initial_batch_size),
|
||||
last_id,
|
||||
true,
|
||||
)
|
||||
.await?;
|
||||
tracing::info!("ready to filter user requests");
|
||||
self.ready.store(true, Ordering::Release);
|
||||
self.batch_read(
|
||||
con,
|
||||
StreamReadOptions::default()
|
||||
.count(self.config.default_batch_size)
|
||||
.block(self.config.xread_timeout.as_millis() as usize),
|
||||
last_id,
|
||||
false,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn batch_read(
|
||||
&self,
|
||||
conn: &mut ConnectionWithCredentialsProvider,
|
||||
opts: StreamReadOptions,
|
||||
last_id: &mut String,
|
||||
return_when_finish: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut total: usize = 0;
|
||||
loop {
|
||||
let mut res: StreamReadReply = conn
|
||||
.xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts)
|
||||
.await?;
|
||||
|
||||
if res.keys.is_empty() {
|
||||
if return_when_finish {
|
||||
if total != 0 {
|
||||
break;
|
||||
}
|
||||
anyhow::bail!(
|
||||
"Redis stream {} is empty, cannot be used to filter endpoints",
|
||||
self.config.stream_name
|
||||
);
|
||||
}
|
||||
// If we are not returning when finish, we should wait for more data.
|
||||
continue;
|
||||
}
|
||||
if res.keys.len() != 1 {
|
||||
anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name);
|
||||
}
|
||||
|
||||
let key = res.keys.pop().expect("Checked length above");
|
||||
let len = key.ids.len();
|
||||
for stream_id in key.ids {
|
||||
total += 1;
|
||||
for value in stream_id.map.values() {
|
||||
match value.try_into() {
|
||||
Ok(event) => self.insert_event(event),
|
||||
Err(err) => {
|
||||
Metrics::get().proxy.redis_errors_total.inc(RedisErrors {
|
||||
channel: &self.config.stream_name,
|
||||
});
|
||||
tracing::error!("error parsing value {value:?}: {err:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
if total.is_power_of_two() {
|
||||
tracing::debug!("endpoints read {}", total);
|
||||
}
|
||||
*last_id = stream_id.id;
|
||||
}
|
||||
if return_when_finish && len <= self.config.default_batch_size {
|
||||
break;
|
||||
}
|
||||
}
|
||||
tracing::info!("read {} endpoints/branches/projects from redis", total);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_parse_control_plane_event() {
|
||||
let s = r#"{"branch_created":null,"endpoint_created":{"endpoint_id":"ep-rapid-thunder-w0qqw2q9"},"project_created":null,"type":"endpoint_created"}"#;
|
||||
|
||||
let endpoint_id: EndpointId = "ep-rapid-thunder-w0qqw2q9".into();
|
||||
|
||||
assert_eq!(
|
||||
serde_json::from_str::<ControlPlaneEvent>(s).unwrap(),
|
||||
ControlPlaneEvent {
|
||||
endpoint_created: Some(EndpointCreated {
|
||||
endpoint_id: endpoint_id.into(),
|
||||
}),
|
||||
branch_created: None,
|
||||
project_created: None,
|
||||
_type: Some("endpoint_created".into()),
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
1
proxy/src/cache/mod.rs
vendored
1
proxy/src/cache/mod.rs
vendored
@@ -1,4 +1,5 @@
|
||||
pub(crate) mod common;
|
||||
pub(crate) mod endpoints;
|
||||
pub(crate) mod project_info;
|
||||
mod timed_lru;
|
||||
|
||||
|
||||
26
proxy/src/cache/timed_lru.rs
vendored
26
proxy/src/cache/timed_lru.rs
vendored
@@ -204,6 +204,10 @@ impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
|
||||
self.insert_raw_ttl(key, value, ttl, false);
|
||||
}
|
||||
|
||||
pub(crate) fn insert(&self, key: K, value: V) {
|
||||
self.insert_raw_ttl(key, value, self.ttl, self.update_ttl_on_retrieval);
|
||||
}
|
||||
|
||||
pub(crate) fn insert_unit(&self, key: K, value: V) -> (Option<V>, Cached<&Self, ()>) {
|
||||
let (_, old) = self.insert_raw(key.clone(), value);
|
||||
|
||||
@@ -214,6 +218,28 @@ impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
|
||||
|
||||
(old, cached)
|
||||
}
|
||||
|
||||
pub(crate) fn flush(&self) {
|
||||
let now = Instant::now();
|
||||
let mut cache = self.cache.lock();
|
||||
|
||||
// Collect keys of expired entries first
|
||||
let expired_keys: Vec<_> = cache
|
||||
.iter()
|
||||
.filter_map(|(key, entry)| {
|
||||
if entry.expires_at <= now {
|
||||
Some(key.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Remove expired entries
|
||||
for key in expired_keys {
|
||||
cache.remove(&key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
|
||||
|
||||
@@ -18,10 +18,12 @@ use crate::control_plane::locks::ApiLocks;
|
||||
use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings};
|
||||
use crate::ext::TaskExt;
|
||||
use crate::intern::RoleNameInt;
|
||||
use crate::rate_limiter::{RateLimitAlgorithm, RateLimiterConfig};
|
||||
use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig};
|
||||
use crate::scram::threadpool::ThreadPool;
|
||||
use crate::serverless::GlobalConnPoolOptions;
|
||||
use crate::serverless::cancel_set::CancelSet;
|
||||
#[cfg(feature = "rest_broker")]
|
||||
use crate::serverless::rest::DbSchemaCache;
|
||||
pub use crate::tls::server_config::{TlsConfig, configure_tls};
|
||||
use crate::types::{Host, RoleName};
|
||||
|
||||
@@ -30,6 +32,8 @@ pub struct ProxyConfig {
|
||||
pub metric_collection: Option<MetricCollectionConfig>,
|
||||
pub http_config: HttpConfig,
|
||||
pub authentication_config: AuthenticationConfig,
|
||||
#[cfg(feature = "rest_broker")]
|
||||
pub rest_config: RestConfig,
|
||||
pub proxy_protocol_v2: ProxyProtocolV2,
|
||||
pub handshake_timeout: Duration,
|
||||
pub wake_compute_retry_config: RetryConfig,
|
||||
@@ -80,6 +84,85 @@ pub struct AuthenticationConfig {
|
||||
pub console_redirect_confirmation_timeout: tokio::time::Duration,
|
||||
}
|
||||
|
||||
#[cfg(feature = "rest_broker")]
|
||||
pub struct RestConfig {
|
||||
pub is_rest_broker: bool,
|
||||
pub db_schema_cache: Option<DbSchemaCache>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct EndpointCacheConfig {
|
||||
/// Batch size to receive all endpoints on the startup.
|
||||
pub initial_batch_size: usize,
|
||||
/// Batch size to receive endpoints.
|
||||
pub default_batch_size: usize,
|
||||
/// Timeouts for the stream read operation.
|
||||
pub xread_timeout: Duration,
|
||||
/// Stream name to read from.
|
||||
pub stream_name: String,
|
||||
/// Limiter info (to distinguish when to enable cache).
|
||||
pub limiter_info: Vec<RateBucketInfo>,
|
||||
/// Disable cache.
|
||||
/// If true, cache is ignored, but reports all statistics.
|
||||
pub disable_cache: bool,
|
||||
/// Retry interval for the stream read operation.
|
||||
pub retry_interval: Duration,
|
||||
}
|
||||
|
||||
impl EndpointCacheConfig {
|
||||
/// Default options for [`crate::control_plane::NodeInfoCache`].
|
||||
/// Notice that by default the limiter is empty, which means that cache is disabled.
|
||||
pub const CACHE_DEFAULT_OPTIONS: &'static str = "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s";
|
||||
|
||||
/// Parse cache options passed via cmdline.
|
||||
/// Example: [`Self::CACHE_DEFAULT_OPTIONS`].
|
||||
fn parse(options: &str) -> anyhow::Result<Self> {
|
||||
let mut initial_batch_size = None;
|
||||
let mut default_batch_size = None;
|
||||
let mut xread_timeout = None;
|
||||
let mut stream_name = None;
|
||||
let mut limiter_info = vec![];
|
||||
let mut disable_cache = false;
|
||||
let mut retry_interval = None;
|
||||
|
||||
for option in options.split(',') {
|
||||
let (key, value) = option
|
||||
.split_once('=')
|
||||
.with_context(|| format!("bad key-value pair: {option}"))?;
|
||||
|
||||
match key {
|
||||
"initial_batch_size" => initial_batch_size = Some(value.parse()?),
|
||||
"default_batch_size" => default_batch_size = Some(value.parse()?),
|
||||
"xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?),
|
||||
"stream_name" => stream_name = Some(value.to_string()),
|
||||
"limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?),
|
||||
"disable_cache" => disable_cache = value.parse()?,
|
||||
"retry_interval" => retry_interval = Some(humantime::parse_duration(value)?),
|
||||
unknown => bail!("unknown key: {unknown}"),
|
||||
}
|
||||
}
|
||||
RateBucketInfo::validate(&mut limiter_info)?;
|
||||
|
||||
Ok(Self {
|
||||
initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?,
|
||||
default_batch_size: default_batch_size.context("missing `default_batch_size`")?,
|
||||
xread_timeout: xread_timeout.context("missing `xread_timeout`")?,
|
||||
stream_name: stream_name.context("missing `stream_name`")?,
|
||||
disable_cache,
|
||||
limiter_info,
|
||||
retry_interval: retry_interval.context("missing `retry_interval`")?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for EndpointCacheConfig {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(options: &str) -> Result<Self, Self::Err> {
|
||||
let error = || format!("failed to parse endpoint cache options '{options}'");
|
||||
Self::parse(options).with_context(error)
|
||||
}
|
||||
}
|
||||
#[derive(Debug)]
|
||||
pub struct MetricBackupCollectionConfig {
|
||||
pub remote_storage_config: Option<RemoteStorageConfig>,
|
||||
|
||||
@@ -7,7 +7,7 @@ use once_cell::sync::OnceCell;
|
||||
use smol_str::SmolStr;
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::field::display;
|
||||
use tracing::{Span, error, info_span};
|
||||
use tracing::{Span, debug, error, info_span};
|
||||
use try_lock::TryLock;
|
||||
use uuid::Uuid;
|
||||
|
||||
@@ -15,7 +15,10 @@ use self::parquet::RequestData;
|
||||
use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
|
||||
use crate::error::ErrorKind;
|
||||
use crate::intern::{BranchIdInt, ProjectIdInt};
|
||||
use crate::metrics::{LatencyAccumulated, LatencyTimer, Metrics, Protocol, Waiting};
|
||||
use crate::metrics::{
|
||||
ConnectOutcome, InvalidEndpointsGroup, LatencyAccumulated, LatencyTimer, Metrics, Protocol,
|
||||
Waiting,
|
||||
};
|
||||
use crate::pqproto::StartupMessageParams;
|
||||
use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra};
|
||||
use crate::types::{DbName, EndpointId, RoleName};
|
||||
@@ -67,6 +70,8 @@ struct RequestContextInner {
|
||||
// This sender is only used to log the length of session in case of success.
|
||||
disconnect_sender: Option<mpsc::UnboundedSender<RequestData>>,
|
||||
pub(crate) latency_timer: LatencyTimer,
|
||||
// Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane.
|
||||
rejected: Option<bool>,
|
||||
disconnect_timestamp: Option<chrono::DateTime<Utc>>,
|
||||
}
|
||||
|
||||
@@ -101,6 +106,7 @@ impl Clone for RequestContext {
|
||||
auth_method: inner.auth_method.clone(),
|
||||
jwt_issuer: inner.jwt_issuer.clone(),
|
||||
success: inner.success,
|
||||
rejected: inner.rejected,
|
||||
cold_start_info: inner.cold_start_info,
|
||||
pg_options: inner.pg_options.clone(),
|
||||
testodrome_query_id: inner.testodrome_query_id.clone(),
|
||||
@@ -145,6 +151,7 @@ impl RequestContext {
|
||||
auth_method: None,
|
||||
jwt_issuer: None,
|
||||
success: false,
|
||||
rejected: None,
|
||||
cold_start_info: ColdStartInfo::Unknown,
|
||||
pg_options: None,
|
||||
testodrome_query_id: None,
|
||||
@@ -176,6 +183,11 @@ impl RequestContext {
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn set_rejected(&self, rejected: bool) {
|
||||
let mut this = self.0.try_lock().expect("should not deadlock");
|
||||
this.rejected = Some(rejected);
|
||||
}
|
||||
|
||||
pub(crate) fn set_cold_start_info(&self, info: ColdStartInfo) {
|
||||
self.0
|
||||
.try_lock()
|
||||
@@ -449,6 +461,38 @@ impl RequestContextInner {
|
||||
}
|
||||
|
||||
fn log_connect(&mut self) {
|
||||
let outcome = if self.success {
|
||||
ConnectOutcome::Success
|
||||
} else {
|
||||
ConnectOutcome::Failed
|
||||
};
|
||||
|
||||
// TODO: get rid of entirely/refactor
|
||||
// check for false positives
|
||||
// AND false negatives
|
||||
if let Some(rejected) = self.rejected {
|
||||
let ep = self
|
||||
.endpoint_id
|
||||
.as_ref()
|
||||
.map(|x| x.as_str())
|
||||
.unwrap_or_default();
|
||||
// This makes sense only if cache is disabled
|
||||
debug!(
|
||||
?outcome,
|
||||
?rejected,
|
||||
?ep,
|
||||
"check endpoint is valid with outcome"
|
||||
);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.invalid_endpoints_total
|
||||
.inc(InvalidEndpointsGroup {
|
||||
protocol: self.protocol,
|
||||
rejected: rejected.into(),
|
||||
outcome,
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(tx) = self.sender.take() {
|
||||
// If type changes, this error handling needs to be updated.
|
||||
let tx: mpsc::UnboundedSender<RequestData> = tx;
|
||||
|
||||
@@ -159,6 +159,13 @@ impl NeonControlPlaneClient {
|
||||
ctx: &RequestContext,
|
||||
endpoint: &EndpointId,
|
||||
) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
|
||||
if !self
|
||||
.caches
|
||||
.endpoints_cache
|
||||
.is_valid(ctx, &endpoint.normalize())
|
||||
{
|
||||
return Err(GetEndpointJwksError::EndpointNotFound);
|
||||
}
|
||||
let request_id = ctx.session_id().to_string();
|
||||
async {
|
||||
let request = self
|
||||
@@ -293,6 +300,11 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
|
||||
return Ok(secret);
|
||||
}
|
||||
|
||||
if !self.caches.endpoints_cache.is_valid(ctx, normalized_ep) {
|
||||
info!("endpoint is not valid, skipping the request");
|
||||
return Err(GetAuthInfoError::UnknownEndpoint);
|
||||
}
|
||||
|
||||
let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?;
|
||||
|
||||
let control = EndpointAccessControl {
|
||||
@@ -334,6 +346,11 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
|
||||
return Ok(control);
|
||||
}
|
||||
|
||||
if !self.caches.endpoints_cache.is_valid(ctx, normalized_ep) {
|
||||
info!("endpoint is not valid, skipping the request");
|
||||
return Err(GetAuthInfoError::UnknownEndpoint);
|
||||
}
|
||||
|
||||
let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?;
|
||||
|
||||
let control = EndpointAccessControl {
|
||||
|
||||
@@ -8,13 +8,14 @@ use std::time::Duration;
|
||||
|
||||
use clashmap::ClashMap;
|
||||
use tokio::time::Instant;
|
||||
use tracing::{debug, info};
|
||||
use tracing::debug;
|
||||
|
||||
use super::{EndpointAccessControl, RoleAccessControl};
|
||||
use crate::auth::backend::ComputeUserInfo;
|
||||
use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError};
|
||||
use crate::cache::endpoints::EndpointsCache;
|
||||
use crate::cache::project_info::ProjectInfoCacheImpl;
|
||||
use crate::config::{CacheOptions, ProjectInfoCacheOptions};
|
||||
use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions};
|
||||
use crate::context::RequestContext;
|
||||
use crate::control_plane::{CachedNodeInfo, ControlPlaneApi, NodeInfoCache, errors};
|
||||
use crate::error::ReportableError;
|
||||
@@ -120,12 +121,15 @@ pub struct ApiCaches {
|
||||
pub(crate) node_info: NodeInfoCache,
|
||||
/// Cache which stores project_id -> endpoint_ids mapping.
|
||||
pub project_info: Arc<ProjectInfoCacheImpl>,
|
||||
/// List of all valid endpoints.
|
||||
pub endpoints_cache: Arc<EndpointsCache>,
|
||||
}
|
||||
|
||||
impl ApiCaches {
|
||||
pub fn new(
|
||||
wake_compute_cache_config: CacheOptions,
|
||||
project_info_cache_config: ProjectInfoCacheOptions,
|
||||
endpoint_cache_config: EndpointCacheConfig,
|
||||
) -> Self {
|
||||
Self {
|
||||
node_info: NodeInfoCache::new(
|
||||
@@ -135,6 +139,7 @@ impl ApiCaches {
|
||||
true,
|
||||
),
|
||||
project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)),
|
||||
endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -229,7 +234,8 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
|
||||
// temporary lock a single shard and then clear any semaphores that aren't currently checked out
|
||||
// race conditions: if strong_count == 1, there's no way that it can increase while the shard is locked
|
||||
// therefore releasing it is safe from race conditions
|
||||
info!(
|
||||
debug!(
|
||||
//FIXME: is anything depending on this being info?
|
||||
name = self.name,
|
||||
shard = i,
|
||||
"performing epoch reclamation on api lock"
|
||||
|
||||
@@ -99,6 +99,10 @@ pub(crate) enum GetAuthInfoError {
|
||||
|
||||
#[error(transparent)]
|
||||
ApiError(ControlPlaneError),
|
||||
|
||||
/// Proxy does not know about the endpoint in advanced
|
||||
#[error("endpoint not found in endpoint cache")]
|
||||
UnknownEndpoint,
|
||||
}
|
||||
|
||||
// This allows more useful interactions than `#[from]`.
|
||||
@@ -115,6 +119,8 @@ impl UserFacingError for GetAuthInfoError {
|
||||
Self::BadSecret => REQUEST_FAILED.to_owned(),
|
||||
// However, API might return a meaningful error.
|
||||
Self::ApiError(e) => e.to_string_client(),
|
||||
// pretend like control plane returned an error.
|
||||
Self::UnknownEndpoint => REQUEST_FAILED.to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -124,6 +130,8 @@ impl ReportableError for GetAuthInfoError {
|
||||
match self {
|
||||
Self::BadSecret => crate::error::ErrorKind::ControlPlane,
|
||||
Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
|
||||
// we only apply endpoint filtering if control plane is under high load.
|
||||
Self::UnknownEndpoint => crate::error::ErrorKind::ServiceRateLimit,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -192,6 +200,9 @@ impl CouldRetry for WakeComputeError {
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum GetEndpointJwksError {
|
||||
#[error("endpoint not found")]
|
||||
EndpointNotFound,
|
||||
|
||||
#[error("failed to build control plane request: {0}")]
|
||||
RequestBuild(#[source] reqwest::Error),
|
||||
|
||||
|
||||
@@ -16,6 +16,44 @@ use super::LeakyBucketConfig;
|
||||
use crate::ext::LockExt;
|
||||
use crate::intern::EndpointIdInt;
|
||||
|
||||
pub struct GlobalRateLimiter {
|
||||
data: Vec<RateBucket>,
|
||||
info: Vec<RateBucketInfo>,
|
||||
}
|
||||
|
||||
impl GlobalRateLimiter {
|
||||
pub fn new(info: Vec<RateBucketInfo>) -> Self {
|
||||
Self {
|
||||
data: vec![
|
||||
RateBucket {
|
||||
start: Instant::now(),
|
||||
count: 0,
|
||||
};
|
||||
info.len()
|
||||
],
|
||||
info,
|
||||
}
|
||||
}
|
||||
|
||||
/// Check that number of connections is below `max_rps` rps.
|
||||
pub fn check(&mut self) -> bool {
|
||||
let now = Instant::now();
|
||||
|
||||
let should_allow_request = self
|
||||
.data
|
||||
.iter_mut()
|
||||
.zip(&self.info)
|
||||
.all(|(bucket, info)| bucket.should_allow_request(info, now, 1));
|
||||
|
||||
if should_allow_request {
|
||||
// only increment the bucket counts if the request will actually be accepted
|
||||
self.data.iter_mut().for_each(|b| b.inc(1));
|
||||
}
|
||||
|
||||
should_allow_request
|
||||
}
|
||||
}
|
||||
|
||||
// Simple per-endpoint rate limiter.
|
||||
//
|
||||
// Check that number of connections to the endpoint is below `max_rps` rps.
|
||||
|
||||
@@ -8,4 +8,4 @@ pub(crate) use limit_algorithm::aimd::Aimd;
|
||||
pub(crate) use limit_algorithm::{
|
||||
DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token,
|
||||
};
|
||||
pub use limiter::{RateBucketInfo, WakeComputeRateLimiter};
|
||||
pub use limiter::{GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
|
||||
|
||||
@@ -11,6 +11,8 @@ mod http_conn_pool;
|
||||
mod http_util;
|
||||
mod json;
|
||||
mod local_conn_pool;
|
||||
#[cfg(feature = "rest_broker")]
|
||||
pub mod rest;
|
||||
mod sql_over_http;
|
||||
mod websocket;
|
||||
|
||||
@@ -487,6 +489,37 @@ async fn request_handler(
|
||||
.body(Empty::new().map_err(|x| match x {}).boxed())
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))
|
||||
} else {
|
||||
json_response(StatusCode::BAD_REQUEST, "query is not supported")
|
||||
#[cfg(feature = "rest_broker")]
|
||||
{
|
||||
if config.rest_config.is_rest_broker && {
|
||||
let path_parts: Vec<&str> = request.uri().path().split('/').collect();
|
||||
path_parts.len() >= 3 && path_parts[2].starts_with("rest")
|
||||
} {
|
||||
let ctx =
|
||||
RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Http);
|
||||
let span = ctx.span();
|
||||
|
||||
let testodrome_id = request
|
||||
.headers()
|
||||
.get("X-Neon-Query-ID")
|
||||
.and_then(|value| value.to_str().ok())
|
||||
.map(|s| s.to_string());
|
||||
|
||||
if let Some(query_id) = testodrome_id {
|
||||
info!(parent: &ctx.span(), "testodrome query ID: {query_id}");
|
||||
ctx.set_testodrome_id(query_id.into());
|
||||
}
|
||||
|
||||
rest::handle(config, ctx, request, backend, http_cancellation_token)
|
||||
.instrument(span)
|
||||
.await
|
||||
} else {
|
||||
json_response(StatusCode::BAD_REQUEST, "query is not supported")
|
||||
}
|
||||
}
|
||||
#[cfg(not(feature = "rest_broker"))]
|
||||
{
|
||||
json_response(StatusCode::BAD_REQUEST, "query is not supported")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
1192
proxy/src/serverless/rest.rs
Normal file
1192
proxy/src/serverless/rest.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -107,3 +107,13 @@ smol_str_wrapper!(DbName);
|
||||
|
||||
// postgres hostname, will likely be a port:ip addr
|
||||
smol_str_wrapper!(Host);
|
||||
|
||||
// Endpoints are a bit tricky. Rare they might be branches or projects.
|
||||
impl EndpointId {
|
||||
pub(crate) fn is_endpoint(&self) -> bool {
|
||||
self.0.starts_with("ep-")
|
||||
}
|
||||
pub(crate) fn is_branch(&self) -> bool {
|
||||
self.0.starts_with("br-")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -197,7 +197,7 @@ impl StateSK {
|
||||
Ok(TimelineMembershipSwitchResponse {
|
||||
previous_conf: result.previous_conf,
|
||||
current_conf: result.current_conf,
|
||||
last_log_term: self.state().acceptor_state.term,
|
||||
term: self.state().acceptor_state.term,
|
||||
flush_lsn: self.flush_lsn(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -6,11 +6,6 @@ use utils::id::NodeId;
|
||||
|
||||
pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 64;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub(crate) struct Delete {
|
||||
pub(crate) node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub(crate) struct Drain {
|
||||
pub(crate) node_id: NodeId,
|
||||
@@ -23,7 +18,6 @@ pub(crate) struct Fill {
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub(crate) enum Operation {
|
||||
Delete(Delete),
|
||||
Drain(Drain),
|
||||
Fill(Fill),
|
||||
}
|
||||
@@ -36,8 +30,6 @@ pub(crate) enum OperationError {
|
||||
FinalizeError(Cow<'static, str>),
|
||||
#[error("Operation cancelled")]
|
||||
Cancelled,
|
||||
#[error("Impossible constraint error: {0}")]
|
||||
ImpossibleConstraint(Cow<'static, str>),
|
||||
}
|
||||
|
||||
pub(crate) struct OperationHandler {
|
||||
@@ -46,12 +38,6 @@ pub(crate) struct OperationHandler {
|
||||
pub(crate) cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl Display for Delete {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(f, "delete {}", self.node_id)
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Drain {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(f, "drain {}", self.node_id)
|
||||
@@ -67,7 +53,6 @@ impl Display for Fill {
|
||||
impl Display for Operation {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match self {
|
||||
Operation::Delete(op) => write!(f, "{op}"),
|
||||
Operation::Drain(op) => write!(f, "{op}"),
|
||||
Operation::Fill(op) => write!(f, "{op}"),
|
||||
}
|
||||
|
||||
@@ -10,19 +10,63 @@ use crate::node::Node;
|
||||
use crate::scheduler::Scheduler;
|
||||
use crate::tenant_shard::TenantShard;
|
||||
|
||||
pub(crate) struct TenantShardIterator<F> {
|
||||
tenants_accessor: F,
|
||||
inspected_all_shards: bool,
|
||||
last_inspected_shard: Option<TenantShardId>,
|
||||
}
|
||||
|
||||
/// A simple iterator which can be used in tandem with [`crate::service::Service`]
|
||||
/// to iterate over all known tenant shard ids without holding the lock on the
|
||||
/// service state at all times.
|
||||
impl<F> TenantShardIterator<F>
|
||||
where
|
||||
F: Fn(Option<TenantShardId>) -> Option<TenantShardId>,
|
||||
{
|
||||
pub(crate) fn new(tenants_accessor: F) -> Self {
|
||||
Self {
|
||||
tenants_accessor,
|
||||
inspected_all_shards: false,
|
||||
last_inspected_shard: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the next tenant shard id if one exists
|
||||
pub(crate) fn next(&mut self) -> Option<TenantShardId> {
|
||||
if self.inspected_all_shards {
|
||||
return None;
|
||||
}
|
||||
|
||||
match (self.tenants_accessor)(self.last_inspected_shard) {
|
||||
Some(tid) => {
|
||||
self.last_inspected_shard = Some(tid);
|
||||
Some(tid)
|
||||
}
|
||||
None => {
|
||||
self.inspected_all_shards = true;
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true when the end of the iterator is reached and false otherwise
|
||||
pub(crate) fn finished(&self) -> bool {
|
||||
self.inspected_all_shards
|
||||
}
|
||||
}
|
||||
|
||||
/// Check that the state of the node being drained is as expected:
|
||||
/// node is present in memory and scheduling policy is set to expected_policy
|
||||
/// node is present in memory and scheduling policy is set to [`NodeSchedulingPolicy::Draining`]
|
||||
pub(crate) fn validate_node_state(
|
||||
node_id: &NodeId,
|
||||
nodes: Arc<HashMap<NodeId, Node>>,
|
||||
expected_policy: NodeSchedulingPolicy,
|
||||
) -> Result<(), OperationError> {
|
||||
let node = nodes.get(node_id).ok_or(OperationError::NodeStateChanged(
|
||||
format!("node {node_id} was removed").into(),
|
||||
))?;
|
||||
|
||||
let current_policy = node.get_scheduling();
|
||||
if current_policy != expected_policy {
|
||||
if !matches!(current_policy, NodeSchedulingPolicy::Draining) {
|
||||
// TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
|
||||
// about it
|
||||
return Err(OperationError::NodeStateChanged(
|
||||
@@ -138,3 +182,55 @@ impl TenantShardDrain {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use utils::id::TenantId;
|
||||
use utils::shard::{ShardCount, ShardNumber, TenantShardId};
|
||||
|
||||
use super::TenantShardIterator;
|
||||
|
||||
#[test]
|
||||
fn test_tenant_shard_iterator() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let shard_count = ShardCount(8);
|
||||
|
||||
let mut tenant_shards = Vec::default();
|
||||
for i in 0..shard_count.0 {
|
||||
tenant_shards.push((
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(i),
|
||||
shard_count,
|
||||
},
|
||||
(),
|
||||
))
|
||||
}
|
||||
|
||||
let tenant_shards = Arc::new(tenant_shards);
|
||||
|
||||
let mut tid_iter = TenantShardIterator::new({
|
||||
let tenants = tenant_shards.clone();
|
||||
move |last_inspected_shard: Option<TenantShardId>| {
|
||||
let entry = match last_inspected_shard {
|
||||
Some(skip_past) => {
|
||||
let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past);
|
||||
cursor.nth(1)
|
||||
}
|
||||
None => tenants.first(),
|
||||
};
|
||||
|
||||
entry.map(|(tid, _)| tid).copied()
|
||||
}
|
||||
});
|
||||
|
||||
let mut iterated_over = Vec::default();
|
||||
while let Some(tid) = tid_iter.next() {
|
||||
iterated_over.push((tid, ()));
|
||||
}
|
||||
|
||||
assert_eq!(iterated_over, *tenant_shards);
|
||||
}
|
||||
}
|
||||
@@ -919,7 +919,7 @@ async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError
|
||||
json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
|
||||
}
|
||||
|
||||
async fn handle_node_delete_old(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn handle_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let req = match maybe_forward(req).await {
|
||||
@@ -931,10 +931,7 @@ async fn handle_node_delete_old(req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
|
||||
let state = get_state(&req);
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state.service.node_delete_old(node_id).await?,
|
||||
)
|
||||
json_response(StatusCode::OK, state.service.node_delete(node_id).await?)
|
||||
}
|
||||
|
||||
async fn handle_tombstone_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -1054,42 +1051,6 @@ async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
json_response(StatusCode::OK, leader)
|
||||
}
|
||||
|
||||
async fn handle_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let req = match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
}
|
||||
ForwardOutcome::NotForwarded(req) => req,
|
||||
};
|
||||
|
||||
let state = get_state(&req);
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state.service.start_node_delete(node_id).await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_cancel_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Infra)?;
|
||||
|
||||
let req = match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
}
|
||||
ForwardOutcome::NotForwarded(req) => req,
|
||||
};
|
||||
|
||||
let state = get_state(&req);
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
json_response(
|
||||
StatusCode::ACCEPTED,
|
||||
state.service.cancel_node_delete(node_id).await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Infra)?;
|
||||
|
||||
@@ -2260,14 +2221,8 @@ pub fn make_router(
|
||||
.post("/control/v1/node", |r| {
|
||||
named_request_span(r, handle_node_register, RequestName("control_v1_node"))
|
||||
})
|
||||
// This endpoint is deprecated and will be removed in a future version.
|
||||
// Use PUT /control/v1/node/:node_id/delete instead.
|
||||
.delete("/control/v1/node/:node_id", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_node_delete_old,
|
||||
RequestName("control_v1_node_delete"),
|
||||
)
|
||||
named_request_span(r, handle_node_delete, RequestName("control_v1_node_delete"))
|
||||
})
|
||||
.get("/control/v1/node", |r| {
|
||||
named_request_span(r, handle_node_list, RequestName("control_v1_node"))
|
||||
@@ -2292,20 +2247,6 @@ pub fn make_router(
|
||||
.get("/control/v1/leader", |r| {
|
||||
named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader"))
|
||||
})
|
||||
.put("/control/v1/node/:node_id/delete", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_node_delete,
|
||||
RequestName("control_v1_start_node_delete"),
|
||||
)
|
||||
})
|
||||
.delete("/control/v1/node/:node_id/delete", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_cancel_node_delete,
|
||||
RequestName("control_v1_cancel_node_delete"),
|
||||
)
|
||||
})
|
||||
.put("/control/v1/node/:node_id/drain", |r| {
|
||||
named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain"))
|
||||
})
|
||||
@@ -2371,7 +2312,7 @@ pub fn make_router(
|
||||
named_request_span(
|
||||
r,
|
||||
handle_safekeeper_scheduling_policy,
|
||||
RequestName("v1_safekeeper_scheduling_policy"),
|
||||
RequestName("v1_safekeeper_status"),
|
||||
)
|
||||
})
|
||||
// Tenant Shard operations
|
||||
|
||||
@@ -6,13 +6,13 @@ extern crate hyper0 as hyper;
|
||||
mod auth;
|
||||
mod background_node_operations;
|
||||
mod compute_hook;
|
||||
mod drain_utils;
|
||||
mod heartbeater;
|
||||
pub mod http;
|
||||
mod id_lock_map;
|
||||
mod leadership;
|
||||
pub mod metrics;
|
||||
mod node;
|
||||
mod operation_utils;
|
||||
mod pageserver_client;
|
||||
mod peer_client;
|
||||
pub mod persistence;
|
||||
|
||||
@@ -201,7 +201,6 @@ impl Node {
|
||||
|
||||
match self.scheduling {
|
||||
NodeSchedulingPolicy::Active => MaySchedule::Yes(utilization),
|
||||
NodeSchedulingPolicy::Deleting => MaySchedule::No,
|
||||
NodeSchedulingPolicy::Draining => MaySchedule::No,
|
||||
NodeSchedulingPolicy::Filling => MaySchedule::Yes(utilization),
|
||||
NodeSchedulingPolicy::Pause => MaySchedule::No,
|
||||
|
||||
@@ -635,23 +635,18 @@ impl Persistence {
|
||||
let updated = self
|
||||
.with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
|
||||
Box::pin(async move {
|
||||
let node: Option<NodePersistence> = nodes
|
||||
.filter(node_id.eq(input_node_id.0 as i64))
|
||||
.first::<NodePersistence>(conn)
|
||||
.await
|
||||
.optional()?;
|
||||
|
||||
// Check if the node is not marked as deleted
|
||||
match node {
|
||||
Some(node) if matches!(NodeLifecycle::from_str(&node.lifecycle), Ok(NodeLifecycle::Deleted)) => {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"Node {input_node_id} is marked as deleted, re-attach is not allowed"
|
||||
)));
|
||||
}
|
||||
_ => {
|
||||
// go through
|
||||
}
|
||||
};
|
||||
let deleted_node: i64 = nodes
|
||||
.filter(node_id.eq(input_node_id.0 as i64))
|
||||
.filter(lifecycle.eq(String::from(NodeLifecycle::Deleted)))
|
||||
.count()
|
||||
.get_result(conn)
|
||||
.await?;
|
||||
if deleted_node > 0 {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"Node {input_node_id} is marked as deleted, re-attach is not allowed"
|
||||
)));
|
||||
}
|
||||
|
||||
let rows_updated = diesel::update(tenant_shards)
|
||||
.filter(generation_pageserver.eq(input_node_id.0 as i64))
|
||||
@@ -669,23 +664,21 @@ impl Persistence {
|
||||
.load(conn)
|
||||
.await?;
|
||||
|
||||
if let Some(node) = node {
|
||||
let old_scheduling_policy =
|
||||
NodeSchedulingPolicy::from_str(&node.scheduling_policy).unwrap();
|
||||
let new_scheduling_policy = match old_scheduling_policy {
|
||||
NodeSchedulingPolicy::Active => NodeSchedulingPolicy::Active,
|
||||
NodeSchedulingPolicy::PauseForRestart => NodeSchedulingPolicy::Active,
|
||||
NodeSchedulingPolicy::Draining => NodeSchedulingPolicy::Active,
|
||||
NodeSchedulingPolicy::Filling => NodeSchedulingPolicy::Active,
|
||||
NodeSchedulingPolicy::Pause => NodeSchedulingPolicy::Pause,
|
||||
NodeSchedulingPolicy::Deleting => NodeSchedulingPolicy::Pause,
|
||||
};
|
||||
diesel::update(nodes)
|
||||
.filter(node_id.eq(input_node_id.0 as i64))
|
||||
.set(scheduling_policy.eq(String::from(new_scheduling_policy)))
|
||||
.execute(conn)
|
||||
.await?;
|
||||
}
|
||||
// If the node went through a drain and restart phase before re-attaching,
|
||||
// then reset it's node scheduling policy to active.
|
||||
diesel::update(nodes)
|
||||
.filter(node_id.eq(input_node_id.0 as i64))
|
||||
.filter(
|
||||
scheduling_policy
|
||||
.eq(String::from(NodeSchedulingPolicy::PauseForRestart))
|
||||
.or(scheduling_policy
|
||||
.eq(String::from(NodeSchedulingPolicy::Draining)))
|
||||
.or(scheduling_policy
|
||||
.eq(String::from(NodeSchedulingPolicy::Filling))),
|
||||
)
|
||||
.set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active)))
|
||||
.execute(conn)
|
||||
.await?;
|
||||
|
||||
Ok(updated)
|
||||
})
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
pub mod chaos_injector;
|
||||
mod context_iterator;
|
||||
pub mod feature_flag;
|
||||
pub(crate) mod safekeeper_reconciler;
|
||||
mod safekeeper_service;
|
||||
mod tenant_shard_iterator;
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
@@ -16,6 +16,7 @@ use std::sync::{Arc, OnceLock};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
use anyhow::Context;
|
||||
use context_iterator::TenantShardContextIterator;
|
||||
use control_plane::storage_controller::{
|
||||
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
|
||||
};
|
||||
@@ -54,7 +55,6 @@ use pageserver_client::{BlockUnblock, mgmt_api};
|
||||
use reqwest::{Certificate, StatusCode};
|
||||
use safekeeper_api::models::SafekeeperUtilization;
|
||||
use safekeeper_reconciler::SafekeeperReconcilers;
|
||||
use tenant_shard_iterator::{TenantShardExclusiveIterator, create_shared_shard_iterator};
|
||||
use tokio::sync::TryAcquireError;
|
||||
use tokio::sync::mpsc::error::TrySendError;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -68,9 +68,10 @@ use utils::sync::gate::{Gate, GateGuard};
|
||||
use utils::{failpoint_support, pausable_failpoint};
|
||||
|
||||
use crate::background_node_operations::{
|
||||
Delete, Drain, Fill, MAX_RECONCILES_PER_OPERATION, Operation, OperationError, OperationHandler,
|
||||
Drain, Fill, MAX_RECONCILES_PER_OPERATION, Operation, OperationError, OperationHandler,
|
||||
};
|
||||
use crate::compute_hook::{self, ComputeHook, NotifyError};
|
||||
use crate::drain_utils::{self, TenantShardDrain, TenantShardIterator};
|
||||
use crate::heartbeater::{Heartbeater, PageserverState, SafekeeperState};
|
||||
use crate::id_lock_map::{
|
||||
IdLockMap, TracingExclusiveGuard, trace_exclusive_lock, trace_shared_lock,
|
||||
@@ -78,7 +79,6 @@ use crate::id_lock_map::{
|
||||
use crate::leadership::Leadership;
|
||||
use crate::metrics;
|
||||
use crate::node::{AvailabilityTransition, Node};
|
||||
use crate::operation_utils::{self, TenantShardDrain};
|
||||
use crate::pageserver_client::PageserverClient;
|
||||
use crate::peer_client::GlobalObservedState;
|
||||
use crate::persistence::split_state::SplitState;
|
||||
@@ -105,7 +105,7 @@ use crate::timeline_import::{
|
||||
TimelineImportFinalizeError, TimelineImportState, UpcallClient,
|
||||
};
|
||||
|
||||
const WAITER_OPERATION_POLL_TIMEOUT: Duration = Duration::from_millis(500);
|
||||
const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500);
|
||||
|
||||
// For operations that should be quick, like attaching a new tenant
|
||||
const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
@@ -581,9 +581,7 @@ impl From<ReconcileWaitError> for ApiError {
|
||||
impl From<OperationError> for ApiError {
|
||||
fn from(value: OperationError) -> Self {
|
||||
match value {
|
||||
OperationError::NodeStateChanged(err)
|
||||
| OperationError::FinalizeError(err)
|
||||
| OperationError::ImpossibleConstraint(err) => {
|
||||
OperationError::NodeStateChanged(err) | OperationError::FinalizeError(err) => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(err))
|
||||
}
|
||||
OperationError::Cancelled => ApiError::Conflict("Operation was cancelled".into()),
|
||||
@@ -2416,7 +2414,6 @@ impl Service {
|
||||
NodeSchedulingPolicy::PauseForRestart
|
||||
| NodeSchedulingPolicy::Draining
|
||||
| NodeSchedulingPolicy::Filling
|
||||
| NodeSchedulingPolicy::Deleting
|
||||
);
|
||||
|
||||
let mut new_nodes = (**nodes).clone();
|
||||
@@ -7058,7 +7055,7 @@ impl Service {
|
||||
/// If a node has any work on it, it will be rescheduled: this is "clean" in the sense
|
||||
/// that we don't leave any bad state behind in the storage controller, but unclean
|
||||
/// in the sense that we are not carefully draining the node.
|
||||
pub(crate) async fn node_delete_old(&self, node_id: NodeId) -> Result<(), ApiError> {
|
||||
pub(crate) async fn node_delete(&self, node_id: NodeId) -> Result<(), ApiError> {
|
||||
let _node_lock =
|
||||
trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Delete).await;
|
||||
|
||||
@@ -7092,7 +7089,7 @@ impl Service {
|
||||
}
|
||||
|
||||
for (_tenant_id, mut schedule_context, shards) in
|
||||
TenantShardExclusiveIterator::new(tenants, ScheduleMode::Normal)
|
||||
TenantShardContextIterator::new(tenants, ScheduleMode::Normal)
|
||||
{
|
||||
for shard in shards {
|
||||
if shard.deref_node(node_id) {
|
||||
@@ -7161,171 +7158,6 @@ impl Service {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn delete_node(
|
||||
self: &Arc<Self>,
|
||||
node_id: NodeId,
|
||||
policy_on_start: NodeSchedulingPolicy,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<(), OperationError> {
|
||||
let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal).build();
|
||||
|
||||
let mut waiters: Vec<ReconcilerWaiter> = Vec::new();
|
||||
let mut tid_iter = create_shared_shard_iterator(self.clone());
|
||||
|
||||
while !tid_iter.finished() {
|
||||
if cancel.is_cancelled() {
|
||||
match self
|
||||
.node_configure(node_id, None, Some(policy_on_start))
|
||||
.await
|
||||
{
|
||||
Ok(()) => return Err(OperationError::Cancelled),
|
||||
Err(err) => {
|
||||
return Err(OperationError::FinalizeError(
|
||||
format!(
|
||||
"Failed to finalise delete cancel of {} by setting scheduling policy to {}: {}",
|
||||
node_id, String::from(policy_on_start), err
|
||||
)
|
||||
.into(),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
operation_utils::validate_node_state(
|
||||
&node_id,
|
||||
self.inner.read().unwrap().nodes.clone(),
|
||||
NodeSchedulingPolicy::Deleting,
|
||||
)?;
|
||||
|
||||
while waiters.len() < MAX_RECONCILES_PER_OPERATION {
|
||||
let tid = match tid_iter.next() {
|
||||
Some(tid) => tid,
|
||||
None => {
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
let tenant_shard = match tenants.get_mut(&tid) {
|
||||
Some(tenant_shard) => tenant_shard,
|
||||
None => {
|
||||
// Tenant shard was deleted by another operation. Skip it.
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match tenant_shard.get_scheduling_policy() {
|
||||
ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {
|
||||
// A migration during delete is classed as 'essential' because it is required to
|
||||
// uphold our availability goals for the tenant: this shard is elegible for migration.
|
||||
}
|
||||
ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
|
||||
// If we have been asked to avoid rescheduling this shard, then do not migrate it during a deletion
|
||||
tracing::warn!(
|
||||
tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
|
||||
"Skip migration during deletion because shard scheduling policy {:?} disallows it",
|
||||
tenant_shard.get_scheduling_policy(),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if tenant_shard.deref_node(node_id) {
|
||||
// TODO(ephemeralsad): we should process all shards in a tenant at once, so
|
||||
// we can avoid settling the tenant unevenly.
|
||||
let mut schedule_context = ScheduleContext::new(ScheduleMode::Normal);
|
||||
if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) {
|
||||
tracing::error!(
|
||||
"Refusing to delete node, shard {} can't be rescheduled: {e}",
|
||||
tenant_shard.tenant_shard_id
|
||||
);
|
||||
return Err(OperationError::ImpossibleConstraint(e.to_string().into()));
|
||||
} else {
|
||||
tracing::info!(
|
||||
"Rescheduled shard {} away from node during deletion",
|
||||
tenant_shard.tenant_shard_id
|
||||
)
|
||||
}
|
||||
|
||||
let waiter = self.maybe_configured_reconcile_shard(
|
||||
tenant_shard,
|
||||
nodes,
|
||||
reconciler_config,
|
||||
);
|
||||
if let Some(some) = waiter {
|
||||
waiters.push(some);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
waiters = self
|
||||
.await_waiters_remainder(waiters, WAITER_OPERATION_POLL_TIMEOUT)
|
||||
.await;
|
||||
|
||||
failpoint_support::sleep_millis_async!("sleepy-delete-loop", &cancel);
|
||||
}
|
||||
|
||||
while !waiters.is_empty() {
|
||||
if cancel.is_cancelled() {
|
||||
match self
|
||||
.node_configure(node_id, None, Some(policy_on_start))
|
||||
.await
|
||||
{
|
||||
Ok(()) => return Err(OperationError::Cancelled),
|
||||
Err(err) => {
|
||||
return Err(OperationError::FinalizeError(
|
||||
format!(
|
||||
"Failed to finalise drain cancel of {} by setting scheduling policy to {}: {}",
|
||||
node_id, String::from(policy_on_start), err
|
||||
)
|
||||
.into(),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("Awaiting {} pending delete reconciliations", waiters.len());
|
||||
|
||||
waiters = self
|
||||
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
|
||||
.await;
|
||||
}
|
||||
|
||||
self.persistence
|
||||
.set_tombstone(node_id)
|
||||
.await
|
||||
.map_err(|e| OperationError::FinalizeError(e.to_string().into()))?;
|
||||
|
||||
{
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, _, scheduler) = locked.parts_mut();
|
||||
|
||||
scheduler.node_remove(node_id);
|
||||
|
||||
let mut nodes_mut = (**nodes).clone();
|
||||
if let Some(mut removed_node) = nodes_mut.remove(&node_id) {
|
||||
// Ensure that any reconciler holding an Arc<> to this node will
|
||||
// drop out when trying to RPC to it (setting Offline state sets the
|
||||
// cancellation token on the Node object).
|
||||
removed_node.set_availability(NodeAvailability::Offline);
|
||||
}
|
||||
*nodes = Arc::new(nodes_mut);
|
||||
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_pageserver_nodes
|
||||
.set(nodes.len() as i64);
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_https_pageserver_nodes
|
||||
.set(nodes.values().filter(|n| n.has_https_port()).count() as i64);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn node_list(&self) -> Result<Vec<Node>, ApiError> {
|
||||
let nodes = {
|
||||
self.inner
|
||||
@@ -7714,7 +7546,7 @@ impl Service {
|
||||
let mut tenants_affected: usize = 0;
|
||||
|
||||
for (_tenant_id, mut schedule_context, shards) in
|
||||
TenantShardExclusiveIterator::new(tenants, ScheduleMode::Normal)
|
||||
TenantShardContextIterator::new(tenants, ScheduleMode::Normal)
|
||||
{
|
||||
for tenant_shard in shards {
|
||||
let tenant_shard_id = tenant_shard.tenant_shard_id;
|
||||
@@ -7885,142 +7717,6 @@ impl Service {
|
||||
self.node_configure(node_id, availability, scheduling).await
|
||||
}
|
||||
|
||||
pub(crate) async fn start_node_delete(
|
||||
self: &Arc<Self>,
|
||||
node_id: NodeId,
|
||||
) -> Result<(), ApiError> {
|
||||
let (ongoing_op, node_policy, schedulable_nodes_count) = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let nodes = &locked.nodes;
|
||||
let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
|
||||
anyhow::anyhow!("Node {} not registered", node_id).into(),
|
||||
))?;
|
||||
let schedulable_nodes_count = nodes
|
||||
.iter()
|
||||
.filter(|(_, n)| matches!(n.may_schedule(), MaySchedule::Yes(_)))
|
||||
.count();
|
||||
|
||||
(
|
||||
locked
|
||||
.ongoing_operation
|
||||
.as_ref()
|
||||
.map(|ongoing| ongoing.operation),
|
||||
node.get_scheduling(),
|
||||
schedulable_nodes_count,
|
||||
)
|
||||
};
|
||||
|
||||
if let Some(ongoing) = ongoing_op {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
format!("Background operation already ongoing for node: {ongoing}").into(),
|
||||
));
|
||||
}
|
||||
|
||||
if schedulable_nodes_count == 0 {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
"No other schedulable nodes to move shards".into(),
|
||||
));
|
||||
}
|
||||
|
||||
match node_policy {
|
||||
NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => {
|
||||
self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Deleting))
|
||||
.await?;
|
||||
|
||||
let cancel = self.cancel.child_token();
|
||||
let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
|
||||
let policy_on_start = node_policy;
|
||||
|
||||
self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
|
||||
operation: Operation::Delete(Delete { node_id }),
|
||||
cancel: cancel.clone(),
|
||||
});
|
||||
|
||||
let span = tracing::info_span!(parent: None, "delete_node", %node_id);
|
||||
|
||||
tokio::task::spawn(
|
||||
{
|
||||
let service = self.clone();
|
||||
let cancel = cancel.clone();
|
||||
async move {
|
||||
let _gate_guard = gate_guard;
|
||||
|
||||
scopeguard::defer! {
|
||||
let prev = service.inner.write().unwrap().ongoing_operation.take();
|
||||
|
||||
if let Some(Operation::Delete(removed_delete)) = prev.map(|h| h.operation) {
|
||||
assert_eq!(removed_delete.node_id, node_id, "We always take the same operation");
|
||||
} else {
|
||||
panic!("We always remove the same operation")
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("Delete background operation starting");
|
||||
let res = service
|
||||
.delete_node(node_id, policy_on_start, cancel)
|
||||
.await;
|
||||
match res {
|
||||
Ok(()) => {
|
||||
tracing::info!(
|
||||
"Delete background operation completed successfully"
|
||||
);
|
||||
}
|
||||
Err(OperationError::Cancelled) => {
|
||||
tracing::info!("Delete background operation was cancelled");
|
||||
}
|
||||
Err(err) => {
|
||||
tracing::error!(
|
||||
"Delete background operation encountered: {err}"
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
.instrument(span),
|
||||
);
|
||||
}
|
||||
NodeSchedulingPolicy::Deleting => {
|
||||
return Err(ApiError::Conflict(format!(
|
||||
"Node {node_id} has delete in progress"
|
||||
)));
|
||||
}
|
||||
policy => {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
format!("Node {node_id} cannot be deleted due to {policy:?} policy").into(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn cancel_node_delete(
|
||||
self: &Arc<Self>,
|
||||
node_id: NodeId,
|
||||
) -> Result<(), ApiError> {
|
||||
{
|
||||
let locked = self.inner.read().unwrap();
|
||||
let nodes = &locked.nodes;
|
||||
nodes.get(&node_id).ok_or(ApiError::NotFound(
|
||||
anyhow::anyhow!("Node {} not registered", node_id).into(),
|
||||
))?;
|
||||
}
|
||||
|
||||
if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
|
||||
if let Operation::Delete(delete) = op_handler.operation {
|
||||
if delete.node_id == node_id {
|
||||
tracing::info!("Cancelling background delete operation for node {node_id}");
|
||||
op_handler.cancel.cancel();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(ApiError::PreconditionFailed(
|
||||
format!("Node {node_id} has no delete in progress").into(),
|
||||
))
|
||||
}
|
||||
|
||||
pub(crate) async fn start_node_drain(
|
||||
self: &Arc<Self>,
|
||||
node_id: NodeId,
|
||||
@@ -8597,7 +8293,7 @@ impl Service {
|
||||
// to ignore the utilisation component of the score.
|
||||
|
||||
for (_tenant_id, schedule_context, shards) in
|
||||
TenantShardExclusiveIterator::new(tenants, ScheduleMode::Speculative)
|
||||
TenantShardContextIterator::new(tenants, ScheduleMode::Speculative)
|
||||
{
|
||||
for shard in shards {
|
||||
if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS {
|
||||
@@ -9324,7 +9020,25 @@ impl Service {
|
||||
|
||||
let mut waiters = Vec::new();
|
||||
|
||||
let mut tid_iter = create_shared_shard_iterator(self.clone());
|
||||
let mut tid_iter = TenantShardIterator::new({
|
||||
let service = self.clone();
|
||||
move |last_inspected_shard: Option<TenantShardId>| {
|
||||
let locked = &service.inner.read().unwrap();
|
||||
let tenants = &locked.tenants;
|
||||
let entry = match last_inspected_shard {
|
||||
Some(skip_past) => {
|
||||
// Skip to the last seen tenant shard id
|
||||
let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past);
|
||||
|
||||
// Skip past the last seen
|
||||
cursor.nth(1)
|
||||
}
|
||||
None => tenants.first_key_value(),
|
||||
};
|
||||
|
||||
entry.map(|(tid, _)| tid).copied()
|
||||
}
|
||||
});
|
||||
|
||||
while !tid_iter.finished() {
|
||||
if cancel.is_cancelled() {
|
||||
@@ -9344,11 +9058,7 @@ impl Service {
|
||||
}
|
||||
}
|
||||
|
||||
operation_utils::validate_node_state(
|
||||
&node_id,
|
||||
self.inner.read().unwrap().nodes.clone(),
|
||||
NodeSchedulingPolicy::Draining,
|
||||
)?;
|
||||
drain_utils::validate_node_state(&node_id, self.inner.read().unwrap().nodes.clone())?;
|
||||
|
||||
while waiters.len() < MAX_RECONCILES_PER_OPERATION {
|
||||
let tid = match tid_iter.next() {
|
||||
@@ -9428,7 +9138,7 @@ impl Service {
|
||||
}
|
||||
|
||||
waiters = self
|
||||
.await_waiters_remainder(waiters, WAITER_OPERATION_POLL_TIMEOUT)
|
||||
.await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
|
||||
.await;
|
||||
|
||||
failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel);
|
||||
@@ -9722,7 +9432,7 @@ impl Service {
|
||||
}
|
||||
|
||||
waiters = self
|
||||
.await_waiters_remainder(waiters, WAITER_OPERATION_POLL_TIMEOUT)
|
||||
.await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
|
||||
.await;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use utils::id::TenantId;
|
||||
use utils::shard::TenantShardId;
|
||||
@@ -7,21 +6,16 @@ use utils::shard::TenantShardId;
|
||||
use crate::scheduler::{ScheduleContext, ScheduleMode};
|
||||
use crate::tenant_shard::TenantShard;
|
||||
|
||||
use super::Service;
|
||||
|
||||
/// Exclusive iterator over all tenant shards.
|
||||
/// It is used to iterate over consistent tenants state at specific point in time.
|
||||
///
|
||||
/// When making scheduling decisions, it is useful to have the ScheduleContext for a whole
|
||||
/// tenant while considering the individual shards within it. This iterator is a helper
|
||||
/// that gathers all the shards in a tenant and then yields them together with a ScheduleContext
|
||||
/// for the tenant.
|
||||
pub(super) struct TenantShardExclusiveIterator<'a> {
|
||||
pub(super) struct TenantShardContextIterator<'a> {
|
||||
schedule_mode: ScheduleMode,
|
||||
inner: std::collections::btree_map::IterMut<'a, TenantShardId, TenantShard>,
|
||||
}
|
||||
|
||||
impl<'a> TenantShardExclusiveIterator<'a> {
|
||||
impl<'a> TenantShardContextIterator<'a> {
|
||||
pub(super) fn new(
|
||||
tenants: &'a mut BTreeMap<TenantShardId, TenantShard>,
|
||||
schedule_mode: ScheduleMode,
|
||||
@@ -33,7 +27,7 @@ impl<'a> TenantShardExclusiveIterator<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for TenantShardExclusiveIterator<'a> {
|
||||
impl<'a> Iterator for TenantShardContextIterator<'a> {
|
||||
type Item = (TenantId, ScheduleContext, Vec<&'a mut TenantShard>);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
@@ -58,93 +52,13 @@ impl<'a> Iterator for TenantShardExclusiveIterator<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Shared iterator over all tenant shards.
|
||||
/// It is used to iterate over all tenants without blocking another code, working with tenants
|
||||
///
|
||||
/// A simple iterator which can be used in tandem with [`crate::service::Service`]
|
||||
/// to iterate over all known tenant shard ids without holding the lock on the
|
||||
/// service state at all times.
|
||||
pub(crate) struct TenantShardSharedIterator<F> {
|
||||
tenants_accessor: F,
|
||||
inspected_all_shards: bool,
|
||||
last_inspected_shard: Option<TenantShardId>,
|
||||
}
|
||||
|
||||
impl<F> TenantShardSharedIterator<F>
|
||||
where
|
||||
F: Fn(Option<TenantShardId>) -> Option<TenantShardId>,
|
||||
{
|
||||
pub(crate) fn new(tenants_accessor: F) -> Self {
|
||||
Self {
|
||||
tenants_accessor,
|
||||
inspected_all_shards: false,
|
||||
last_inspected_shard: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn finished(&self) -> bool {
|
||||
self.inspected_all_shards
|
||||
}
|
||||
}
|
||||
|
||||
impl<F> Iterator for TenantShardSharedIterator<F>
|
||||
where
|
||||
F: Fn(Option<TenantShardId>) -> Option<TenantShardId>,
|
||||
{
|
||||
// TODO(ephemeralsad): consider adding schedule context to the iterator
|
||||
type Item = TenantShardId;
|
||||
|
||||
/// Returns the next tenant shard id if one exists
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.inspected_all_shards {
|
||||
return None;
|
||||
}
|
||||
|
||||
match (self.tenants_accessor)(self.last_inspected_shard) {
|
||||
Some(tid) => {
|
||||
self.last_inspected_shard = Some(tid);
|
||||
Some(tid)
|
||||
}
|
||||
None => {
|
||||
self.inspected_all_shards = true;
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn create_shared_shard_iterator(
|
||||
service: Arc<Service>,
|
||||
) -> TenantShardSharedIterator<impl Fn(Option<TenantShardId>) -> Option<TenantShardId>> {
|
||||
let tenants_accessor = move |last_inspected_shard: Option<TenantShardId>| {
|
||||
let locked = &service.inner.read().unwrap();
|
||||
let tenants = &locked.tenants;
|
||||
let entry = match last_inspected_shard {
|
||||
Some(skip_past) => {
|
||||
// Skip to the last seen tenant shard id
|
||||
let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past);
|
||||
|
||||
// Skip past the last seen
|
||||
cursor.nth(1)
|
||||
}
|
||||
None => tenants.first_key_value(),
|
||||
};
|
||||
|
||||
entry.map(|(tid, _)| tid).copied()
|
||||
};
|
||||
|
||||
TenantShardSharedIterator::new(tenants_accessor)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::BTreeMap;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use pageserver_api::controller_api::PlacementPolicy;
|
||||
use utils::id::TenantId;
|
||||
use utils::shard::{ShardCount, ShardNumber, TenantShardId};
|
||||
use utils::shard::{ShardCount, ShardNumber};
|
||||
|
||||
use super::*;
|
||||
use crate::scheduler::test_utils::make_test_nodes;
|
||||
@@ -152,7 +66,7 @@ mod tests {
|
||||
use crate::tenant_shard::tests::make_test_tenant_with_id;
|
||||
|
||||
#[test]
|
||||
fn test_exclusive_shard_iterator() {
|
||||
fn test_context_iterator() {
|
||||
// Hand-crafted tenant IDs to ensure they appear in the expected order when put into
|
||||
// a btreemap & iterated
|
||||
let mut t_1_shards = make_test_tenant_with_id(
|
||||
@@ -192,7 +106,7 @@ mod tests {
|
||||
shard.schedule(&mut scheduler, &mut context).unwrap();
|
||||
}
|
||||
|
||||
let mut iter = TenantShardExclusiveIterator::new(&mut tenants, ScheduleMode::Speculative);
|
||||
let mut iter = TenantShardContextIterator::new(&mut tenants, ScheduleMode::Speculative);
|
||||
let (tenant_id, context, shards) = iter.next().unwrap();
|
||||
assert_eq!(tenant_id, t1_id);
|
||||
assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
|
||||
@@ -218,46 +132,4 @@ mod tests {
|
||||
shard.intent.clear(&mut scheduler);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_shared_shard_iterator() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let shard_count = ShardCount(8);
|
||||
|
||||
let mut tenant_shards = Vec::default();
|
||||
for i in 0..shard_count.0 {
|
||||
tenant_shards.push((
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(i),
|
||||
shard_count,
|
||||
},
|
||||
(),
|
||||
))
|
||||
}
|
||||
|
||||
let tenant_shards = Arc::new(tenant_shards);
|
||||
|
||||
let tid_iter = TenantShardSharedIterator::new({
|
||||
let tenants = tenant_shards.clone();
|
||||
move |last_inspected_shard: Option<TenantShardId>| {
|
||||
let entry = match last_inspected_shard {
|
||||
Some(skip_past) => {
|
||||
let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past);
|
||||
cursor.nth(1)
|
||||
}
|
||||
None => tenants.first(),
|
||||
};
|
||||
|
||||
entry.map(|(tid, _)| tid).copied()
|
||||
}
|
||||
});
|
||||
|
||||
let mut iterated_over = Vec::default();
|
||||
for tid in tid_iter {
|
||||
iterated_over.push((tid, ()));
|
||||
}
|
||||
|
||||
assert_eq!(iterated_over, *tenant_shards);
|
||||
}
|
||||
}
|
||||
@@ -914,13 +914,13 @@ impl Service {
|
||||
// so it isn't counted toward the quorum.
|
||||
if let Some(min_position) = min_position {
|
||||
if let Ok(ok_res) = &res {
|
||||
if (ok_res.last_log_term, ok_res.flush_lsn) < min_position {
|
||||
if (ok_res.term, ok_res.flush_lsn) < min_position {
|
||||
// Use Error::Timeout to make this error retriable.
|
||||
res = Err(mgmt_api::Error::Timeout(
|
||||
format!(
|
||||
"safekeeper {} returned position {:?} which is less than minimum required position {:?}",
|
||||
client.node_id_label(),
|
||||
(ok_res.last_log_term, ok_res.flush_lsn),
|
||||
(ok_res.term, ok_res.flush_lsn),
|
||||
min_position
|
||||
)
|
||||
));
|
||||
@@ -1216,7 +1216,7 @@ impl Service {
|
||||
|
||||
let mut sync_position = (INITIAL_TERM, Lsn::INVALID);
|
||||
for res in results.into_iter().flatten() {
|
||||
let sk_position = (res.last_log_term, res.flush_lsn);
|
||||
let sk_position = (res.term, res.flush_lsn);
|
||||
if sync_position < sk_position {
|
||||
sync_position = sk_position;
|
||||
}
|
||||
|
||||
@@ -57,8 +57,6 @@ class EndpointHttpClient(requests.Session):
|
||||
self.auth = BearerAuth(jwt)
|
||||
|
||||
self.mount("http://", HTTPAdapter())
|
||||
self.prewarm_url = f"http://localhost:{external_port}/lfc/prewarm"
|
||||
self.offload_url = f"http://localhost:{external_port}/lfc/offload"
|
||||
|
||||
def dbs_and_roles(self):
|
||||
res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles", auth=self.auth)
|
||||
@@ -66,39 +64,33 @@ class EndpointHttpClient(requests.Session):
|
||||
return res.json()
|
||||
|
||||
def prewarm_lfc_status(self) -> dict[str, str]:
|
||||
res = self.get(self.prewarm_url)
|
||||
res = self.get(f"http://localhost:{self.external_port}/lfc/prewarm")
|
||||
res.raise_for_status()
|
||||
json: dict[str, str] = res.json()
|
||||
return json
|
||||
|
||||
def prewarm_lfc(self, from_endpoint_id: str | None = None):
|
||||
url: str = f"http://localhost:{self.external_port}/lfc/prewarm"
|
||||
params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict()
|
||||
self.post(self.prewarm_url, params=params).raise_for_status()
|
||||
self.prewarm_lfc_wait()
|
||||
self.post(url, params=params).raise_for_status()
|
||||
|
||||
def prewarm_lfc_wait(self):
|
||||
def prewarmed():
|
||||
json = self.prewarm_lfc_status()
|
||||
status, err = json["status"], json.get("error")
|
||||
assert status == "completed", f"{status}, {err=}"
|
||||
assert status == "completed", f"{status}, error {err}"
|
||||
|
||||
wait_until(prewarmed, timeout=60)
|
||||
|
||||
def offload_lfc_status(self) -> dict[str, str]:
|
||||
res = self.get(self.offload_url)
|
||||
res.raise_for_status()
|
||||
json: dict[str, str] = res.json()
|
||||
return json
|
||||
|
||||
def offload_lfc(self):
|
||||
self.post(self.offload_url).raise_for_status()
|
||||
self.offload_lfc_wait()
|
||||
url = f"http://localhost:{self.external_port}/lfc/offload"
|
||||
self.post(url).raise_for_status()
|
||||
|
||||
def offload_lfc_wait(self):
|
||||
def offloaded():
|
||||
json = self.offload_lfc_status()
|
||||
res = self.get(url)
|
||||
res.raise_for_status()
|
||||
json = res.json()
|
||||
status, err = json["status"], json.get("error")
|
||||
assert status == "completed", f"{status}, {err=}"
|
||||
assert status == "completed", f"{status}, error {err}"
|
||||
|
||||
wait_until(offloaded)
|
||||
|
||||
|
||||
@@ -568,8 +568,6 @@ class NeonLocalCli(AbstractNeonCli):
|
||||
timeout: str | None = None,
|
||||
env: dict[str, str] | None = None,
|
||||
dev: bool = False,
|
||||
autoprewarm: bool = False,
|
||||
offload_lfc_interval_seconds: int | None = None,
|
||||
) -> subprocess.CompletedProcess[str]:
|
||||
args = [
|
||||
"endpoint",
|
||||
@@ -595,10 +593,6 @@ class NeonLocalCli(AbstractNeonCli):
|
||||
args.extend(["--create-test-user"])
|
||||
if timeout is not None:
|
||||
args.extend(["--start-timeout", str(timeout)])
|
||||
if autoprewarm:
|
||||
args.extend(["--autoprewarm"])
|
||||
if offload_lfc_interval_seconds is not None:
|
||||
args.extend(["--offload-lfc-interval-seconds", str(offload_lfc_interval_seconds)])
|
||||
if dev:
|
||||
args.extend(["--dev"])
|
||||
|
||||
|
||||
@@ -1867,7 +1867,6 @@ class PageserverSchedulingPolicy(StrEnum):
|
||||
FILLING = "Filling"
|
||||
PAUSE = "Pause"
|
||||
PAUSE_FOR_RESTART = "PauseForRestart"
|
||||
DELETING = "Deleting"
|
||||
|
||||
|
||||
class StorageControllerLeadershipStatus(StrEnum):
|
||||
@@ -2076,27 +2075,11 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
|
||||
def node_delete_old(self, node_id):
|
||||
log.info(f"node_delete_old({node_id})")
|
||||
self.request(
|
||||
"DELETE",
|
||||
f"{self.api}/control/v1/node/{node_id}",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
|
||||
def node_delete(self, node_id):
|
||||
log.info(f"node_delete({node_id})")
|
||||
self.request(
|
||||
"PUT",
|
||||
f"{self.api}/control/v1/node/{node_id}/delete",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
|
||||
def cancel_node_delete(self, node_id):
|
||||
log.info(f"cancel_node_delete({node_id})")
|
||||
self.request(
|
||||
"DELETE",
|
||||
f"{self.api}/control/v1/node/{node_id}/delete",
|
||||
f"{self.api}/control/v1/node/{node_id}",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
|
||||
@@ -4362,8 +4345,6 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
basebackup_request_tries: int | None = None,
|
||||
timeout: str | None = None,
|
||||
env: dict[str, str] | None = None,
|
||||
autoprewarm: bool = False,
|
||||
offload_lfc_interval_seconds: int | None = None,
|
||||
) -> Self:
|
||||
"""
|
||||
Start the Postgres instance.
|
||||
@@ -4388,8 +4369,6 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
basebackup_request_tries=basebackup_request_tries,
|
||||
timeout=timeout,
|
||||
env=env,
|
||||
autoprewarm=autoprewarm,
|
||||
offload_lfc_interval_seconds=offload_lfc_interval_seconds,
|
||||
)
|
||||
self._running.release(1)
|
||||
self.log_config_value("shared_buffers")
|
||||
@@ -4605,8 +4584,6 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
pageserver_id: int | None = None,
|
||||
allow_multiple: bool = False,
|
||||
basebackup_request_tries: int | None = None,
|
||||
autoprewarm: bool = False,
|
||||
offload_lfc_interval_seconds: int | None = None,
|
||||
) -> Self:
|
||||
"""
|
||||
Create an endpoint, apply config, and start Postgres.
|
||||
@@ -4627,8 +4604,6 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
pageserver_id=pageserver_id,
|
||||
allow_multiple=allow_multiple,
|
||||
basebackup_request_tries=basebackup_request_tries,
|
||||
autoprewarm=autoprewarm,
|
||||
offload_lfc_interval_seconds=offload_lfc_interval_seconds,
|
||||
)
|
||||
|
||||
return self
|
||||
@@ -4713,8 +4688,6 @@ class EndpointFactory:
|
||||
remote_ext_base_url: str | None = None,
|
||||
pageserver_id: int | None = None,
|
||||
basebackup_request_tries: int | None = None,
|
||||
autoprewarm: bool = False,
|
||||
offload_lfc_interval_seconds: int | None = None,
|
||||
) -> Endpoint:
|
||||
ep = Endpoint(
|
||||
self.env,
|
||||
@@ -4736,8 +4709,6 @@ class EndpointFactory:
|
||||
remote_ext_base_url=remote_ext_base_url,
|
||||
pageserver_id=pageserver_id,
|
||||
basebackup_request_tries=basebackup_request_tries,
|
||||
autoprewarm=autoprewarm,
|
||||
offload_lfc_interval_seconds=offload_lfc_interval_seconds,
|
||||
)
|
||||
|
||||
def create(
|
||||
|
||||
@@ -111,7 +111,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
|
||||
".*stalling layer flushes for compaction backpressure.*",
|
||||
".*layer roll waiting for flush due to compaction backpressure.*",
|
||||
".*BatchSpanProcessor.*",
|
||||
".*No broker updates received for a while.*",
|
||||
*(
|
||||
[
|
||||
r".*your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs.*"
|
||||
|
||||
@@ -112,18 +112,12 @@ class TimelineCreateRequest:
|
||||
class TimelineMembershipSwitchResponse:
|
||||
previous_conf: MembershipConfiguration
|
||||
current_conf: MembershipConfiguration
|
||||
last_log_term: int
|
||||
flush_lsn: Lsn
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse:
|
||||
previous_conf = MembershipConfiguration.from_json(d["previous_conf"])
|
||||
current_conf = MembershipConfiguration.from_json(d["current_conf"])
|
||||
last_log_term = d["last_log_term"]
|
||||
flush_lsn = Lsn(d["flush_lsn"])
|
||||
return TimelineMembershipSwitchResponse(
|
||||
previous_conf, current_conf, last_log_term, flush_lsn
|
||||
)
|
||||
return TimelineMembershipSwitchResponse(previous_conf, current_conf)
|
||||
|
||||
|
||||
class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
|
||||
@@ -55,10 +55,9 @@ def test_pageserver_characterize_throughput_with_n_tenants(
|
||||
@pytest.mark.parametrize("duration", [20 * 60])
|
||||
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)])
|
||||
# we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability
|
||||
# we use 8 clients because we see a latency knee around 6-8 clients on im4gn.2xlarge instance type,
|
||||
# which we use for this periodic test - at a cpu utilization of around 70 % - which is considered
|
||||
# a good utilization for pageserver.
|
||||
@pytest.mark.parametrize("n_clients", [1, 8])
|
||||
# we use 64 clients because typically for a high number of connections we recommend the connection pooler
|
||||
# which by default uses 64 connections
|
||||
@pytest.mark.parametrize("n_clients", [1, 64])
|
||||
@pytest.mark.parametrize("n_tenants", [1])
|
||||
@pytest.mark.timeout(2400)
|
||||
def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant(
|
||||
|
||||
@@ -416,8 +416,6 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
|
||||
# timeline creation (uploads). mask it out here to avoid flakyness.
|
||||
del success_result["remote_consistent_lsn_visible"]
|
||||
del repeat_result["remote_consistent_lsn_visible"]
|
||||
del success_result["walreceiver_status"]
|
||||
del repeat_result["walreceiver_status"]
|
||||
assert repeat_result == success_result
|
||||
finally:
|
||||
env.pageserver.stop(immediate=True)
|
||||
|
||||
@@ -1,38 +1,34 @@
|
||||
import random
|
||||
import threading
|
||||
from enum import StrEnum
|
||||
from time import sleep
|
||||
from typing import Any
|
||||
import time
|
||||
from enum import Enum
|
||||
|
||||
import pytest
|
||||
from fixtures.endpoint.http import EndpointHttpClient
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
from fixtures.utils import USE_LFC, wait_until
|
||||
from fixtures.utils import USE_LFC
|
||||
from prometheus_client.parser import text_string_to_metric_families as prom_parse_impl
|
||||
from psycopg2.extensions import cursor as Cursor
|
||||
|
||||
|
||||
class PrewarmMethod(StrEnum):
|
||||
POSTGRES = "postgres"
|
||||
COMPUTE_CTL = "compute-ctl"
|
||||
AUTOPREWARM = "autoprewarm"
|
||||
class LfcQueryMethod(Enum):
|
||||
COMPUTE_CTL = False
|
||||
POSTGRES = True
|
||||
|
||||
|
||||
PREWARM_LABEL = "compute_ctl_lfc_prewarms_total"
|
||||
OFFLOAD_LABEL = "compute_ctl_lfc_offloads_total"
|
||||
METHOD_VALUES = [e for e in PrewarmMethod]
|
||||
METHOD_IDS = [e.value for e in PrewarmMethod]
|
||||
PREWARM_LABEL = "compute_ctl_lfc_prewarm_requests_total"
|
||||
OFFLOAD_LABEL = "compute_ctl_lfc_offload_requests_total"
|
||||
QUERY_OPTIONS = LfcQueryMethod.POSTGRES, LfcQueryMethod.COMPUTE_CTL
|
||||
|
||||
|
||||
def check_pinned_entries(cur: Cursor):
|
||||
def check_pinned_entries(cur):
|
||||
# some LFC buffer can be temporary locked by autovacuum or background writer
|
||||
for _ in range(10):
|
||||
cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'")
|
||||
n_pinned = cur.fetchall()[0][0]
|
||||
if n_pinned == 0:
|
||||
break
|
||||
sleep(1)
|
||||
time.sleep(1)
|
||||
assert n_pinned == 0
|
||||
|
||||
|
||||
@@ -45,68 +41,21 @@ def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
|
||||
}
|
||||
|
||||
|
||||
def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any:
|
||||
if method == PrewarmMethod.AUTOPREWARM:
|
||||
client.offload_lfc_wait()
|
||||
elif method == PrewarmMethod.COMPUTE_CTL:
|
||||
status = client.prewarm_lfc_status()
|
||||
assert status["status"] == "not_prewarmed"
|
||||
assert "error" not in status
|
||||
client.offload_lfc()
|
||||
assert client.prewarm_lfc_status()["status"] == "not_prewarmed"
|
||||
assert prom_parse(client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0}
|
||||
elif method == PrewarmMethod.POSTGRES:
|
||||
cur.execute("select get_local_cache_state()")
|
||||
return cur.fetchall()[0][0]
|
||||
else:
|
||||
raise AssertionError(f"{method} not in PrewarmMethod")
|
||||
|
||||
|
||||
def prewarm_endpoint(
|
||||
method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor, lfc_state: str | None
|
||||
):
|
||||
if method == PrewarmMethod.AUTOPREWARM:
|
||||
client.prewarm_lfc_wait()
|
||||
elif method == PrewarmMethod.COMPUTE_CTL:
|
||||
client.prewarm_lfc()
|
||||
elif method == PrewarmMethod.POSTGRES:
|
||||
cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
|
||||
|
||||
|
||||
def check_prewarmed(
|
||||
method: PrewarmMethod, client: EndpointHttpClient, desired_status: dict[str, str | int]
|
||||
):
|
||||
if method == PrewarmMethod.AUTOPREWARM:
|
||||
assert client.prewarm_lfc_status() == desired_status
|
||||
assert prom_parse(client)[PREWARM_LABEL] == 1
|
||||
elif method == PrewarmMethod.COMPUTE_CTL:
|
||||
assert client.prewarm_lfc_status() == desired_status
|
||||
assert prom_parse(client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1}
|
||||
|
||||
|
||||
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
|
||||
@pytest.mark.parametrize("method", METHOD_VALUES, ids=METHOD_IDS)
|
||||
def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
|
||||
@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"])
|
||||
def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
|
||||
env = neon_simple_env
|
||||
n_records = 1000000
|
||||
cfg = [
|
||||
"autovacuum = off",
|
||||
"shared_buffers=1MB",
|
||||
"neon.max_file_cache_size=1GB",
|
||||
"neon.file_cache_size_limit=1GB",
|
||||
"neon.file_cache_prewarm_limit=1000",
|
||||
]
|
||||
offload_secs = 2
|
||||
|
||||
if method == PrewarmMethod.AUTOPREWARM:
|
||||
endpoint = env.endpoints.create_start(
|
||||
branch_name="main",
|
||||
config_lines=cfg,
|
||||
autoprewarm=True,
|
||||
offload_lfc_interval_seconds=offload_secs,
|
||||
)
|
||||
else:
|
||||
endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)
|
||||
endpoint = env.endpoints.create_start(
|
||||
branch_name="main",
|
||||
config_lines=[
|
||||
"autovacuum = off",
|
||||
"shared_buffers=1MB",
|
||||
"neon.max_file_cache_size=1GB",
|
||||
"neon.file_cache_size_limit=1GB",
|
||||
"neon.file_cache_prewarm_limit=1000",
|
||||
],
|
||||
)
|
||||
|
||||
pg_conn = endpoint.connect()
|
||||
pg_cur = pg_conn.cursor()
|
||||
@@ -120,21 +69,31 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
|
||||
lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))")
|
||||
log.info(f"Inserted {n_records} rows")
|
||||
|
||||
client = endpoint.http_client()
|
||||
lfc_state = offload_lfc(method, client, pg_cur)
|
||||
http_client = endpoint.http_client()
|
||||
if query is LfcQueryMethod.COMPUTE_CTL:
|
||||
status = http_client.prewarm_lfc_status()
|
||||
assert status["status"] == "not_prewarmed"
|
||||
assert "error" not in status
|
||||
http_client.offload_lfc()
|
||||
assert http_client.prewarm_lfc_status()["status"] == "not_prewarmed"
|
||||
assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0}
|
||||
else:
|
||||
pg_cur.execute("select get_local_cache_state()")
|
||||
lfc_state = pg_cur.fetchall()[0][0]
|
||||
|
||||
endpoint.stop()
|
||||
if method == PrewarmMethod.AUTOPREWARM:
|
||||
endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=offload_secs)
|
||||
else:
|
||||
endpoint.start()
|
||||
endpoint.start()
|
||||
|
||||
pg_conn = endpoint.connect()
|
||||
pg_cur = pg_conn.cursor()
|
||||
|
||||
lfc_conn = endpoint.connect(dbname="lfc")
|
||||
lfc_cur = lfc_conn.cursor()
|
||||
prewarm_endpoint(method, client, pg_cur, lfc_state)
|
||||
|
||||
if query is LfcQueryMethod.COMPUTE_CTL:
|
||||
http_client.prewarm_lfc()
|
||||
else:
|
||||
pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
|
||||
|
||||
pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'")
|
||||
lfc_used_pages = pg_cur.fetchall()[0][0]
|
||||
@@ -152,32 +111,33 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
|
||||
and prewarm_info[1] > 0
|
||||
and prewarm_info[0] == prewarm_info[1] + prewarm_info[2]
|
||||
)
|
||||
|
||||
lfc_cur.execute("select sum(pk) from t")
|
||||
assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2
|
||||
|
||||
check_pinned_entries(pg_cur)
|
||||
|
||||
desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped}
|
||||
check_prewarmed(method, client, desired)
|
||||
|
||||
|
||||
# autoprewarm isn't needed as we prewarm manually
|
||||
WORKLOAD_VALUES = METHOD_VALUES[:-1]
|
||||
WORKLOAD_IDS = METHOD_IDS[:-1]
|
||||
if query is LfcQueryMethod.COMPUTE_CTL:
|
||||
assert http_client.prewarm_lfc_status() == desired
|
||||
assert prom_parse(http_client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1}
|
||||
|
||||
|
||||
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
|
||||
@pytest.mark.parametrize("method", WORKLOAD_VALUES, ids=WORKLOAD_IDS)
|
||||
def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMethod):
|
||||
@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"])
|
||||
def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMethod):
|
||||
env = neon_simple_env
|
||||
n_records = 10000
|
||||
n_threads = 4
|
||||
cfg = [
|
||||
"shared_buffers=1MB",
|
||||
"neon.max_file_cache_size=1GB",
|
||||
"neon.file_cache_size_limit=1GB",
|
||||
"neon.file_cache_prewarm_limit=1000000",
|
||||
]
|
||||
endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)
|
||||
endpoint = env.endpoints.create_start(
|
||||
branch_name="main",
|
||||
config_lines=[
|
||||
"shared_buffers=1MB",
|
||||
"neon.max_file_cache_size=1GB",
|
||||
"neon.file_cache_size_limit=1GB",
|
||||
"neon.file_cache_prewarm_limit=1000000",
|
||||
],
|
||||
)
|
||||
|
||||
pg_conn = endpoint.connect()
|
||||
pg_cur = pg_conn.cursor()
|
||||
@@ -194,7 +154,12 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
|
||||
log.info(f"Inserted {n_records} rows")
|
||||
|
||||
http_client = endpoint.http_client()
|
||||
lfc_state = offload_lfc(method, http_client, pg_cur)
|
||||
if query is LfcQueryMethod.COMPUTE_CTL:
|
||||
http_client.offload_lfc()
|
||||
else:
|
||||
pg_cur.execute("select get_local_cache_state()")
|
||||
lfc_state = pg_cur.fetchall()[0][0]
|
||||
|
||||
running = True
|
||||
n_prewarms = 0
|
||||
|
||||
@@ -205,8 +170,8 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
|
||||
while running:
|
||||
src = random.randint(1, n_records)
|
||||
dst = random.randint(1, n_records)
|
||||
lfc_cur.execute(f"update accounts set balance=balance-100 where id={src}")
|
||||
lfc_cur.execute(f"update accounts set balance=balance+100 where id={dst}")
|
||||
lfc_cur.execute("update accounts set balance=balance-100 where id=%s", (src,))
|
||||
lfc_cur.execute("update accounts set balance=balance+100 where id=%s", (dst,))
|
||||
n_transfers += 1
|
||||
log.info(f"Number of transfers: {n_transfers}")
|
||||
|
||||
@@ -218,7 +183,13 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
|
||||
pg_cur.execute("select pg_reload_conf()")
|
||||
pg_cur.execute("alter system set neon.file_cache_size_limit='1GB'")
|
||||
pg_cur.execute("select pg_reload_conf()")
|
||||
prewarm_endpoint(method, http_client, pg_cur, lfc_state)
|
||||
|
||||
if query is LfcQueryMethod.COMPUTE_CTL:
|
||||
# Same thing as prewarm_lfc(), testing other method
|
||||
http_client.prewarm_lfc(endpoint.endpoint_id)
|
||||
else:
|
||||
pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
|
||||
|
||||
nonlocal n_prewarms
|
||||
n_prewarms += 1
|
||||
log.info(f"Number of prewarms: {n_prewarms}")
|
||||
@@ -232,10 +203,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
|
||||
prewarm_thread = threading.Thread(target=prewarm)
|
||||
prewarm_thread.start()
|
||||
|
||||
def prewarmed():
|
||||
assert n_prewarms > 5
|
||||
|
||||
wait_until(prewarmed)
|
||||
time.sleep(20)
|
||||
|
||||
running = False
|
||||
for t in workload_threads:
|
||||
@@ -247,5 +215,5 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
|
||||
assert total_balance == 0
|
||||
|
||||
check_pinned_entries(pg_cur)
|
||||
if method != PrewarmMethod.POSTGRES:
|
||||
if query is LfcQueryMethod.COMPUTE_CTL:
|
||||
assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: n_prewarms}
|
||||
|
||||
@@ -2618,7 +2618,7 @@ def test_storage_controller_node_deletion(
|
||||
wait_until(assert_shards_migrated)
|
||||
|
||||
log.info(f"Deleting pageserver {victim.id}")
|
||||
env.storage_controller.node_delete_old(victim.id)
|
||||
env.storage_controller.node_delete(victim.id)
|
||||
|
||||
if not while_offline:
|
||||
|
||||
@@ -2653,60 +2653,6 @@ def test_storage_controller_node_deletion(
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
|
||||
def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_pageservers = 3
|
||||
neon_env_builder.num_azs = 3
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
tenant_count = 12
|
||||
shard_count_per_tenant = 16
|
||||
tenant_ids = []
|
||||
|
||||
for _ in range(0, tenant_count):
|
||||
tid = TenantId.generate()
|
||||
tenant_ids.append(tid)
|
||||
env.create_tenant(
|
||||
tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
|
||||
)
|
||||
|
||||
# Sanity check: initial creations should not leave the system in an unstable scheduling state
|
||||
assert env.storage_controller.reconcile_all() == 0
|
||||
|
||||
nodes = env.storage_controller.node_list()
|
||||
assert len(nodes) == 3
|
||||
|
||||
env.storage_controller.configure_failpoints(("sleepy-delete-loop", "return(10000)"))
|
||||
|
||||
ps_id_to_delete = env.pageservers[0].id
|
||||
|
||||
env.storage_controller.warm_up_all_secondaries()
|
||||
env.storage_controller.retryable_node_operation(
|
||||
lambda ps_id: env.storage_controller.node_delete(ps_id),
|
||||
ps_id_to_delete,
|
||||
max_attempts=3,
|
||||
backoff=2,
|
||||
)
|
||||
|
||||
env.storage_controller.poll_node_status(
|
||||
ps_id_to_delete,
|
||||
PageserverAvailability.ACTIVE,
|
||||
PageserverSchedulingPolicy.DELETING,
|
||||
max_attempts=6,
|
||||
backoff=2,
|
||||
)
|
||||
|
||||
env.storage_controller.cancel_node_delete(ps_id_to_delete)
|
||||
|
||||
env.storage_controller.poll_node_status(
|
||||
ps_id_to_delete,
|
||||
PageserverAvailability.ACTIVE,
|
||||
PageserverSchedulingPolicy.ACTIVE,
|
||||
max_attempts=6,
|
||||
backoff=2,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("shard_count", [None, 2])
|
||||
def test_storage_controller_metadata_health(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
@@ -3262,7 +3208,7 @@ def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder):
|
||||
assert_nodes_count(3)
|
||||
|
||||
ps = env.pageservers[0]
|
||||
env.storage_controller.node_delete_old(ps.id)
|
||||
env.storage_controller.node_delete(ps.id)
|
||||
|
||||
# After deletion, the node count must be reduced
|
||||
assert_nodes_count(2)
|
||||
|
||||
@@ -13,6 +13,50 @@ if TYPE_CHECKING:
|
||||
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
|
||||
|
||||
|
||||
# Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout.
|
||||
# Ensures that walreceiver does not run without any data inserted and only starts after the insertion.
|
||||
def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
|
||||
# we assert below that the walreceiver is not active before data writes.
|
||||
# with manually created timelines, it is active.
|
||||
# FIXME: remove this test once we remove timelines_onto_safekeepers
|
||||
neon_env_builder.storage_controller_config = {
|
||||
"timelines_onto_safekeepers": False,
|
||||
}
|
||||
|
||||
# Trigger WAL wait timeout faster
|
||||
neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'"
|
||||
env = neon_env_builder.init_start()
|
||||
env.pageserver.http_client()
|
||||
|
||||
# In this test we force 'Timed out while waiting for WAL record error' while
|
||||
# fetching basebackup and don't want any retries.
|
||||
os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1"
|
||||
|
||||
tenant_id, timeline_id = env.create_tenant()
|
||||
expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
|
||||
env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
|
||||
|
||||
try:
|
||||
trigger_wait_lsn_timeout(env, tenant_id)
|
||||
except Exception as e:
|
||||
exception_string = str(e)
|
||||
assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
|
||||
assert "WalReceiver status: Not active" in exception_string, (
|
||||
"Walreceiver should not be active before any data writes"
|
||||
)
|
||||
|
||||
insert_test_elements(env, tenant_id, start=0, count=1_000)
|
||||
try:
|
||||
trigger_wait_lsn_timeout(env, tenant_id)
|
||||
except Exception as e:
|
||||
exception_string = str(e)
|
||||
assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
|
||||
assert "WalReceiver status: Not active" not in exception_string, (
|
||||
"Should not be inactive anymore after INSERTs are made"
|
||||
)
|
||||
assert "WalReceiver status" in exception_string, "But still should have some other status"
|
||||
|
||||
|
||||
# Checks that all active safekeepers are shown in pageserver's walreceiver state printed on WAL wait timeout.
|
||||
# Kills one of the safekeepers and ensures that only the active ones are printed in the state.
|
||||
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
DO $$
|
||||
DECLARE
|
||||
i numeric;
|
||||
BEGIN
|
||||
create role somebody;
|
||||
FOR i IN 1..1000000 LOOP
|
||||
BEGIN
|
||||
IF i % 1000 = 0 THEN
|
||||
alter role somebody password 'welcome';
|
||||
ELSE
|
||||
PERFORM 1;
|
||||
END IF;
|
||||
EXCEPTION WHEN OTHERS THEN
|
||||
RAISE WARNING 'error';
|
||||
END;
|
||||
IF I = 1000000 THEN
|
||||
PERFORM pg_log_backend_memory_contexts(pg_backend_pid());
|
||||
END IF;
|
||||
END LOOP;
|
||||
END;
|
||||
$$;
|
||||
@@ -10,4 +10,3 @@ test: neon-clog
|
||||
test: neon-test-utils
|
||||
test: neon-vacuum-full
|
||||
test: neon-event-triggers
|
||||
test: neon-subxacts
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
DO $$
|
||||
DECLARE
|
||||
i numeric;
|
||||
BEGIN
|
||||
create role somebody;
|
||||
FOR i IN 1..1000000 LOOP
|
||||
BEGIN
|
||||
IF i % 1000 = 0 THEN
|
||||
alter role somebody password 'welcome';
|
||||
ELSE
|
||||
PERFORM 1;
|
||||
END IF;
|
||||
EXCEPTION WHEN OTHERS THEN
|
||||
RAISE WARNING 'error';
|
||||
END;
|
||||
IF I = 1000000 THEN
|
||||
PERFORM pg_log_backend_memory_contexts(pg_backend_pid());
|
||||
END IF;
|
||||
END LOOP;
|
||||
END;
|
||||
$$;
|
||||
@@ -40,10 +40,8 @@ env_logger = { version = "0.11" }
|
||||
fail = { version = "0.5", default-features = false, features = ["failpoints"] }
|
||||
form_urlencoded = { version = "1" }
|
||||
futures-channel = { version = "0.3", features = ["sink"] }
|
||||
futures-core = { version = "0.3" }
|
||||
futures-executor = { version = "0.3" }
|
||||
futures-io = { version = "0.3" }
|
||||
futures-sink = { version = "0.3" }
|
||||
futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
|
||||
generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] }
|
||||
getrandom = { version = "0.2", default-features = false, features = ["std"] }
|
||||
@@ -70,7 +68,6 @@ num-integer = { version = "0.1", features = ["i128"] }
|
||||
num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] }
|
||||
num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] }
|
||||
num-traits = { version = "0.2", features = ["i128", "libm"] }
|
||||
once_cell = { version = "1" }
|
||||
p256 = { version = "0.13", features = ["jwk"] }
|
||||
parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||
prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] }
|
||||
@@ -115,13 +112,10 @@ zstd-sys = { version = "2", default-features = false, features = ["legacy", "std
|
||||
|
||||
[build-dependencies]
|
||||
ahash = { version = "0.8" }
|
||||
anstream = { version = "0.6" }
|
||||
anyhow = { version = "1", features = ["backtrace"] }
|
||||
bytes = { version = "1", features = ["serde"] }
|
||||
cc = { version = "1", default-features = false, features = ["parallel"] }
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
|
||||
clap = { version = "4", features = ["derive", "env", "string"] }
|
||||
clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "string", "suggestions", "usage"] }
|
||||
either = { version = "1" }
|
||||
getrandom = { version = "0.2", default-features = false, features = ["std"] }
|
||||
half = { version = "2", default-features = false, features = ["num-traits"] }
|
||||
@@ -139,7 +133,6 @@ num-integer = { version = "0.1", features = ["i128"] }
|
||||
num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] }
|
||||
num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] }
|
||||
num-traits = { version = "0.2", features = ["i128", "libm"] }
|
||||
once_cell = { version = "1" }
|
||||
parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||
prettyplease = { version = "0.2", default-features = false, features = ["verbatim"] }
|
||||
proc-macro2 = { version = "1" }
|
||||
@@ -149,7 +142,6 @@ regex = { version = "1" }
|
||||
regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
|
||||
regex-syntax = { version = "0.8" }
|
||||
serde = { version = "1", features = ["alloc", "derive"] }
|
||||
serde_json = { version = "1", features = ["alloc", "raw_value"] }
|
||||
syn = { version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] }
|
||||
time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }
|
||||
toml_edit = { version = "0.22", features = ["serde"] }
|
||||
|
||||
Reference in New Issue
Block a user