Compare commits

..

4 Commits

Author SHA1 Message Date
Vlad Lazar
96bd74697a storage_controller: remove node activation reconciliation 2025-01-07 17:32:53 +01:00
Vlad Lazar
8e77e10aa3 storage_controller: don't drop observed state on unavailable detach
The observed state removal may race with the inline updates of the
observed state done from `Service::node_activate_reconcile`.

This was intended to work as follows:
1. Detaches while the node is unavailable remove the entry from the
   observed state.
2. `Service::node_activate_reconcile` diffs the locations returned
   by the pageserver with the observed state and detaches in-line
   when required.

This commit removes step (1) and lets background reconciliations
deal with the mismatch between the intent and observed state.
2025-01-07 17:04:47 +01:00
Vlad Lazar
83e90d3b65 tests: add repro for node flap detach race 2025-01-07 16:39:24 +01:00
Vlad Lazar
d3fa0f6b9e storage_controller: rename failpoint and make it pausable
The same failpoint is used for a new test by a follow up commit
and that needs a pausable failpoint.
2025-01-07 16:30:32 +01:00
178 changed files with 3788 additions and 8073 deletions

View File

@@ -1,12 +0,0 @@
rust_code: ['**/*.rs', '**/Cargo.toml', '**/Cargo.lock']
v14: ['vendor/postgres-v14/**', 'Makefile', 'pgxn/**']
v15: ['vendor/postgres-v15/**', 'Makefile', 'pgxn/**']
v16: ['vendor/postgres-v16/**', 'Makefile', 'pgxn/**']
v17: ['vendor/postgres-v17/**', 'Makefile', 'pgxn/**']
rebuild_neon_extra:
- .github/workflows/neon_extra_builds.yml
rebuild_macos:
- .github/workflows/build-macos.yml

View File

@@ -1,241 +0,0 @@
name: Check neon with MacOS builds
on:
workflow_call:
inputs:
pg_versions:
description: "Array of the pg versions to build for, for example: ['v14', 'v17']"
type: string
default: '[]'
required: false
rebuild_rust_code:
description: "Rebuild Rust code"
type: boolean
default: false
required: false
rebuild_everything:
description: "If true, rebuild for all versions"
type: boolean
default: false
required: false
env:
RUST_BACKTRACE: 1
COPT: '-Werror'
# TODO: move `check-*` and `files-changed` jobs to the "Caller" Workflow
# We should care about that as Github has limitations:
# - You can connect up to four levels of workflows
# - You can call a maximum of 20 unique reusable workflows from a single workflow file.
# https://docs.github.com/en/actions/sharing-automations/reusing-workflows#limitations
jobs:
build-pgxn:
if: |
(inputs.pg_versions != '[]' || inputs.rebuild_everything) && (
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
)
timeout-minutes: 30
runs-on: macos-15
strategy:
matrix:
postgres-version: ${{ inputs.rebuild_everything && fromJson('["v14", "v15", "v16", "v17"]') || fromJSON(inputs.pg_versions) }}
env:
# Use release build only, to have less debug info around
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
steps:
- name: Checkout main repo
uses: actions/checkout@v4
- name: Set pg ${{ matrix.postgres-version }} for caching
id: pg_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-${{ matrix.postgres-version }}) | tee -a "${GITHUB_OUTPUT}"
- name: Cache postgres ${{ matrix.postgres-version }} build
id: cache_pg
uses: actions/cache@v4
with:
path: pg_install/${{ matrix.postgres-version }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ matrix.postgres-version }}-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Checkout submodule vendor/postgres-${{ matrix.postgres-version }}
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
git submodule init vendor/postgres-${{ matrix.postgres-version }}
git submodule update --depth 1 --recursive
- name: Install build dependencies
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
brew install flex bison openssl protobuf icu4c
- name: Set extra env for macOS
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
- name: Build Postgres ${{ matrix.postgres-version }}
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
make postgres-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu)
- name: Build Neon Pg Ext ${{ matrix.postgres-version }}
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
make "neon-pg-ext-${{ matrix.postgres-version }}" -j$(sysctl -n hw.ncpu)
- name: Get postgres headers ${{ matrix.postgres-version }}
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
make postgres-headers-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu)
build-walproposer-lib:
if: |
(inputs.pg_versions != '[]' || inputs.rebuild_everything) && (
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
)
timeout-minutes: 30
runs-on: macos-15
needs: [build-pgxn]
env:
# Use release build only, to have less debug info around
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
steps:
- name: Checkout main repo
uses: actions/checkout@v4
- name: Set pg v17 for caching
id: pg_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}"
- name: Cache postgres v17 build
id: cache_pg
uses: actions/cache@v4
with:
path: pg_install/v17
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache walproposer-lib
id: cache_walproposer_lib
uses: actions/cache@v4
with:
path: pg_install/build/walproposer-lib
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Checkout submodule vendor/postgres-v17
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
run: |
git submodule init vendor/postgres-v17
git submodule update --depth 1 --recursive
- name: Install build dependencies
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
run: |
brew install flex bison openssl protobuf icu4c
- name: Set extra env for macOS
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
- name: Build walproposer-lib (only for v17)
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
run:
make walproposer-lib -j$(sysctl -n hw.ncpu)
cargo-build:
if: |
(inputs.pg_versions != '[]' || inputs.rebuild_rust_code || inputs.rebuild_everything) && (
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
)
timeout-minutes: 30
runs-on: macos-15
needs: [build-pgxn, build-walproposer-lib]
env:
# Use release build only, to have less debug info around
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
steps:
- name: Checkout main repo
uses: actions/checkout@v4
with:
submodules: true
- name: Set pg v14 for caching
id: pg_rev_v14
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) | tee -a "${GITHUB_OUTPUT}"
- name: Set pg v15 for caching
id: pg_rev_v15
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) | tee -a "${GITHUB_OUTPUT}"
- name: Set pg v16 for caching
id: pg_rev_v16
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) | tee -a "${GITHUB_OUTPUT}"
- name: Set pg v17 for caching
id: pg_rev_v17
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}"
- name: Cache postgres v14 build
id: cache_pg
uses: actions/cache@v4
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v14-${{ steps.pg_rev_v14.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_v15
uses: actions/cache@v4
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v15-${{ steps.pg_rev_v15.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_v16
uses: actions/cache@v4
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v16-${{ steps.pg_rev_v16.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v17 build
id: cache_pg_v17
uses: actions/cache@v4
with:
path: pg_install/v17
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache cargo deps (only for v17)
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
!~/.cargo/registry/src
~/.cargo/git
target
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
- name: Cache walproposer-lib
id: cache_walproposer_lib
uses: actions/cache@v4
with:
path: pg_install/build/walproposer-lib
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Install build dependencies
run: |
brew install flex bison openssl protobuf icu4c
- name: Set extra env for macOS
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
- name: Run cargo build (only for v17)
run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu)
- name: Check that no warnings are produced (only for v17)
run: ./run_clippy.sh

View File

@@ -728,6 +728,30 @@ jobs:
tags: |
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}
- name: Build compute-tools image
# compute-tools are Postgres independent, so build it only once
# We pick 16, because that builds on debian 11 with older glibc (and is
# thus compatible with newer glibc), rather than 17 on Debian 12, as
# that isn't guaranteed to be compatible with Debian 11
if: matrix.version.pg == 'v16'
uses: docker/build-push-action@v6
with:
target: compute-tools-image
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
DEBIAN_VERSION=${{ matrix.version.debian }}
provenance: false
push: true
pull: true
file: compute/compute-node.Dockerfile
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
tags: |
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}
compute-node-image:
needs: [ compute-node-image-arch, tag ]
permissions:
@@ -770,6 +794,14 @@ jobs:
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
- name: Create multi-arch compute-tools image
if: matrix.version.pg == 'v16'
run: |
docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
-t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
@@ -785,6 +817,12 @@ jobs:
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
- name: Push multi-arch compute-tools image to ECR
if: matrix.version.pg == 'v16'
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
runs-on: [ self-hosted, large ]
@@ -941,16 +979,6 @@ jobs:
VERSIONS: v14 v15 v16 v17
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
@@ -963,6 +991,9 @@ jobs:
docker buildx imagetools create -t $repo/neon:latest \
$repo/neon:${{ needs.tag.outputs.build-tag }}
docker buildx imagetools create -t $repo/compute-tools:latest \
$repo/compute-tools:${{ needs.tag.outputs.build-tag }}
for version in ${VERSIONS}; do
docker buildx imagetools create -t $repo/compute-node-${version}:latest \
$repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
@@ -991,7 +1022,7 @@ jobs:
- name: Copy all images to prod ECR
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
run: |
for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do
docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
done
@@ -1003,7 +1034,7 @@ jobs:
with:
client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
image_tag: ${{ needs.tag.outputs.build-tag }}
images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}
@@ -1015,7 +1046,7 @@ jobs:
with:
client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
image_tag: ${{ needs.tag.outputs.build-tag }}
images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}

View File

@@ -31,15 +31,19 @@ jobs:
uses: ./.github/workflows/build-build-tools-image.yml
secrets: inherit
files-changed:
name: Detect what files changed
runs-on: ubuntu-22.04
timeout-minutes: 3
outputs:
v17: ${{ steps.files_changed.outputs.v17 }}
postgres_changes: ${{ steps.postgres_changes.outputs.changes }}
rebuild_rust_code: ${{ steps.files_changed.outputs.rust_code }}
rebuild_everything: ${{ steps.files_changed.outputs.rebuild_neon_extra || steps.files_changed.outputs.rebuild_macos }}
check-macos-build:
needs: [ check-permissions ]
if: |
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
timeout-minutes: 90
runs-on: macos-15
env:
# Use release build only, to have less debug info around
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
steps:
- name: Checkout
@@ -47,45 +51,106 @@ jobs:
with:
submodules: true
- name: Check for Postgres changes
uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3
id: files_changed
- name: Install macOS postgres dependencies
run: brew install flex bison openssl protobuf icu4c
- name: Set pg 14 revision for caching
id: pg_v14_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
- name: Set pg 15 revision for caching
id: pg_v15_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
- name: Set pg 16 revision for caching
id: pg_v16_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
- name: Set pg 17 revision for caching
id: pg_v17_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
with:
token: ${{ github.token }}
filters: .github/file-filters.yaml
base: ${{ github.event_name != 'pull_request' && (github.event.merge_group.base_ref || github.ref_name) || '' }}
ref: ${{ github.event_name != 'pull_request' && (github.event.merge_group.head_ref || github.ref) || '' }}
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Filter out only v-string for build matrix
id: postgres_changes
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v17 build
id: cache_pg_17
uses: actions/cache@v4
with:
path: pg_install/v17
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Set extra env for macOS
run: |
v_strings_only_as_json_array=$(echo ${{ steps.files_changed.outputs.chnages }} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c)
echo "changes=${v_strings_only_as_json_array}" | tee -a "${GITHUB_OUTPUT}"
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
check-macos-build:
needs: [ check-permissions, files-changed ]
if: |
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
uses: ./.github/workflows/build-macos.yml
with:
pg_versions: ${{ needs.files-changed.outputs.postgres_changes }}
rebuild_rust_code: ${{ needs.files-changed.outputs.rebuild_rust_code }}
rebuild_everything: ${{ fromJson(needs.files-changed.outputs.rebuild_everything) }}
- name: Cache cargo deps
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
!~/.cargo/registry/src
~/.cargo/git
target
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: make postgres-v14 -j$(sysctl -n hw.ncpu)
- name: Build postgres v15
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: make postgres-v15 -j$(sysctl -n hw.ncpu)
- name: Build postgres v16
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: make postgres-v16 -j$(sysctl -n hw.ncpu)
- name: Build postgres v17
if: steps.cache_pg_17.outputs.cache-hit != 'true'
run: make postgres-v17 -j$(sysctl -n hw.ncpu)
- name: Build neon extensions
run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
- name: Build walproposer-lib
run: make walproposer-lib -j$(sysctl -n hw.ncpu)
- name: Run cargo build
run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release
- name: Check that no warnings are produced
run: ./run_clippy.sh
gather-rust-build-stats:
needs: [ check-permissions, build-build-tools-image, files-changed ]
needs: [ check-permissions, build-build-tools-image ]
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: write
if: |
(needs.files-changed.outputs.v17 == 'true' || needs.files-changed.outputs.rebuild_everything == 'true') && (
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
)
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
runs-on: [ self-hosted, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm

275
Cargo.lock generated
View File

@@ -718,13 +718,13 @@ dependencies = [
[[package]]
name = "axum"
version = "0.7.9"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf"
dependencies = [
"async-trait",
"axum-core",
"base64 0.22.1",
"base64 0.21.1",
"bytes",
"futures-util",
"http 1.1.0",
@@ -746,8 +746,8 @@ dependencies = [
"sha1",
"sync_wrapper 1.0.1",
"tokio",
"tokio-tungstenite 0.24.0",
"tower 0.5.2",
"tokio-tungstenite",
"tower",
"tower-layer",
"tower-service",
"tracing",
@@ -1267,7 +1267,6 @@ dependencies = [
"aws-config",
"aws-sdk-kms",
"aws-sdk-s3",
"axum",
"base64 0.13.1",
"bytes",
"camino",
@@ -1278,7 +1277,7 @@ dependencies = [
"fail",
"flate2",
"futures",
"http 1.1.0",
"hyper 0.14.30",
"metrics",
"nix 0.27.1",
"notify",
@@ -1304,8 +1303,6 @@ dependencies = [
"tokio-postgres",
"tokio-stream",
"tokio-util",
"tower 0.5.2",
"tower-http",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
@@ -1605,32 +1602,6 @@ dependencies = [
"typenum",
]
[[package]]
name = "curve25519-dalek"
version = "4.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be"
dependencies = [
"cfg-if",
"cpufeatures",
"curve25519-dalek-derive",
"digest",
"fiat-crypto",
"rustc_version",
"subtle",
]
[[package]]
name = "curve25519-dalek-derive"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.90",
]
[[package]]
name = "darling"
version = "0.20.1"
@@ -1679,20 +1650,6 @@ dependencies = [
"parking_lot_core 0.9.8",
]
[[package]]
name = "dashmap"
version = "6.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
dependencies = [
"cfg-if",
"crossbeam-utils",
"hashbrown 0.14.5",
"lock_api",
"once_cell",
"parking_lot_core 0.9.8",
]
[[package]]
name = "data-encoding"
version = "2.4.0"
@@ -1901,28 +1858,6 @@ dependencies = [
"spki 0.7.3",
]
[[package]]
name = "ed25519"
version = "2.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53"
dependencies = [
"signature 2.2.0",
]
[[package]]
name = "ed25519-dalek"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871"
dependencies = [
"curve25519-dalek",
"ed25519",
"rand_core 0.6.4",
"sha2",
"subtle",
]
[[package]]
name = "either"
version = "1.8.1"
@@ -2014,15 +1949,6 @@ dependencies = [
"syn 2.0.90",
]
[[package]]
name = "env_filter"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0"
dependencies = [
"log",
]
[[package]]
name = "env_logger"
version = "0.10.2"
@@ -2036,16 +1962,6 @@ dependencies = [
"termcolor",
]
[[package]]
name = "env_logger"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d"
dependencies = [
"env_filter",
"log",
]
[[package]]
name = "equator"
version = "0.2.2"
@@ -2161,12 +2077,6 @@ dependencies = [
"subtle",
]
[[package]]
name = "fiat-crypto"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"
[[package]]
name = "filetime"
version = "0.2.22"
@@ -2810,7 +2720,7 @@ dependencies = [
"pin-project-lite",
"socket2",
"tokio",
"tower 0.4.13",
"tower",
"tower-service",
"tracing",
]
@@ -3035,28 +2945,6 @@ dependencies = [
"str_stack",
]
[[package]]
name = "inferno"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75a5d75fee4d36809e6b021e4b96b686e763d365ffdb03af2bd00786353f84fe"
dependencies = [
"ahash",
"clap",
"crossbeam-channel",
"crossbeam-utils",
"dashmap 6.1.0",
"env_logger 0.11.2",
"indexmap 2.0.1",
"itoa",
"log",
"num-format",
"once_cell",
"quick-xml 0.37.1",
"rgb",
"str_stack",
]
[[package]]
name = "inotify"
version = "0.9.6"
@@ -3264,7 +3152,7 @@ version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2"
dependencies = [
"dashmap 5.5.0",
"dashmap",
"hashbrown 0.13.2",
]
@@ -3372,9 +3260,9 @@ checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40"
[[package]]
name = "matchit"
version = "0.8.4"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"
checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed"
[[package]]
name = "md-5"
@@ -3802,23 +3690,23 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "opentelemetry"
version = "0.27.1"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab70038c28ed37b97d8ed414b6429d343a8bbf44c9f79ec854f3a643029ba6d7"
checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17"
dependencies = [
"futures-core",
"futures-sink",
"js-sys",
"once_cell",
"pin-project-lite",
"thiserror",
"tracing",
]
[[package]]
name = "opentelemetry-http"
version = "0.27.0"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10a8a7f5f6ba7c1b286c2fbca0454eaba116f63bbe69ed250b642d36fbb04d80"
checksum = "6351496aeaa49d7c267fb480678d85d1cd30c5edb20b497c48c56f62a8c14b99"
dependencies = [
"async-trait",
"bytes",
@@ -3829,9 +3717,9 @@ dependencies = [
[[package]]
name = "opentelemetry-otlp"
version = "0.27.0"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91cf61a1868dacc576bf2b2a1c3e9ab150af7272909e80085c3173384fe11f76"
checksum = "29e1f9c8b032d4f635c730c0efcf731d5e2530ea13fa8bef7939ddc8420696bd"
dependencies = [
"async-trait",
"futures-core",
@@ -3847,9 +3735,9 @@ dependencies = [
[[package]]
name = "opentelemetry-proto"
version = "0.27.0"
version = "0.26.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6"
checksum = "c9d3968ce3aefdcca5c27e3c4ea4391b37547726a70893aab52d3de95d5f8b34"
dependencies = [
"opentelemetry",
"opentelemetry_sdk",
@@ -3859,21 +3747,22 @@ dependencies = [
[[package]]
name = "opentelemetry-semantic-conventions"
version = "0.27.0"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc1b6902ff63b32ef6c489e8048c5e253e2e4a803ea3ea7e783914536eb15c52"
checksum = "db945c1eaea8ac6a9677185357480d215bb6999faa9f691d0c4d4d641eab7a09"
[[package]]
name = "opentelemetry_sdk"
version = "0.27.1"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "231e9d6ceef9b0b2546ddf52335785ce41252bc7474ee8ba05bfad277be13ab8"
checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3"
dependencies = [
"async-trait",
"futures-channel",
"futures-executor",
"futures-util",
"glob",
"once_cell",
"opentelemetry",
"percent-encoding",
"rand 0.8.5",
@@ -3881,7 +3770,6 @@ dependencies = [
"thiserror",
"tokio",
"tokio-stream",
"tracing",
]
[[package]]
@@ -4478,8 +4366,6 @@ dependencies = [
"bytes",
"fallible-iterator",
"postgres-protocol",
"serde",
"serde_json",
]
[[package]]
@@ -4532,7 +4418,7 @@ dependencies = [
"bytes",
"crc32c",
"criterion",
"env_logger 0.10.2",
"env_logger",
"log",
"memoffset 0.9.0",
"once_cell",
@@ -4573,7 +4459,7 @@ dependencies = [
"cfg-if",
"criterion",
"findshlibs",
"inferno 0.11.21",
"inferno",
"libc",
"log",
"nix 0.26.4",
@@ -4799,10 +4685,9 @@ dependencies = [
"clap",
"compute_api",
"consumption_metrics",
"dashmap 5.5.0",
"dashmap",
"ecdsa 0.16.9",
"ed25519-dalek",
"env_logger 0.10.2",
"env_logger",
"fallible-iterator",
"flate2",
"framed-websockets",
@@ -4873,7 +4758,7 @@ dependencies = [
"tokio-postgres",
"tokio-postgres2",
"tokio-rustls 0.26.0",
"tokio-tungstenite 0.21.0",
"tokio-tungstenite",
"tokio-util",
"tracing",
"tracing-subscriber",
@@ -4909,15 +4794,6 @@ dependencies = [
"serde",
]
[[package]]
name = "quick-xml"
version = "0.37.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f22f29bdff3987b4d8632ef95fd6424ec7e4e0a57e2f4fc63e489e75357f6a03"
dependencies = [
"memchr",
]
[[package]]
name = "quote"
version = "1.0.37"
@@ -5302,15 +5178,15 @@ dependencies = [
[[package]]
name = "reqwest-tracing"
version = "0.5.5"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73e6153390585f6961341b50e5a1931d6be6dee4292283635903c26ef9d980d2"
checksum = "ff82cf5730a1311fb9413b0bc2b8e743e0157cd73f010ab4ec374a923873b6a2"
dependencies = [
"anyhow",
"async-trait",
"getrandom 0.2.11",
"http 1.1.0",
"matchit 0.8.4",
"matchit 0.8.2",
"opentelemetry",
"reqwest",
"reqwest-middleware",
@@ -6924,19 +6800,7 @@ dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.21.0",
]
[[package]]
name = "tokio-tungstenite"
version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edc5f74e248dc973e0dbb7b74c7e0d6fcc301c694ff50049504004ef4d0cdcd9"
dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.24.0",
"tungstenite",
]
[[package]]
@@ -7017,7 +6881,7 @@ dependencies = [
"tokio",
"tokio-rustls 0.26.0",
"tokio-stream",
"tower 0.4.13",
"tower",
"tower-layer",
"tower-service",
"tracing",
@@ -7057,50 +6921,17 @@ dependencies = [
"tracing",
]
[[package]]
name = "tower"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
dependencies = [
"futures-core",
"futures-util",
"pin-project-lite",
"sync_wrapper 1.0.1",
"tokio",
"tower-layer",
"tower-service",
"tracing",
]
[[package]]
name = "tower-http"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697"
dependencies = [
"bitflags 2.4.1",
"bytes",
"http 1.1.0",
"http-body 1.0.0",
"pin-project-lite",
"tower-layer",
"tower-service",
"tracing",
"uuid",
]
[[package]]
name = "tower-layer"
version = "0.3.3"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0"
[[package]]
name = "tower-service"
version = "0.3.3"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
[[package]]
name = "tracing"
@@ -7169,9 +7000,9 @@ dependencies = [
[[package]]
name = "tracing-opentelemetry"
version = "0.28.0"
version = "0.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97a971f6058498b5c0f1affa23e7ea202057a7301dbff68e968b2d578bcbd053"
checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b"
dependencies = [
"js-sys",
"once_cell",
@@ -7255,24 +7086,6 @@ dependencies = [
"utf-8",
]
[[package]]
name = "tungstenite"
version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18e5b8366ee7a95b16d32197d0b2604b43a0be89dc5fac9f8e96ccafbaedda8a"
dependencies = [
"byteorder",
"bytes",
"data-encoding",
"http 1.1.0",
"httparse",
"log",
"rand 0.8.5",
"sha1",
"thiserror",
"utf-8",
]
[[package]]
name = "twox-hash"
version = "1.6.3"
@@ -7440,7 +7253,6 @@ dependencies = [
"hex-literal",
"humantime",
"hyper 0.14.30",
"inferno 0.12.0",
"itertools 0.10.5",
"jemalloc_pprof",
"jsonwebtoken",
@@ -7544,7 +7356,7 @@ dependencies = [
"anyhow",
"camino-tempfile",
"clap",
"env_logger 0.10.2",
"env_logger",
"log",
"postgres",
"postgres_ffi",
@@ -8024,7 +7836,6 @@ dependencies = [
"num-traits",
"once_cell",
"parquet",
"postgres-types",
"prettyplease",
"proc-macro2",
"prost",
@@ -8051,14 +7862,12 @@ dependencies = [
"time",
"time-macros",
"tokio",
"tokio-postgres",
"tokio-rustls 0.26.0",
"tokio-stream",
"tokio-util",
"toml_edit",
"tonic",
"tower 0.4.13",
"tower 0.5.2",
"tower",
"tracing",
"tracing-core",
"url",

View File

@@ -65,7 +65,7 @@ aws-smithy-types = "1.2"
aws-credential-types = "1.2.0"
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
aws-types = "1.3"
axum = { version = "0.7.9", features = ["ws"] }
axum = { version = "0.7.5", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
bindgen = "0.70"
@@ -110,7 +110,6 @@ hyper-util = "0.1"
tokio-tungstenite = "0.21.0"
indexmap = "2"
indoc = "2"
inferno = "0.12.0"
ipnet = "2.10.0"
itertools = "0.10"
itoa = "1.0.11"
@@ -127,10 +126,10 @@ notify = "6.0.0"
num_cpus = "1.15"
num-traits = "0.2.15"
once_cell = "1.13"
opentelemetry = "0.27"
opentelemetry_sdk = "0.27"
opentelemetry-otlp = { version = "0.27", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.27"
opentelemetry = "0.26"
opentelemetry_sdk = "0.26"
opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.26"
parking_lot = "0.12"
parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
@@ -144,7 +143,7 @@ rand = "0.8"
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] }
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] }
reqwest-middleware = "0.4"
reqwest-retry = "0.7"
routerify = "3"
@@ -188,12 +187,10 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] }
toml = "0.8"
toml_edit = "0.22"
tonic = {version = "0.12.3", features = ["tls", "tls-roots"]}
tower = { version = "0.5.2", default-features = false }
tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
tower-service = "0.3.3"
tower-service = "0.3.2"
tracing = "0.1"
tracing-error = "0.2"
tracing-opentelemetry = "0.28"
tracing-opentelemetry = "0.27"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"
twox-hash = { version = "1.6.3", default-features = false }

View File

@@ -71,7 +71,6 @@ RUN set -e \
ca-certificates \
# System postgres for use with client libraries (e.g. in storage controller)
postgresql-15 \
openssl \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
&& useradd -d /data neon \
&& chown -R neon:neon /data
@@ -104,6 +103,11 @@ RUN mkdir -p /data/.neon/ && \
> /data/.neon/pageserver.toml && \
chown -R neon:neon /data/.neon
# When running a binary that links with libpq, default to using our most recent postgres version. Binaries
# that want a particular postgres version will select it explicitly: this is just a default.
ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib
VOLUME ["/data"]
USER neon
EXPOSE 6400

View File

@@ -3,6 +3,7 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
# Where to install Postgres, default is ./pg_install, maybe useful for package managers
POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/
OPENSSL_PREFIX_DIR := /usr/local/openssl
ICU_PREFIX_DIR := /usr/local/icu
#
@@ -25,9 +26,11 @@ endif
ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
# Exclude static build openssl, icu for local build (MacOS, Linux)
# Only keep for build type release and debug
PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include
PG_CONFIGURE_OPTS += --with-icu
PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION'
PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm'
PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread'
endif
UNAME_S := $(shell uname -s)

View File

@@ -115,7 +115,7 @@ RUN set -e \
# Keep the version the same as in compute/compute-node.Dockerfile and
# test_runner/regress/test_compute_metrics.py.
ENV SQL_EXPORTER_VERSION=0.17.0
ENV SQL_EXPORTER_VERSION=0.16.0
RUN curl -fsSL \
"https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
--output sql_exporter.tar.gz \
@@ -190,6 +190,21 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
&& make install \
&& rm -rf ../lcov.tar.gz
# Compile and install the static OpenSSL library
ENV OPENSSL_VERSION=1.1.1w
ENV OPENSSL_PREFIX=/usr/local/openssl
RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
cd /tmp && \
tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
cd /tmp/openssl-${OPENSSL_VERSION} && \
./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \
make -j "$(nproc)" && \
make install && \
cd /tmp && \
rm -rf /tmp/openssl-${OPENSSL_VERSION}
# Use the same version of libicu as the compute nodes so that
# clusters created using inidb on pageserver can be used by computes.
#
@@ -243,7 +258,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.84.0
ENV RUSTC_VERSION=1.83.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
ARG RUSTFILT_VERSION=0.2.1

View File

@@ -170,6 +170,7 @@ RUN case "${PG_VERSION}" in \
wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \
echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \
mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
./autogen.sh && \
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -219,7 +220,11 @@ RUN case "${PG_VERSION}" in \
cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. && \
ninja -j $(getconf _NPROCESSORS_ONLN) && \
ninja -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
#########################################################################################
#
@@ -837,8 +842,13 @@ RUN case "${PG_VERSION}" in "v17") \
wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
#########################################################################################
#
@@ -966,9 +976,22 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p
FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build
ARG PG_VERSION
# version 0.3.3 supports v17
# last release v0.3.3 - Oct 16, 2024
RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \
echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \
#
# there were no breaking changes
# so we can use the same version for all postgres versions
RUN case "${PG_VERSION}" in \
"v14" | "v15" | "v16" | "v17") \
export PG_JSONSCHEMA_VERSION=0.3.3 \
export PG_JSONSCHEMA_CHECKSUM=40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v${PG_JSONSCHEMA_VERSION}.tar.gz -O pg_jsonschema.tar.gz && \
echo "${PG_JSONSCHEMA_CHECKSUM} pg_jsonschema.tar.gz" | sha256sum --check && \
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
# see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8
# `unsafe-postgres` feature allows to build pgx extensions
@@ -989,9 +1012,22 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.
FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build
ARG PG_VERSION
# version 1.5.9 supports v17
# last release v1.5.9 - Oct 16, 2024
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \
echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \
#
# there were no breaking changes
# so we can use the same version for all postgres versions
RUN case "${PG_VERSION}" in \
"v14" | "v15" | "v16" | "v17") \
export PG_GRAPHQL_VERSION=1.5.9 \
export PG_GRAPHQL_CHECKSUM=cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/supabase/pg_graphql/archive/refs/tags/v${PG_GRAPHQL_VERSION}.tar.gz -O pg_graphql.tar.gz && \
echo "${PG_GRAPHQL_CHECKSUM} pg_graphql.tar.gz" | sha256sum --check && \
mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
@@ -1055,8 +1091,8 @@ ARG PG_VERSION
# NOTE: local_proxy depends on the version of pg_session_jwt
# Do not update without approve from proxy team
# Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \
echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \
echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release
@@ -1131,13 +1167,22 @@ FROM rust-extensions-build AS pg-mooncake-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# The topmost commit in the `neon` branch at the time of writing this
# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/
# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af
ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \
echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \
mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \
make release -j $(getconf _NPROCESSORS_ONLN) && \
make install -j $(getconf _NPROCESSORS_ONLN) && \
RUN case "${PG_VERSION}" in \
'v14') \
echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \
esac && \
git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \
cd pg_mooncake-src && \
git checkout "${PG_MOONCAKE_VERSION}" && \
git submodule update --init --depth 1 --recursive && \
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control
#########################################################################################
@@ -1222,6 +1267,20 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_rmgr \
-s install && \
case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16" | "v17") \
echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/hnsw \
-s install
#########################################################################################
@@ -1238,6 +1297,17 @@ USER nonroot
COPY --chown=nonroot . .
RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy
#########################################################################################
#
# Final compute-tools image
#
#########################################################################################
FROM debian:$DEBIAN_FLAVOR AS compute-tools-image
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import
#########################################################################################
#
# Layer "pgbouncer"
@@ -1274,11 +1344,11 @@ RUN set -e \
#
#########################################################################################
FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter
FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
# Keep the version the same as in build-tools.Dockerfile and
# test_runner/regress/test_compute_metrics.py.
FROM burningalchemist/sql_exporter:0.17.0 AS sql-exporter
FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter
#########################################################################################
#

View File

@@ -15,7 +15,6 @@ aws-config.workspace = true
aws-sdk-s3.workspace = true
aws-sdk-kms.workspace = true
anyhow.workspace = true
axum = { workspace = true, features = [] }
camino.workspace = true
chrono.workspace = true
cfg-if.workspace = true
@@ -23,7 +22,7 @@ clap.workspace = true
fail.workspace = true
flate2.workspace = true
futures.workspace = true
http.workspace = true
hyper0 = { workspace = true, features = ["full"] }
metrics.workspace = true
nix.workspace = true
notify.workspace = true
@@ -38,8 +37,6 @@ serde_with.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
tar.workspace = true
tower.workspace = true
tower-http.workspace = true
reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true

View File

@@ -60,7 +60,7 @@ use compute_tools::compute::{
};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version_string;
use compute_tools::http::launch_http_server;
use compute_tools::http::api::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
@@ -111,6 +111,11 @@ fn main() -> Result<()> {
fn init() -> Result<(String, clap::ArgMatches)> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
opentelemetry::global::set_error_handler(|err| {
tracing::info!("OpenTelemetry error: {err}");
})
.expect("global error handler lock poisoned");
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
thread::spawn(move || {
for sig in signals.forever() {
@@ -488,10 +493,7 @@ fn start_postgres(
let mut pg = None;
if !prestartup_failed {
pg = match compute.start_compute() {
Ok(pg) => {
info!(postmaster_pid = %pg.0.id(), "Postgres was started");
Some(pg)
}
Ok(pg) => Some(pg),
Err(err) => {
error!("could not start the compute node: {:#}", err);
compute.set_failed_status(err);
@@ -589,8 +591,6 @@ fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
// propagate to Postgres and it will be shut down as well.
let mut exit_code = None;
if let Some((mut pg, logs_handle)) = pg {
info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit");
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");

View File

@@ -17,7 +17,7 @@
//!
//! # Local Testing
//!
//! - Comment out most of the pgxns in compute-node.Dockerfile to speed up the build.
//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build.
//! - Build the image with the following command:
//!
//! ```bash

View File

@@ -36,11 +36,11 @@ pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<Cat
#[derive(Debug, thiserror::Error)]
pub enum SchemaDumpError {
#[error("database does not exist")]
#[error("Database does not exist.")]
DatabaseDoesNotExist,
#[error("failed to execute pg_dump")]
#[error("Failed to execute pg_dump.")]
IO(#[from] std::io::Error),
#[error("unexpected I/O error")]
#[error("Unexpected error.")]
Unexpected,
}

View File

@@ -15,7 +15,7 @@ use std::time::Instant;
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use compute_api::spec::{Database, PgIdent, Role};
use compute_api::spec::{PgIdent, Role};
use futures::future::join_all;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
@@ -45,10 +45,8 @@ use crate::spec_apply::ApplySpecPhase::{
DropInvalidDatabases, DropRoles, HandleNeonExtension, HandleOtherExtensions,
RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
};
use crate::spec_apply::PerDatabasePhase;
use crate::spec_apply::PerDatabasePhase::{
ChangeSchemaPerms, DeleteDBRoleReferences, DropSubscriptionsForDeletedDatabases,
HandleAnonExtension,
ChangeSchemaPerms, DeleteDBRoleReferences, HandleAnonExtension,
};
use crate::spec_apply::{apply_operations, MutableApplyContext, DB};
use crate::sync_sk::{check_if_synced, ping_safekeeper};
@@ -836,7 +834,7 @@ impl ComputeNode {
conf
}
pub async fn get_maintenance_client(
async fn get_maintenance_client(
conf: &tokio_postgres::Config,
) -> Result<tokio_postgres::Client> {
let mut conf = conf.clone();
@@ -945,78 +943,6 @@ impl ComputeNode {
dbs: databases,
}));
// Apply special pre drop database phase.
// NOTE: we use the code of RunInEachDatabase phase for parallelism
// and connection management, but we don't really run it in *each* database,
// only in databases, we're about to drop.
info!("Applying PerDatabase (pre-dropdb) phase");
let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));
// Run the phase for each database that we're about to drop.
let db_processes = spec
.delta_operations
.iter()
.flatten()
.filter_map(move |op| {
if op.action.as_str() == "delete_db" {
Some(op.name.clone())
} else {
None
}
})
.map(|dbname| {
let spec = spec.clone();
let ctx = ctx.clone();
let jwks_roles = jwks_roles.clone();
let mut conf = conf.as_ref().clone();
let concurrency_token = concurrency_token.clone();
// We only need dbname field for this phase, so set other fields to dummy values
let db = DB::UserDB(Database {
name: dbname.clone(),
owner: "cloud_admin".to_string(),
options: None,
restrict_conn: false,
invalid: false,
});
debug!("Applying per-database phases for Database {:?}", &db);
match &db {
DB::SystemDB => {}
DB::UserDB(db) => {
conf.dbname(db.name.as_str());
}
}
let conf = Arc::new(conf);
let fut = Self::apply_spec_sql_db(
spec.clone(),
conf,
ctx.clone(),
jwks_roles.clone(),
concurrency_token.clone(),
db,
[DropSubscriptionsForDeletedDatabases].to_vec(),
);
Ok(spawn(fut))
})
.collect::<Vec<Result<_, anyhow::Error>>>();
for process in db_processes.into_iter() {
let handle = process?;
if let Err(e) = handle.await? {
// Handle the error case where the database does not exist
// We do not check whether the DB exists or not in the deletion phase,
// so we shouldn't be strict about it in pre-deletion cleanup as well.
if e.to_string().contains("does not exist") {
warn!("Error dropping subscription: {}", e);
} else {
return Err(e);
}
};
}
for phase in [
CreateSuperUser,
DropInvalidDatabases,
@@ -1036,7 +962,7 @@ impl ComputeNode {
.await?;
}
info!("Applying RunInEachDatabase2 phase");
info!("Applying RunInEachDatabase phase");
let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));
let db_processes = spec
@@ -1071,12 +997,6 @@ impl ComputeNode {
jwks_roles.clone(),
concurrency_token.clone(),
db,
[
DeleteDBRoleReferences,
ChangeSchemaPerms,
HandleAnonExtension,
]
.to_vec(),
);
Ok(spawn(fut))
@@ -1123,13 +1043,16 @@ impl ComputeNode {
jwks_roles: Arc<HashSet<String>>,
concurrency_token: Arc<tokio::sync::Semaphore>,
db: DB,
subphases: Vec<PerDatabasePhase>,
) -> Result<()> {
let _permit = concurrency_token.acquire().await?;
let mut client_conn = None;
for subphase in subphases {
for subphase in [
DeleteDBRoleReferences,
ChangeSchemaPerms,
HandleAnonExtension,
] {
apply_operations(
spec.clone(),
ctx.clone(),

View File

@@ -0,0 +1,606 @@
use std::convert::Infallible;
use std::net::IpAddr;
use std::net::Ipv6Addr;
use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
use crate::catalog::SchemaDumpError;
use crate::catalog::{get_database_schema, get_dbs_and_roles};
use crate::compute::forward_termination_signal;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use crate::installed_extensions;
use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
use compute_api::responses::{
ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
SetRoleGrantsResponse,
};
use anyhow::Result;
use hyper::header::CONTENT_TYPE;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use metrics::proto::MetricFamily;
use metrics::Encoder;
use metrics::TextEncoder;
use tokio::task;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, warn};
use tracing_utils::http::OtelName;
use utils::failpoint_support::failpoints_handler;
use utils::http::error::ApiError;
use utils::http::request::must_get_query_param;
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
ComputeStatusResponse {
start_time: state.start_time,
tenant: state
.pspec
.as_ref()
.map(|pspec| pspec.tenant_id.to_string()),
timeline: state
.pspec
.as_ref()
.map(|pspec| pspec.timeline_id.to_string()),
status: state.status,
last_active: state.last_active,
error: state.error.clone(),
}
}
// Service function to handle all available routes.
async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
//
// NOTE: The URI path is currently included in traces. That's OK because
// it doesn't contain any variable parts or sensitive information. But
// please keep that in mind if you change the routing here.
//
match (req.method(), req.uri().path()) {
// Serialized compute state.
(&Method::GET, "/status") => {
debug!("serving /status GET request");
let state = compute.state.lock().unwrap();
let status_response = status_response_from_state(&state);
Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
}
// Startup metrics in JSON format. Keep /metrics reserved for a possible
// future use for Prometheus metrics format.
(&Method::GET, "/metrics.json") => {
info!("serving /metrics.json GET request");
let metrics = compute.state.lock().unwrap().metrics.clone();
Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
}
// Prometheus metrics
(&Method::GET, "/metrics") => {
debug!("serving /metrics GET request");
// When we call TextEncoder::encode() below, it will immediately
// return an error if a metric family has no metrics, so we need to
// preemptively filter out metric families with no metrics.
let metrics = installed_extensions::collect()
.into_iter()
.filter(|m| !m.get_metric().is_empty())
.collect::<Vec<MetricFamily>>();
let encoder = TextEncoder::new();
let mut buffer = vec![];
if let Err(err) = encoder.encode(&metrics, &mut buffer) {
let msg = format!("error handling /metrics request: {err}");
error!(msg);
return render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR);
}
match Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, encoder.format_type())
.body(Body::from(buffer))
{
Ok(response) => response,
Err(err) => {
let msg = format!("error handling /metrics request: {err}");
error!(msg);
render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
// Collect Postgres current usage insights
(&Method::GET, "/insights") => {
info!("serving /insights GET request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!("compute is not running, current status: {:?}", status);
error!(msg);
return Response::new(Body::from(msg));
}
let insights = compute.collect_insights().await;
Response::new(Body::from(insights))
}
(&Method::POST, "/check_writability") => {
info!("serving /check_writability POST request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for check_writability request: {:?}",
status
);
error!(msg);
return Response::new(Body::from(msg));
}
let res = crate::checker::check_writability(compute).await;
match res {
Ok(_) => Response::new(Body::from("true")),
Err(e) => {
error!("check_writability failed: {}", e);
Response::new(Body::from(e.to_string()))
}
}
}
(&Method::POST, "/extensions") => {
info!("serving /extensions POST request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for extensions request: {:?}",
status
);
error!(msg);
return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
}
let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
let request = serde_json::from_slice::<ExtensionInstallRequest>(&request).unwrap();
let res = compute
.install_extension(&request.extension, &request.database, request.version)
.await;
match res {
Ok(version) => render_json(Body::from(
serde_json::to_string(&ExtensionInstallResult {
extension: request.extension,
version,
})
.unwrap(),
)),
Err(e) => {
error!("install_extension failed: {}", e);
render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
(&Method::GET, "/info") => {
let num_cpus = num_cpus::get_physical();
info!("serving /info GET request. num_cpus: {}", num_cpus);
Response::new(Body::from(
serde_json::json!({
"num_cpus": num_cpus,
})
.to_string(),
))
}
// Accept spec in JSON format and request compute configuration. If
// anything goes wrong after we set the compute status to `ConfigurationPending`
// and update compute state with new spec, we basically leave compute
// in the potentially wrong state. That said, it's control-plane's
// responsibility to watch compute state after reconfiguration request
// and to clean restart in case of errors.
(&Method::POST, "/configure") => {
info!("serving /configure POST request");
match handle_configure_request(req, compute).await {
Ok(msg) => Response::new(Body::from(msg)),
Err((msg, code)) => {
error!("error handling /configure request: {msg}");
render_json_error(&msg, code)
}
}
}
(&Method::POST, "/terminate") => {
info!("serving /terminate POST request");
match handle_terminate_request(compute).await {
Ok(()) => Response::new(Body::empty()),
Err((msg, code)) => {
error!("error handling /terminate request: {msg}");
render_json_error(&msg, code)
}
}
}
(&Method::GET, "/dbs_and_roles") => {
info!("serving /dbs_and_roles GET request",);
match get_dbs_and_roles(compute).await {
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
Err(_) => {
render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
(&Method::GET, "/database_schema") => {
let database = match must_get_query_param(&req, "database") {
Err(e) => return e.into_response(),
Ok(database) => database,
};
info!("serving /database_schema GET request with database: {database}",);
match get_database_schema(compute, &database).await {
Ok(res) => render_plain(Body::wrap_stream(res)),
Err(SchemaDumpError::DatabaseDoesNotExist) => {
render_json_error("database does not exist", StatusCode::NOT_FOUND)
}
Err(e) => {
error!("can't get schema dump: {}", e);
render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
(&Method::POST, "/grants") => {
info!("serving /grants POST request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for set_role_grants request: {:?}",
status
);
error!(msg);
return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
}
let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
let request = serde_json::from_slice::<SetRoleGrantsRequest>(&request).unwrap();
let res = compute
.set_role_grants(
&request.database,
&request.schema,
&request.privileges,
&request.role,
)
.await;
match res {
Ok(()) => render_json(Body::from(
serde_json::to_string(&SetRoleGrantsResponse {
database: request.database,
schema: request.schema,
role: request.role,
privileges: request.privileges,
})
.unwrap(),
)),
Err(e) => render_json_error(
&format!("could not grant role privileges to the schema: {e}"),
// TODO: can we filter on role/schema not found errors
// and return appropriate error code?
StatusCode::INTERNAL_SERVER_ERROR,
),
}
}
// get the list of installed extensions
// currently only used in python tests
// TODO: call it from cplane
(&Method::GET, "/installed_extensions") => {
info!("serving /installed_extensions GET request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for extensions request: {:?}",
status
);
error!(msg);
return Response::new(Body::from(msg));
}
let conf = compute.get_conn_conf(None);
let res =
task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf))
.await
.unwrap();
match res {
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
Err(e) => render_json_error(
&format!("could not get list of installed extensions: {}", e),
StatusCode::INTERNAL_SERVER_ERROR,
),
}
}
(&Method::POST, "/failpoints") if cfg!(feature = "testing") => {
match failpoints_handler(req, CancellationToken::new()).await {
Ok(r) => r,
Err(ApiError::BadRequest(e)) => {
render_json_error(&e.to_string(), StatusCode::BAD_REQUEST)
}
Err(_) => {
render_json_error("Internal server error", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
// download extension files from remote extension storage on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route);
info!("req.uri {:?}", req.uri());
// don't even try to download extensions
// if no remote storage is configured
if compute.ext_remote_storage.is_none() {
info!("no extensions remote storage configured");
let mut resp = Response::new(Body::from("no remote storage configured"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return resp;
}
let mut is_library = false;
if let Some(params) = req.uri().query() {
info!("serving {:?} POST request with params: {}", route, params);
if params == "is_library=true" {
is_library = true;
} else {
let mut resp = Response::new(Body::from("Wrong request parameters"));
*resp.status_mut() = StatusCode::BAD_REQUEST;
return resp;
}
}
let filename = route.split('/').last().unwrap().to_string();
info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
// get ext_name and path from spec
// don't lock compute_state for too long
let ext = {
let compute_state = compute.state.lock().unwrap();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
let spec = &pspec.spec;
// debug only
info!("spec: {:?}", spec);
let remote_extensions = match spec.remote_extensions.as_ref() {
Some(r) => r,
None => {
info!("no remote extensions spec was provided");
let mut resp = Response::new(Body::from("no remote storage configured"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return resp;
}
};
remote_extensions.get_ext(
&filename,
is_library,
&compute.build_tag,
&compute.pgversion,
)
};
match ext {
Ok((ext_name, ext_path)) => {
match compute.download_extension(ext_name, ext_path).await {
Ok(_) => Response::new(Body::from("OK")),
Err(e) => {
error!("extension download failed: {}", e);
let mut resp = Response::new(Body::from(e.to_string()));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
}
}
Err(e) => {
warn!("extension download failed to find extension: {}", e);
let mut resp = Response::new(Body::from("failed to find file"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
}
}
// Return the `404 Not Found` for any other routes.
_ => {
let mut not_found = Response::new(Body::from("404 Not Found"));
*not_found.status_mut() = StatusCode::NOT_FOUND;
not_found
}
}
}
async fn handle_configure_request(
req: Request<Body>,
compute: &Arc<ComputeNode>,
) -> Result<String, (String, StatusCode)> {
if !compute.live_config_allowed {
return Err((
"live configuration is not allowed for this compute node".to_string(),
StatusCode::PRECONDITION_FAILED,
));
}
let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap();
let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap();
if let Ok(request) = serde_json::from_str::<ConfigurationRequest>(&spec_raw) {
let spec = request.spec;
let parsed_spec = match ParsedSpec::try_from(spec) {
Ok(ps) => ps,
Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
};
// XXX: wrap state update under lock in code blocks. Otherwise,
// we will try to `Send` `mut state` into the spawned thread
// bellow, which will cause error:
// ```
// error: future cannot be sent between threads safely
// ```
{
let mut state = compute.state.lock().unwrap();
if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for configuration request: {:?}",
state.status.clone()
);
return Err((msg, StatusCode::PRECONDITION_FAILED));
}
state.pspec = Some(parsed_spec);
state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
drop(state);
info!("set new spec and notified waiters");
}
// Spawn a blocking thread to wait for compute to become Running.
// This is needed to do not block the main pool of workers and
// be able to serve other requests while some particular request
// is waiting for compute to finish configuration.
let c = compute.clone();
task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Running {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become Running, current status: {:?}",
state.status
);
if state.status == ComputeStatus::Failed {
let err = state.error.as_ref().map_or("unknown error", |x| x);
let msg = format!("compute configuration failed: {:?}", err);
return Err((msg, StatusCode::INTERNAL_SERVER_ERROR));
}
}
Ok(())
})
.await
.unwrap()?;
// Return current compute state if everything went well.
let state = compute.state.lock().unwrap().clone();
let status_response = status_response_from_state(&state);
Ok(serde_json::to_string(&status_response).unwrap())
} else {
Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST))
}
}
fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
let error = GenericAPIError {
error: e.to_string(),
};
Response::builder()
.status(status)
.header(CONTENT_TYPE, "application/json")
.body(Body::from(serde_json::to_string(&error).unwrap()))
.unwrap()
}
fn render_json(body: Body) -> Response<Body> {
Response::builder()
.header(CONTENT_TYPE, "application/json")
.body(body)
.unwrap()
}
fn render_plain(body: Body) -> Response<Body> {
Response::builder()
.header(CONTENT_TYPE, "text/plain")
.body(body)
.unwrap()
}
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
{
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::Terminated {
return Ok(());
}
if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for termination request: {}",
state.status
);
return Err((msg, StatusCode::PRECONDITION_FAILED));
}
state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
drop(state);
}
forward_termination_signal();
info!("sent signal and notified waiters");
// Spawn a blocking thread to wait for compute to become Terminated.
// This is needed to do not block the main pool of workers and
// be able to serve other requests while some particular request
// is waiting for compute to finish configuration.
let c = compute.clone();
task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Terminated {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become {}, current status: {:?}",
ComputeStatus::Terminated,
state.status
);
}
Ok(())
})
.await
.unwrap()?;
info!("terminated Postgres");
Ok(())
}
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
#[tokio::main]
async fn serve(port: u16, state: Arc<ComputeNode>) {
// this usually binds to both IPv4 and IPv6 on linux
// see e.g. https://github.com/rust-lang/rust/pull/34440
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
let make_service = make_service_fn(move |_conn| {
let state = state.clone();
async move {
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
let state = state.clone();
async move {
Ok::<_, Infallible>(
// NOTE: We include the URI path in the string. It
// doesn't contain any variable parts or sensitive
// information in this API.
tracing_utils::http::tracing_handler(
req,
|req| routes(req, &state),
OtelName::UriPath,
)
.await,
)
}
}))
}
});
info!("starting HTTP server on {}", addr);
let server = Server::bind(&addr).serve(make_service);
// Run this server forever
if let Err(e) = server.await {
error!("server error: {}", e);
}
}
/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
let state = Arc::clone(state);
Ok(thread::Builder::new()
.name("http-endpoint".into())
.spawn(move || serve(port, state))?)
}

View File

@@ -1,48 +0,0 @@
use std::ops::{Deref, DerefMut};
use axum::{
async_trait,
extract::{rejection::JsonRejection, FromRequest, Request},
};
use compute_api::responses::GenericAPIError;
use http::StatusCode;
/// Custom `Json` extractor, so that we can format errors into
/// `JsonResponse<GenericAPIError>`.
#[derive(Debug, Clone, Copy, Default)]
pub(crate) struct Json<T>(pub T);
#[async_trait]
impl<S, T> FromRequest<S> for Json<T>
where
axum::Json<T>: FromRequest<S, Rejection = JsonRejection>,
S: Send + Sync,
{
type Rejection = (StatusCode, axum::Json<GenericAPIError>);
async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
match axum::Json::<T>::from_request(req, state).await {
Ok(value) => Ok(Self(value.0)),
Err(rejection) => Err((
rejection.status(),
axum::Json(GenericAPIError {
error: rejection.body_text().to_lowercase(),
}),
)),
}
}
}
impl<T> Deref for Json<T> {
type Target = T;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<T> DerefMut for Json<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}

View File

@@ -1,7 +0,0 @@
pub(crate) mod json;
pub(crate) mod path;
pub(crate) mod query;
pub(crate) use json::Json;
pub(crate) use path::Path;
pub(crate) use query::Query;

View File

@@ -1,48 +0,0 @@
use std::ops::{Deref, DerefMut};
use axum::{
async_trait,
extract::{rejection::PathRejection, FromRequestParts},
};
use compute_api::responses::GenericAPIError;
use http::{request::Parts, StatusCode};
/// Custom `Path` extractor, so that we can format errors into
/// `JsonResponse<GenericAPIError>`.
#[derive(Debug, Clone, Copy, Default)]
pub(crate) struct Path<T>(pub T);
#[async_trait]
impl<S, T> FromRequestParts<S> for Path<T>
where
axum::extract::Path<T>: FromRequestParts<S, Rejection = PathRejection>,
S: Send + Sync,
{
type Rejection = (StatusCode, axum::Json<GenericAPIError>);
async fn from_request_parts(parts: &mut Parts, state: &S) -> Result<Self, Self::Rejection> {
match axum::extract::Path::<T>::from_request_parts(parts, state).await {
Ok(value) => Ok(Self(value.0)),
Err(rejection) => Err((
rejection.status(),
axum::Json(GenericAPIError {
error: rejection.body_text().to_ascii_lowercase(),
}),
)),
}
}
}
impl<T> Deref for Path<T> {
type Target = T;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<T> DerefMut for Path<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}

View File

@@ -1,48 +0,0 @@
use std::ops::{Deref, DerefMut};
use axum::{
async_trait,
extract::{rejection::QueryRejection, FromRequestParts},
};
use compute_api::responses::GenericAPIError;
use http::{request::Parts, StatusCode};
/// Custom `Query` extractor, so that we can format errors into
/// `JsonResponse<GenericAPIError>`.
#[derive(Debug, Clone, Copy, Default)]
pub(crate) struct Query<T>(pub T);
#[async_trait]
impl<S, T> FromRequestParts<S> for Query<T>
where
axum::extract::Query<T>: FromRequestParts<S, Rejection = QueryRejection>,
S: Send + Sync,
{
type Rejection = (StatusCode, axum::Json<GenericAPIError>);
async fn from_request_parts(parts: &mut Parts, state: &S) -> Result<Self, Self::Rejection> {
match axum::extract::Query::<T>::from_request_parts(parts, state).await {
Ok(value) => Ok(Self(value.0)),
Err(rejection) => Err((
rejection.status(),
axum::Json(GenericAPIError {
error: rejection.body_text().to_ascii_lowercase(),
}),
)),
}
}
}
impl<T> Deref for Query<T> {
type Target = T;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<T> DerefMut for Query<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}

View File

@@ -1,56 +1 @@
use axum::{body::Body, response::Response};
use compute_api::responses::{ComputeStatus, GenericAPIError};
use http::{header::CONTENT_TYPE, StatusCode};
use serde::Serialize;
use tracing::error;
pub use server::launch_http_server;
mod extract;
mod routes;
mod server;
/// Convenience response builder for JSON responses
struct JsonResponse;
impl JsonResponse {
/// Helper for actually creating a response
fn create_response(code: StatusCode, body: impl Serialize) -> Response {
Response::builder()
.status(code)
.header(CONTENT_TYPE.as_str(), "application/json")
.body(Body::from(serde_json::to_string(&body).unwrap()))
.unwrap()
}
/// Create a successful error response
pub(self) fn success(code: StatusCode, body: impl Serialize) -> Response {
assert!({
let code = code.as_u16();
(200..300).contains(&code)
});
Self::create_response(code, body)
}
/// Create an error response
pub(self) fn error(code: StatusCode, error: impl ToString) -> Response {
assert!(code.as_u16() >= 400);
let message = error.to_string();
error!(message);
Self::create_response(code, &GenericAPIError { error: message })
}
/// Create an error response related to the compute being in an invalid state
pub(self) fn invalid_status(status: ComputeStatus) -> Response {
Self::create_response(
StatusCode::PRECONDITION_FAILED,
&GenericAPIError {
error: format!("invalid compute status: {status}"),
},
)
}
}
pub mod api;

View File

@@ -37,7 +37,7 @@ paths:
schema:
$ref: "#/components/schemas/ComputeMetrics"
/metrics:
/metrics
get:
tags:
- Info

View File

@@ -1,20 +0,0 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::responses::ComputeStatus;
use http::StatusCode;
use crate::{checker::check_writability, compute::ComputeNode, http::JsonResponse};
/// Check that the compute is currently running.
pub(in crate::http) async fn is_writable(State(compute): State<Arc<ComputeNode>>) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
}
match check_writability(&compute).await {
Ok(_) => JsonResponse::success(StatusCode::OK, true),
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
}
}

View File

@@ -1,91 +0,0 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::{
requests::ConfigurationRequest,
responses::{ComputeStatus, ComputeStatusResponse},
};
use http::StatusCode;
use tokio::task;
use tracing::info;
use crate::{
compute::{ComputeNode, ParsedSpec},
http::{extract::Json, JsonResponse},
};
// Accept spec in JSON format and request compute configuration. If anything
// goes wrong after we set the compute status to `ConfigurationPending` and
// update compute state with new spec, we basically leave compute in the
// potentially wrong state. That said, it's control-plane's responsibility to
// watch compute state after reconfiguration request and to clean restart in
// case of errors.
pub(in crate::http) async fn configure(
State(compute): State<Arc<ComputeNode>>,
request: Json<ConfigurationRequest>,
) -> Response {
if !compute.live_config_allowed {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"live configuration is not allowed for this compute node".to_string(),
);
}
let pspec = match ParsedSpec::try_from(request.spec.clone()) {
Ok(p) => p,
Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
};
// XXX: wrap state update under lock in a code block. Otherwise, we will try
// to `Send` `mut state` into the spawned thread bellow, which will cause
// the following rustc error:
//
// error: future cannot be sent between threads safely
{
let mut state = compute.state.lock().unwrap();
if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
return JsonResponse::invalid_status(state.status);
}
state.pspec = Some(pspec);
state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
drop(state);
}
// Spawn a blocking thread to wait for compute to become Running. This is
// needed to do not block the main pool of workers and be able to serve
// other requests while some particular request is waiting for compute to
// finish configuration.
let c = compute.clone();
let completed = task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Running {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become {}, current status: {}",
ComputeStatus::Running,
state.status
);
if state.status == ComputeStatus::Failed {
let err = state.error.as_ref().map_or("unknown error", |x| x);
let msg = format!("compute configuration failed: {:?}", err);
return Err(msg);
}
}
Ok(())
})
.await
.unwrap();
if let Err(e) = completed {
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e);
}
// Return current compute state if everything went well.
let state = compute.state.lock().unwrap().clone();
let body = ComputeStatusResponse::from(&state);
JsonResponse::success(StatusCode::OK, body)
}

View File

@@ -1,34 +0,0 @@
use std::sync::Arc;
use axum::{body::Body, extract::State, response::Response};
use http::{header::CONTENT_TYPE, StatusCode};
use serde::Deserialize;
use crate::{
catalog::{get_database_schema, SchemaDumpError},
compute::ComputeNode,
http::{extract::Query, JsonResponse},
};
#[derive(Debug, Clone, Deserialize)]
pub(in crate::http) struct DatabaseSchemaParams {
database: String,
}
/// Get a schema dump of the requested database.
pub(in crate::http) async fn get_schema_dump(
params: Query<DatabaseSchemaParams>,
State(compute): State<Arc<ComputeNode>>,
) -> Response {
match get_database_schema(&compute, &params.database).await {
Ok(schema) => Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE.as_str(), "application/json")
.body(Body::from_stream(schema))
.unwrap(),
Err(SchemaDumpError::DatabaseDoesNotExist) => {
JsonResponse::error(StatusCode::NOT_FOUND, SchemaDumpError::DatabaseDoesNotExist)
}
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
}
}

View File

@@ -1,16 +0,0 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use http::StatusCode;
use crate::{catalog::get_dbs_and_roles, compute::ComputeNode, http::JsonResponse};
/// Get the databases and roles from the compute.
pub(in crate::http) async fn get_catalog_objects(
State(compute): State<Arc<ComputeNode>>,
) -> Response {
match get_dbs_and_roles(&compute).await {
Ok(catalog_objects) => JsonResponse::success(StatusCode::OK, catalog_objects),
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
}
}

View File

@@ -1,67 +0,0 @@
use std::sync::Arc;
use axum::{
extract::State,
response::{IntoResponse, Response},
};
use http::StatusCode;
use serde::Deserialize;
use crate::{
compute::ComputeNode,
http::{
extract::{Path, Query},
JsonResponse,
},
};
#[derive(Debug, Clone, Deserialize)]
pub(in crate::http) struct ExtensionServerParams {
is_library: Option<bool>,
}
/// Download a remote extension.
pub(in crate::http) async fn download_extension(
Path(filename): Path<String>,
params: Query<ExtensionServerParams>,
State(compute): State<Arc<ComputeNode>>,
) -> Response {
// Don't even try to download extensions if no remote storage is configured
if compute.ext_remote_storage.is_none() {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"remote storage is not configured",
);
}
let ext = {
let state = compute.state.lock().unwrap();
let pspec = state.pspec.as_ref().unwrap();
let spec = &pspec.spec;
let remote_extensions = match spec.remote_extensions.as_ref() {
Some(r) => r,
None => {
return JsonResponse::error(
StatusCode::CONFLICT,
"information about remote extensions is unavailable",
);
}
};
remote_extensions.get_ext(
&filename,
params.is_library.unwrap_or(false),
&compute.build_tag,
&compute.pgversion,
)
};
match ext {
Ok((ext_name, ext_path)) => match compute.download_extension(ext_name, ext_path).await {
Ok(_) => StatusCode::OK.into_response(),
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
},
Err(e) => JsonResponse::error(StatusCode::NOT_FOUND, e),
}
}

View File

@@ -1,45 +0,0 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::{
requests::ExtensionInstallRequest,
responses::{ComputeStatus, ExtensionInstallResponse},
};
use http::StatusCode;
use crate::{
compute::ComputeNode,
http::{extract::Json, JsonResponse},
};
/// Install a extension.
pub(in crate::http) async fn install_extension(
State(compute): State<Arc<ComputeNode>>,
request: Json<ExtensionInstallRequest>,
) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
}
match compute
.install_extension(
&request.extension,
&request.database,
request.version.to_string(),
)
.await
{
Ok(version) => JsonResponse::success(
StatusCode::CREATED,
Some(ExtensionInstallResponse {
extension: request.extension.clone(),
version,
}),
),
Err(e) => JsonResponse::error(
StatusCode::INTERNAL_SERVER_ERROR,
format!("failed to install extension: {e}"),
),
}
}

View File

@@ -1,35 +0,0 @@
use axum::response::{IntoResponse, Response};
use http::StatusCode;
use tracing::info;
use utils::failpoint_support::{apply_failpoint, ConfigureFailpointsRequest};
use crate::http::{extract::Json, JsonResponse};
/// Configure failpoints for testing purposes.
pub(in crate::http) async fn configure_failpoints(
failpoints: Json<ConfigureFailpointsRequest>,
) -> Response {
if !fail::has_failpoints() {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"Cannot manage failpoints because neon was compiled without failpoints support",
);
}
for fp in &*failpoints {
info!("cfg failpoint: {} {}", fp.name, fp.actions);
// We recognize one extra "action" that's not natively recognized
// by the failpoints crate: exit, to immediately kill the process
let cfg_result = apply_failpoint(&fp.name, &fp.actions);
if let Err(e) = cfg_result {
return JsonResponse::error(
StatusCode::BAD_REQUEST,
format!("failed to configure failpoints: {e}"),
);
}
}
StatusCode::OK.into_response()
}

View File

@@ -1,48 +0,0 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::{
requests::SetRoleGrantsRequest,
responses::{ComputeStatus, SetRoleGrantsResponse},
};
use http::StatusCode;
use crate::{
compute::ComputeNode,
http::{extract::Json, JsonResponse},
};
/// Add grants for a role.
pub(in crate::http) async fn add_grant(
State(compute): State<Arc<ComputeNode>>,
request: Json<SetRoleGrantsRequest>,
) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
}
match compute
.set_role_grants(
&request.database,
&request.schema,
&request.privileges,
&request.role,
)
.await
{
Ok(()) => JsonResponse::success(
StatusCode::CREATED,
Some(SetRoleGrantsResponse {
database: request.database.clone(),
schema: request.schema.clone(),
role: request.role.clone(),
privileges: request.privileges.clone(),
}),
),
Err(e) => JsonResponse::error(
StatusCode::INTERNAL_SERVER_ERROR,
format!("failed to grant role privileges to the schema: {e}"),
),
}
}

View File

@@ -1,11 +0,0 @@
use axum::response::Response;
use compute_api::responses::InfoResponse;
use http::StatusCode;
use crate::http::JsonResponse;
/// Get information about the physical characteristics about the compute.
pub(in crate::http) async fn get_info() -> Response {
let num_cpus = num_cpus::get_physical();
JsonResponse::success(StatusCode::OK, &InfoResponse { num_cpus })
}

View File

@@ -1,18 +0,0 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::responses::ComputeStatus;
use http::StatusCode;
use crate::{compute::ComputeNode, http::JsonResponse};
/// Collect current Postgres usage insights.
pub(in crate::http) async fn get_insights(State(compute): State<Arc<ComputeNode>>) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
}
let insights = compute.collect_insights().await;
JsonResponse::success(StatusCode::OK, insights)
}

View File

@@ -1,33 +0,0 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::responses::ComputeStatus;
use http::StatusCode;
use tokio::task;
use crate::{compute::ComputeNode, http::JsonResponse, installed_extensions};
/// Get a list of installed extensions.
pub(in crate::http) async fn get_installed_extensions(
State(compute): State<Arc<ComputeNode>>,
) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
}
let conf = compute.get_conn_conf(None);
let res = task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf))
.await
.unwrap();
match res {
Ok(installed_extensions) => {
JsonResponse::success(StatusCode::OK, Some(installed_extensions))
}
Err(e) => JsonResponse::error(
StatusCode::INTERNAL_SERVER_ERROR,
format!("failed to get list of installed extensions: {e}"),
),
}
}

View File

@@ -1,32 +0,0 @@
use axum::{body::Body, response::Response};
use http::header::CONTENT_TYPE;
use http::StatusCode;
use metrics::proto::MetricFamily;
use metrics::Encoder;
use metrics::TextEncoder;
use crate::{http::JsonResponse, installed_extensions};
/// Expose Prometheus metrics.
pub(in crate::http) async fn get_metrics() -> Response {
// When we call TextEncoder::encode() below, it will immediately return an
// error if a metric family has no metrics, so we need to preemptively
// filter out metric families with no metrics.
let metrics = installed_extensions::collect()
.into_iter()
.filter(|m| !m.get_metric().is_empty())
.collect::<Vec<MetricFamily>>();
let encoder = TextEncoder::new();
let mut buffer = vec![];
if let Err(e) = encoder.encode(&metrics, &mut buffer) {
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e);
}
Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, encoder.format_type())
.body(Body::from(buffer))
.unwrap()
}

View File

@@ -1,12 +0,0 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use http::StatusCode;
use crate::{compute::ComputeNode, http::JsonResponse};
/// Get startup metrics.
pub(in crate::http) async fn get_metrics(State(compute): State<Arc<ComputeNode>>) -> Response {
let metrics = compute.state.lock().unwrap().metrics.clone();
JsonResponse::success(StatusCode::OK, metrics)
}

View File

@@ -1,38 +0,0 @@
use compute_api::responses::ComputeStatusResponse;
use crate::compute::ComputeState;
pub(in crate::http) mod check_writability;
pub(in crate::http) mod configure;
pub(in crate::http) mod database_schema;
pub(in crate::http) mod dbs_and_roles;
pub(in crate::http) mod extension_server;
pub(in crate::http) mod extensions;
pub(in crate::http) mod failpoints;
pub(in crate::http) mod grants;
pub(in crate::http) mod info;
pub(in crate::http) mod insights;
pub(in crate::http) mod installed_extensions;
pub(in crate::http) mod metrics;
pub(in crate::http) mod metrics_json;
pub(in crate::http) mod status;
pub(in crate::http) mod terminate;
impl From<&ComputeState> for ComputeStatusResponse {
fn from(state: &ComputeState) -> Self {
ComputeStatusResponse {
start_time: state.start_time,
tenant: state
.pspec
.as_ref()
.map(|pspec| pspec.tenant_id.to_string()),
timeline: state
.pspec
.as_ref()
.map(|pspec| pspec.timeline_id.to_string()),
status: state.status,
last_active: state.last_active,
error: state.error.clone(),
}
}
}

View File

@@ -1,14 +0,0 @@
use std::{ops::Deref, sync::Arc};
use axum::{extract::State, http::StatusCode, response::Response};
use compute_api::responses::ComputeStatusResponse;
use crate::{compute::ComputeNode, http::JsonResponse};
/// Retrieve the state of the comute.
pub(in crate::http) async fn get_status(State(compute): State<Arc<ComputeNode>>) -> Response {
let state = compute.state.lock().unwrap();
let body = ComputeStatusResponse::from(state.deref());
JsonResponse::success(StatusCode::OK, body)
}

View File

@@ -1,58 +0,0 @@
use std::sync::Arc;
use axum::{
extract::State,
response::{IntoResponse, Response},
};
use compute_api::responses::ComputeStatus;
use http::StatusCode;
use tokio::task;
use tracing::info;
use crate::{
compute::{forward_termination_signal, ComputeNode},
http::JsonResponse,
};
/// Terminate the compute.
pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>) -> Response {
{
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::Terminated {
return StatusCode::CREATED.into_response();
}
if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
return JsonResponse::invalid_status(state.status);
}
state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
drop(state);
}
forward_termination_signal();
info!("sent signal and notified waiters");
// Spawn a blocking thread to wait for compute to become Terminated.
// This is needed to do not block the main pool of workers and
// be able to serve other requests while some particular request
// is waiting for compute to finish configuration.
let c = compute.clone();
task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Terminated {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become {}, current status: {:?}",
ComputeStatus::Terminated,
state.status
);
}
})
.await
.unwrap();
info!("terminated Postgres");
StatusCode::OK.into_response()
}

View File

@@ -1,165 +0,0 @@
use std::{
net::{IpAddr, Ipv6Addr, SocketAddr},
sync::{
atomic::{AtomicU64, Ordering},
Arc,
},
thread,
time::Duration,
};
use anyhow::Result;
use axum::{
response::{IntoResponse, Response},
routing::{get, post},
Router,
};
use http::StatusCode;
use tokio::net::TcpListener;
use tower::ServiceBuilder;
use tower_http::{
request_id::{MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer},
trace::TraceLayer,
};
use tracing::{debug, error, info, Span};
use super::routes::{
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
grants, info as info_route, insights, installed_extensions, metrics, metrics_json, status,
terminate,
};
use crate::compute::ComputeNode;
async fn handle_404() -> Response {
StatusCode::NOT_FOUND.into_response()
}
#[derive(Clone, Default)]
struct ComputeMakeRequestId(Arc<AtomicU64>);
impl MakeRequestId for ComputeMakeRequestId {
fn make_request_id<B>(
&mut self,
_request: &http::Request<B>,
) -> Option<tower_http::request_id::RequestId> {
let request_id = self
.0
.fetch_add(1, Ordering::SeqCst)
.to_string()
.parse()
.unwrap();
Some(RequestId::new(request_id))
}
}
/// Run the HTTP server and wait on it forever.
#[tokio::main]
async fn serve(port: u16, compute: Arc<ComputeNode>) {
const X_REQUEST_ID: &str = "x-request-id";
let mut app = Router::new()
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route(
"/extension_server/*filename",
post(extension_server::download_extension),
)
.route("/extensions", post(extensions::install_extension))
.route("/grants", post(grants::add_grant))
.route("/info", get(info_route::get_info))
.route("/insights", get(insights::get_insights))
.route(
"/installed_extensions",
get(installed_extensions::get_installed_extensions),
)
.route("/metrics", get(metrics::get_metrics))
.route("/metrics.json", get(metrics_json::get_metrics))
.route("/status", get(status::get_status))
.route("/terminate", post(terminate::terminate))
.fallback(handle_404)
.layer(
ServiceBuilder::new()
.layer(SetRequestIdLayer::x_request_id(
ComputeMakeRequestId::default(),
))
.layer(
TraceLayer::new_for_http()
.on_request(|request: &http::Request<_>, _span: &Span| {
let request_id = request
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
match request.uri().path() {
"/metrics" => {
debug!(%request_id, "{} {}", request.method(), request.uri())
}
_ => info!(%request_id, "{} {}", request.method(), request.uri()),
};
})
.on_response(
|response: &http::Response<_>, latency: Duration, _span: &Span| {
let request_id = response
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
info!(
%request_id,
code = response.status().as_u16(),
latency = latency.as_millis()
)
},
),
)
.layer(PropagateRequestIdLayer::x_request_id()),
)
.with_state(compute);
// Add in any testing support
if cfg!(feature = "testing") {
use super::routes::failpoints;
app = app.route("/failpoints", post(failpoints::configure_failpoints))
}
// This usually binds to both IPv4 and IPv6 on Linux, see
// https://github.com/rust-lang/rust/pull/34440 for more information
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
let listener = match TcpListener::bind(&addr).await {
Ok(listener) => listener,
Err(e) => {
error!(
"failed to bind the compute_ctl HTTP server to port {}: {}",
port, e
);
return;
}
};
if let Ok(local_addr) = listener.local_addr() {
info!("compute_ctl HTTP server listening on {}", local_addr);
} else {
info!("compute_ctl HTTP server listening on port {}", port);
}
if let Err(e) = axum::serve(listener, app).await {
error!("compute_ctl HTTP server error: {}", e);
}
}
/// Launch a separate HTTP server thread and return its `JoinHandle`.
pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
let state = Arc::clone(state);
Ok(thread::Builder::new()
.name("http-server".into())
.spawn(move || serve(port, state))?)
}

View File

@@ -3,6 +3,8 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
extern crate hyper0 as hyper;
pub mod checker;
pub mod config;
pub mod configurator;

View File

@@ -1,6 +1,6 @@
use anyhow::{Context, Result};
use fail::fail_point;
use postgres::{Client, Transaction};
use postgres::Client;
use tracing::info;
/// Runs a series of migrations on a target database
@@ -20,9 +20,11 @@ impl<'m> MigrationRunner<'m> {
/// Get the current value neon_migration.migration_id
fn get_migration_id(&mut self) -> Result<i64> {
let query = "SELECT id FROM neon_migration.migration_id";
let row = self
.client
.query_one("SELECT id FROM neon_migration.migration_id", &[])?;
.query_one(query, &[])
.context("run_migrations get migration_id")?;
Ok(row.get::<&str, i64>("id"))
}
@@ -32,7 +34,7 @@ impl<'m> MigrationRunner<'m> {
/// This function has a fail point called compute-migration, which can be
/// used if you would like to fail the application of a series of migrations
/// at some point.
fn update_migration_id(txn: &mut Transaction, migration_id: i64) -> Result<()> {
fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
// We use this fail point in order to check that failing in the
// middle of applying a series of migrations fails in an expected
// manner
@@ -53,11 +55,12 @@ impl<'m> MigrationRunner<'m> {
}
}
txn.query(
"UPDATE neon_migration.migration_id SET id = $1",
&[&migration_id],
)
.with_context(|| format!("update neon_migration.migration_id to {migration_id}"))?;
self.client
.query(
"UPDATE neon_migration.migration_id SET id = $1",
&[&migration_id],
)
.context("run_migrations update id")?;
Ok(())
}
@@ -78,50 +81,53 @@ impl<'m> MigrationRunner<'m> {
Ok(())
}
/// Run an individual migration
fn run_migration(txn: &mut Transaction, migration_id: i64, migration: &str) -> Result<()> {
if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", migration_id);
// Even though we are skipping the migration, updating the
// migration ID should help keep logic easy to understand when
// trying to understand the state of a cluster.
Self::update_migration_id(txn, migration_id)?;
} else {
info!("Running migration id={}:\n{}\n", migration_id, migration);
txn.simple_query(migration)
.with_context(|| format!("apply migration {migration_id}"))?;
Self::update_migration_id(txn, migration_id)?;
}
Ok(())
}
/// Run the configured set of migrations
/// Run the configrured set of migrations
pub fn run_migrations(mut self) -> Result<()> {
self.prepare_database()
.context("prepare database to handle migrations")?;
self.prepare_database()?;
let mut current_migration = self.get_migration_id()? as usize;
while current_migration < self.migrations.len() {
// The index lags the migration ID by 1, so the current migration
// ID is also the next index
let migration_id = (current_migration + 1) as i64;
macro_rules! migration_id {
($cm:expr) => {
($cm + 1) as i64
};
}
let mut txn = self
.client
.transaction()
.with_context(|| format!("begin transaction for migration {migration_id}"))?;
let migration = self.migrations[current_migration];
Self::run_migration(&mut txn, migration_id, self.migrations[current_migration])
.with_context(|| format!("running migration {migration_id}"))?;
if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", migration_id!(current_migration));
txn.commit()
.with_context(|| format!("commit transaction for migration {migration_id}"))?;
// Even though we are skipping the migration, updating the
// migration ID should help keep logic easy to understand when
// trying to understand the state of a cluster.
self.update_migration_id(migration_id!(current_migration))?;
} else {
info!(
"Running migration id={}:\n{}\n",
migration_id!(current_migration),
migration
);
info!("Finished migration id={}", migration_id);
self.client
.simple_query("BEGIN")
.context("begin migration")?;
self.client.simple_query(migration).with_context(|| {
format!(
"run_migrations migration id={}",
migration_id!(current_migration)
)
})?;
self.update_migration_id(migration_id!(current_migration))?;
self.client
.simple_query("COMMIT")
.context("commit migration")?;
info!("Finished migration id={}", migration_id!(current_migration));
}
current_migration += 1;
}

View File

@@ -47,7 +47,6 @@ pub enum PerDatabasePhase {
DeleteDBRoleReferences,
ChangeSchemaPerms,
HandleAnonExtension,
DropSubscriptionsForDeletedDatabases,
}
#[derive(Clone, Debug)]
@@ -75,7 +74,7 @@ pub struct MutableApplyContext {
pub dbs: HashMap<String, Database>,
}
/// Apply the operations that belong to the given spec apply phase.
/// Appply the operations that belong to the given spec apply phase.
///
/// Commands within a single phase are executed in order of Iterator yield.
/// Commands of ApplySpecPhase::RunInEachDatabase will execute in the database
@@ -327,12 +326,13 @@ async fn get_operations<'a>(
// Use FORCE to drop database even if there are active connections.
// We run this from `cloud_admin`, so it should have enough privileges.
//
// NB: there could be other db states, which prevent us from dropping
// the database. For example, if db is used by any active subscription
// or replication slot.
// Such cases are handled in the DropSubscriptionsForDeletedDatabases
// phase. We do all the cleanup before actually dropping the database.
// TODO: deal with it once we allow logical replication. Proper fix should
// involve returning an error code to the control plane, so it could
// figure out that this is a non-retryable error, return it to the user
// and fail operation permanently.
let drop_db_query: String = format!(
"DROP DATABASE IF EXISTS {} WITH (FORCE)",
&op.name.pg_quote()
@@ -444,30 +444,6 @@ async fn get_operations<'a>(
}
ApplySpecPhase::RunInEachDatabase { db, subphase } => {
match subphase {
PerDatabasePhase::DropSubscriptionsForDeletedDatabases => {
match &db {
DB::UserDB(db) => {
let drop_subscription_query: String = format!(
include_str!("sql/drop_subscription_for_drop_dbs.sql"),
datname_str = escape_literal(&db.name),
);
let operations = vec![Operation {
query: drop_subscription_query,
comment: Some(format!(
"optionally dropping subscriptions for DB {}",
db.name,
)),
}]
.into_iter();
Ok(Box::new(operations))
}
// skip this cleanup for the system databases
// because users can't drop them
DB::SystemDB => Ok(Box::new(empty())),
}
}
PerDatabasePhase::DeleteDBRoleReferences => {
let ctx = ctx.read().await;
@@ -498,19 +474,7 @@ async fn get_operations<'a>(
),
comment: None,
},
// Revoke some potentially blocking privileges (Neon-specific currently)
Operation {
query: format!(
include_str!("sql/pre_drop_role_revoke_privileges.sql"),
role_name = quoted,
),
comment: None,
},
// This now will only drop privileges of the role
// TODO: this is obviously not 100% true because of the above case,
// there could be still some privileges that are not revoked. Maybe this
// only drops privileges that were granted *by this* role, not *to this* role,
// but this has to be checked.
Operation {
query: format!("DROP OWNED BY {}", quoted),
comment: None,

View File

@@ -1,11 +0,0 @@
DO $$
DECLARE
subname TEXT;
BEGIN
FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP
EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname);
EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
END LOOP;
END;
$$;

View File

@@ -1,28 +0,0 @@
SET SESSION ROLE neon_superuser;
DO $$
DECLARE
schema TEXT;
revoke_query TEXT;
BEGIN
FOR schema IN
SELECT schema_name
FROM information_schema.schemata
-- So far, we only had issues with 'public' schema. Probably, because we do some additional grants,
-- e.g., make DB owner the owner of 'public' schema automatically (when created via API).
-- See https://github.com/neondatabase/cloud/issues/13582 for the context.
-- Still, keep the loop because i) it efficiently handles the case when there is no 'public' schema,
-- ii) it's easy to add more schemas to the list if needed.
WHERE schema_name IN ('public')
LOOP
revoke_query := format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;',
schema
);
EXECUTE revoke_query;
END LOOP;
END;
$$;
RESET ROLE;

View File

@@ -62,7 +62,7 @@ use crate::local_env::LocalEnv;
use crate::postgresql_conf::PostgresConf;
use crate::storage_controller::StorageController;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
use compute_api::responses::{ComputeState, ComputeStatus};
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
// contents of a endpoint.json file
@@ -739,7 +739,7 @@ impl Endpoint {
}
// Call the /status HTTP API
pub async fn get_status(&self) -> Result<ComputeStatusResponse> {
pub async fn get_status(&self) -> Result<ComputeState> {
let client = reqwest::Client::new();
let response = client

View File

@@ -483,6 +483,7 @@ impl LocalEnv {
.iter()
.find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
.map(|&(_, timeline_id)| timeline_id)
.map(TimelineId::from)
}
pub fn timeline_name_mappings(&self) -> HashMap<TenantTimelineId, String> {

View File

@@ -822,7 +822,10 @@ impl StorageController {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(TenantShardMigrateRequest { node_id }),
Some(TenantShardMigrateRequest {
tenant_shard_id,
node_id,
}),
)
.await
}

View File

@@ -112,13 +112,6 @@ enum Command {
#[arg(long)]
node: NodeId,
},
/// Migrate the secondary location for a tenant shard to a specific pageserver.
TenantShardMigrateSecondary {
#[arg(long)]
tenant_shard_id: TenantShardId,
#[arg(long)]
node: NodeId,
},
/// Cancel any ongoing reconciliation for this shard
TenantShardCancelReconcile {
#[arg(long)]
@@ -547,7 +540,10 @@ async fn main() -> anyhow::Result<()> {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest { node_id: node };
let req = TenantShardMigrateRequest {
tenant_shard_id,
node_id: node,
};
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
@@ -557,20 +553,6 @@ async fn main() -> anyhow::Result<()> {
)
.await?;
}
Command::TenantShardMigrateSecondary {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest { node_id: node };
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate_secondary"),
Some(req),
)
.await?;
}
Command::TenantShardCancelReconcile { tenant_shard_id } => {
storcon_client
.dispatch::<(), ()>(
@@ -933,7 +915,10 @@ async fn main() -> anyhow::Result<()> {
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
Some(TenantShardMigrateRequest { node_id: mv.to }),
Some(TenantShardMigrateRequest {
tenant_shard_id: mv.tenant_shard_id,
node_id: mv.to,
}),
)
.await
.map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
@@ -1050,15 +1035,7 @@ async fn main() -> anyhow::Result<()> {
resp.sort_by(|a, b| a.id.cmp(&b.id));
let mut table = comfy_table::Table::new();
table.set_header([
"Id",
"Version",
"Host",
"Port",
"Http Port",
"AZ Id",
"Scheduling",
]);
table.set_header(["Id", "Version", "Host", "Port", "Http Port", "AZ Id"]);
for sk in resp {
table.add_row([
format!("{}", sk.id),
@@ -1066,8 +1043,7 @@ async fn main() -> anyhow::Result<()> {
sk.host,
format!("{}", sk.port),
format!("{}", sk.http_port),
sk.availability_zone_id.clone(),
String::from(sk.scheduling_policy),
sk.availability_zone_id.to_string(),
]);
}
println!("{table}");

View File

@@ -7,11 +7,15 @@ Currently we build two main images:
- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile).
And additional intermediate image:
- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.
## Build pipeline
We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs
1. `neondatabase/compute-node-v17` (and -16, -v15, -v14)
1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14)
2. `neondatabase/neon`

View File

@@ -15,17 +15,6 @@ pub struct GenericAPIError {
pub error: String,
}
#[derive(Debug, Clone, Serialize)]
pub struct InfoResponse {
pub num_cpus: usize,
}
#[derive(Debug, Clone, Serialize)]
pub struct ExtensionInstallResponse {
pub extension: PgIdent,
pub version: ExtVersion,
}
/// Response of the /status API
#[derive(Serialize, Debug, Deserialize)]
#[serde(rename_all = "snake_case")]
@@ -39,6 +28,16 @@ pub struct ComputeStatusResponse {
pub error: Option<String>,
}
#[derive(Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub struct ComputeState {
pub status: ComputeStatus,
/// Timestamp of the last Postgres activity
#[serde(serialize_with = "rfc3339_serialize")]
pub last_active: Option<DateTime<Utc>>,
pub error: Option<String>,
}
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ComputeStatus {
@@ -79,7 +78,7 @@ impl Display for ComputeStatus {
}
}
pub fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{

View File

@@ -179,6 +179,7 @@ pub struct TenantDescribeResponseShard {
/// specifies some constraints, e.g. asking it to get off particular node(s)
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: NodeId,
}
@@ -319,38 +320,6 @@ impl From<NodeSchedulingPolicy> for String {
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum SkSchedulingPolicy {
Active,
Disabled,
Decomissioned,
}
impl FromStr for SkSchedulingPolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(match s {
"active" => Self::Active,
"disabled" => Self::Disabled,
"decomissioned" => Self::Decomissioned,
_ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
})
}
}
impl From<SkSchedulingPolicy> for String {
fn from(value: SkSchedulingPolicy) -> String {
use SkSchedulingPolicy::*;
match value {
Active => "active",
Disabled => "disabled",
Decomissioned => "decomissioned",
}
.to_string()
}
}
/// Controls how tenant shards are mapped to locations on pageservers, e.g. whether
/// to create secondary locations.
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
@@ -367,16 +336,6 @@ pub enum PlacementPolicy {
Detached,
}
impl PlacementPolicy {
pub fn want_secondaries(&self) -> usize {
match self {
PlacementPolicy::Attached(secondary_count) => *secondary_count,
PlacementPolicy::Secondary => 1,
PlacementPolicy::Detached => 0,
}
}
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}
@@ -428,7 +387,6 @@ pub struct SafekeeperDescribeResponse {
pub port: i32,
pub http_port: i32,
pub availability_zone_id: String,
pub scheduling_policy: SkSchedulingPolicy,
}
#[cfg(test)]

View File

@@ -706,7 +706,7 @@ pub fn repl_origin_key_range() -> Range<Key> {
/// Non inherited range for vectored get.
pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
pub const SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
impl Key {
// AUX_FILES currently stores only data for logical replication (slots etc), and
@@ -714,42 +714,7 @@ impl Key {
// switch (and generally it likely should be optional), so ignore these.
#[inline(always)]
pub fn is_inherited_key(self) -> bool {
if self.is_sparse() {
self.is_inherited_sparse_key()
} else {
!NON_INHERITED_RANGE.contains(&self)
}
}
#[inline(always)]
pub fn is_sparse(self) -> bool {
self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
}
/// Check if the key belongs to the inherited keyspace.
fn is_inherited_sparse_key(self) -> bool {
debug_assert!(self.is_sparse());
self.field1 == RELATION_SIZE_PREFIX
}
pub fn sparse_non_inherited_keyspace() -> Range<Key> {
// The two keys are adjacent; if we will have non-adjancent keys in the future, we should return a keyspace
debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX);
Key {
field1: AUX_KEY_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: REPL_ORIGIN_KEY_PREFIX + 1,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}
!NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self)
}
#[inline(always)]

View File

@@ -272,8 +272,6 @@ pub struct CompactInfoResponse {
pub compact_key_range: Option<CompactKeyRange>,
pub compact_lsn_range: Option<CompactLsnRange>,
pub sub_compaction: bool,
pub running: bool,
pub job_id: usize,
}
#[derive(Serialize, Deserialize, Clone)]
@@ -1462,91 +1460,75 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
// difference in the responses between V1 and V2.
//
// V3 version of protocol adds request ID to all requests. This request ID is also included in response
// as well as other fields from requests, which allows to verify that we receive response for our request.
// We copy fields from request to response to make checking more reliable: request ID is formed from process ID
// and local counter, so in principle there can be duplicated requests IDs if process PID is reused.
//
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[derive(Clone, Copy)]
pub enum PagestreamProtocolVersion {
V2,
V3,
}
pub type RequestId = u64;
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct PagestreamRequest {
pub reqid: RequestId,
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamExistsRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct PagestreamExistsRequest {
pub hdr: PagestreamRequest,
pub rel: RelTag,
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamNblocksRequest {
pub hdr: PagestreamRequest,
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub rel: RelTag,
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetPageRequest {
pub hdr: PagestreamRequest,
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub rel: RelTag,
pub blkno: u32,
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamDbSizeRequest {
pub hdr: PagestreamRequest,
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub dbnode: u32,
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetSlruSegmentRequest {
pub hdr: PagestreamRequest,
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub kind: u8,
pub segno: u32,
}
#[derive(Debug)]
pub struct PagestreamExistsResponse {
pub req: PagestreamExistsRequest,
pub exists: bool,
}
#[derive(Debug)]
pub struct PagestreamNblocksResponse {
pub req: PagestreamNblocksRequest,
pub n_blocks: u32,
}
#[derive(Debug)]
pub struct PagestreamGetPageResponse {
pub req: PagestreamGetPageRequest,
pub page: Bytes,
}
#[derive(Debug)]
pub struct PagestreamGetSlruSegmentResponse {
pub req: PagestreamGetSlruSegmentRequest,
pub segment: Bytes,
}
#[derive(Debug)]
pub struct PagestreamErrorResponse {
pub req: PagestreamRequest,
pub message: String,
}
#[derive(Debug)]
pub struct PagestreamDbSizeResponse {
pub req: PagestreamDbSizeRequest,
pub db_size: i64,
}
@@ -1563,16 +1545,15 @@ pub struct TenantHistorySize {
impl PagestreamFeMessage {
/// Serialize a compute -> pageserver message. This is currently only used in testing
/// tools. Always uses protocol version 3.
/// tools. Always uses protocol version 2.
pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
match self {
Self::Exists(req) => {
bytes.put_u8(0);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -1581,9 +1562,8 @@ impl PagestreamFeMessage {
Self::Nblocks(req) => {
bytes.put_u8(1);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -1592,9 +1572,8 @@ impl PagestreamFeMessage {
Self::GetPage(req) => {
bytes.put_u8(2);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -1604,17 +1583,15 @@ impl PagestreamFeMessage {
Self::DbSize(req) => {
bytes.put_u8(3);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u32(req.dbnode);
}
Self::GetSlruSegment(req) => {
bytes.put_u8(4);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(req.kind);
bytes.put_u32(req.segno);
}
@@ -1623,35 +1600,21 @@ impl PagestreamFeMessage {
bytes.into()
}
pub fn parse<R: std::io::Read>(
body: &mut R,
protocol_version: PagestreamProtocolVersion,
) -> anyhow::Result<PagestreamFeMessage> {
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
// these correspond to the NeonMessageTag enum in pagestore_client.h
//
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
let msg_tag = body.read_u8()?;
let (reqid, request_lsn, not_modified_since) = match protocol_version {
PagestreamProtocolVersion::V2 => (
0,
Lsn::from(body.read_u64::<BigEndian>()?),
Lsn::from(body.read_u64::<BigEndian>()?),
),
PagestreamProtocolVersion::V3 => (
body.read_u64::<BigEndian>()?,
Lsn::from(body.read_u64::<BigEndian>()?),
Lsn::from(body.read_u64::<BigEndian>()?),
),
};
// these two fields are the same for every request type
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
request_lsn,
not_modified_since,
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
@@ -1660,11 +1623,8 @@ impl PagestreamFeMessage {
},
})),
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
request_lsn,
not_modified_since,
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
@@ -1673,11 +1633,8 @@ impl PagestreamFeMessage {
},
})),
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
request_lsn,
not_modified_since,
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
@@ -1687,20 +1644,14 @@ impl PagestreamFeMessage {
blkno: body.read_u32::<BigEndian>()?,
})),
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
request_lsn,
not_modified_since,
dbnode: body.read_u32::<BigEndian>()?,
})),
4 => Ok(PagestreamFeMessage::GetSlruSegment(
PagestreamGetSlruSegmentRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
request_lsn,
not_modified_since,
kind: body.read_u8()?,
segno: body.read_u32::<BigEndian>()?,
},
@@ -1711,114 +1662,43 @@ impl PagestreamFeMessage {
}
impl PagestreamBeMessage {
pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes {
pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
use PagestreamBeMessageTag as Tag;
match protocol_version {
PagestreamProtocolVersion::V2 => {
match self {
Self::Exists(resp) => {
bytes.put_u8(Tag::Exists as u8);
bytes.put_u8(resp.exists as u8);
}
Self::Nblocks(resp) => {
bytes.put_u8(Tag::Nblocks as u8);
bytes.put_u32(resp.n_blocks);
}
Self::GetPage(resp) => {
bytes.put_u8(Tag::GetPage as u8);
bytes.put(&resp.page[..])
}
Self::Error(resp) => {
bytes.put_u8(Tag::Error as u8);
bytes.put(resp.message.as_bytes());
bytes.put_u8(0); // null terminator
}
Self::DbSize(resp) => {
bytes.put_u8(Tag::DbSize as u8);
bytes.put_i64(resp.db_size);
}
Self::GetSlruSegment(resp) => {
bytes.put_u8(Tag::GetSlruSegment as u8);
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
}
}
match self {
Self::Exists(resp) => {
bytes.put_u8(Tag::Exists as u8);
bytes.put_u8(resp.exists as u8);
}
PagestreamProtocolVersion::V3 => {
match self {
Self::Exists(resp) => {
bytes.put_u8(Tag::Exists as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u32(resp.req.rel.spcnode);
bytes.put_u32(resp.req.rel.dbnode);
bytes.put_u32(resp.req.rel.relnode);
bytes.put_u8(resp.req.rel.forknum);
bytes.put_u8(resp.exists as u8);
}
Self::Nblocks(resp) => {
bytes.put_u8(Tag::Nblocks as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u32(resp.req.rel.spcnode);
bytes.put_u32(resp.req.rel.dbnode);
bytes.put_u32(resp.req.rel.relnode);
bytes.put_u8(resp.req.rel.forknum);
bytes.put_u32(resp.n_blocks);
}
Self::Nblocks(resp) => {
bytes.put_u8(Tag::Nblocks as u8);
bytes.put_u32(resp.n_blocks);
}
Self::GetPage(resp) => {
bytes.put_u8(Tag::GetPage as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u32(resp.req.rel.spcnode);
bytes.put_u32(resp.req.rel.dbnode);
bytes.put_u32(resp.req.rel.relnode);
bytes.put_u8(resp.req.rel.forknum);
bytes.put_u32(resp.req.blkno);
bytes.put(&resp.page[..])
}
Self::GetPage(resp) => {
bytes.put_u8(Tag::GetPage as u8);
bytes.put(&resp.page[..]);
}
Self::Error(resp) => {
bytes.put_u8(Tag::Error as u8);
bytes.put_u64(resp.req.reqid);
bytes.put_u64(resp.req.request_lsn.0);
bytes.put_u64(resp.req.not_modified_since.0);
bytes.put(resp.message.as_bytes());
bytes.put_u8(0); // null terminator
}
Self::DbSize(resp) => {
bytes.put_u8(Tag::DbSize as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u32(resp.req.dbnode);
bytes.put_i64(resp.db_size);
}
Self::Error(resp) => {
bytes.put_u8(Tag::Error as u8);
bytes.put(resp.message.as_bytes());
bytes.put_u8(0); // null terminator
}
Self::DbSize(resp) => {
bytes.put_u8(Tag::DbSize as u8);
bytes.put_i64(resp.db_size);
}
Self::GetSlruSegment(resp) => {
bytes.put_u8(Tag::GetSlruSegment as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u8(resp.req.kind);
bytes.put_u32(resp.req.segno);
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
}
}
Self::GetSlruSegment(resp) => {
bytes.put_u8(Tag::GetSlruSegment as u8);
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
}
}
bytes.into()
}
@@ -1830,131 +1710,38 @@ impl PagestreamBeMessage {
let ok =
match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
Tag::Exists => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let rel = RelTag {
spcnode: buf.read_u32::<BigEndian>()?,
dbnode: buf.read_u32::<BigEndian>()?,
relnode: buf.read_u32::<BigEndian>()?,
forknum: buf.read_u8()?,
};
let exists = buf.read_u8()? != 0;
let exists = buf.read_u8()?;
Self::Exists(PagestreamExistsResponse {
req: PagestreamExistsRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel,
},
exists,
exists: exists != 0,
})
}
Tag::Nblocks => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let rel = RelTag {
spcnode: buf.read_u32::<BigEndian>()?,
dbnode: buf.read_u32::<BigEndian>()?,
relnode: buf.read_u32::<BigEndian>()?,
forknum: buf.read_u8()?,
};
let n_blocks = buf.read_u32::<BigEndian>()?;
Self::Nblocks(PagestreamNblocksResponse {
req: PagestreamNblocksRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel,
},
n_blocks,
})
Self::Nblocks(PagestreamNblocksResponse { n_blocks })
}
Tag::GetPage => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let rel = RelTag {
spcnode: buf.read_u32::<BigEndian>()?,
dbnode: buf.read_u32::<BigEndian>()?,
relnode: buf.read_u32::<BigEndian>()?,
forknum: buf.read_u8()?,
};
let blkno = buf.read_u32::<BigEndian>()?;
let mut page = vec![0; 8192]; // TODO: use MaybeUninit
buf.read_exact(&mut page)?;
Self::GetPage(PagestreamGetPageResponse {
req: PagestreamGetPageRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel,
blkno,
},
page: page.into(),
})
PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
}
Tag::Error => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let mut msg = Vec::new();
buf.read_until(0, &mut msg)?;
let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
let rust_str = cstring.to_str()?;
Self::Error(PagestreamErrorResponse {
req: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: rust_str.to_owned(),
})
}
Tag::DbSize => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let dbnode = buf.read_u32::<BigEndian>()?;
let db_size = buf.read_i64::<BigEndian>()?;
Self::DbSize(PagestreamDbSizeResponse {
req: PagestreamDbSizeRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
dbnode,
},
db_size,
})
Self::DbSize(PagestreamDbSizeResponse { db_size })
}
Tag::GetSlruSegment => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let kind = buf.read_u8()?;
let segno = buf.read_u32::<BigEndian>()?;
let n_blocks = buf.read_u32::<BigEndian>()?;
let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
buf.read_exact(&mut segment)?;
Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
req: PagestreamGetSlruSegmentRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
kind,
segno,
},
segment: segment.into(),
})
}
@@ -1993,11 +1780,8 @@ mod tests {
// Test serialization/deserialization of PagestreamFeMessage
let messages = vec![
PagestreamFeMessage::Exists(PagestreamExistsRequest {
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
},
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -2006,11 +1790,8 @@ mod tests {
},
}),
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(4),
not_modified_since: Lsn(4),
},
request_lsn: Lsn(4),
not_modified_since: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -2019,11 +1800,8 @@ mod tests {
},
}),
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
},
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -2033,19 +1811,14 @@ mod tests {
blkno: 7,
}),
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
},
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
dbnode: 7,
}),
];
for msg in messages {
let bytes = msg.serialize();
let reconstructed =
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3)
.unwrap();
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
assert!(msg == reconstructed);
}
}

View File

@@ -44,7 +44,7 @@ pub struct ProtocolVersion(u32);
impl ProtocolVersion {
pub const fn new(major: u16, minor: u16) -> Self {
Self(((major as u32) << 16) | minor as u32)
Self((major as u32) << 16 | minor as u32)
}
pub const fn minor(self) -> u16 {
self.0 as u16

View File

@@ -115,15 +115,13 @@ fn default_max_keys_per_list_response() -> Option<i32> {
}
fn default_azure_conn_pool_size() -> usize {
// By default, the Azure SDK does no connection pooling, due to historic reports of hard-to-reproduce issues
// Conservative default: no connection pooling. At time of writing this is the Azure
// SDK's default as well, due to historic reports of hard-to-reproduce issues
// (https://github.com/hyperium/hyper/issues/2312)
//
// However, using connection pooling is important to avoid exhausting client ports when
// doing huge numbers of requests (https://github.com/neondatabase/cloud/issues/20971)
//
// We therefore enable a modest pool size by default: this may be configured to zero if
// issues like the alleged upstream hyper issue appear.
8
0
}
impl Debug for S3Config {

View File

@@ -38,6 +38,7 @@ pub mod http;
use opentelemetry::trace::TracerProvider;
use opentelemetry::KeyValue;
use opentelemetry_sdk::Resource;
use tracing::Subscriber;
use tracing_subscriber::registry::LookupSpan;
use tracing_subscriber::Layer;
@@ -120,10 +121,7 @@ where
S: Subscriber + for<'span> LookupSpan<'span>,
{
// Sets up exporter from the OTEL_EXPORTER_* environment variables.
let exporter = opentelemetry_otlp::SpanExporter::builder()
.with_http()
.build()
.expect("could not initialize opentelemetry exporter");
let exporter = opentelemetry_otlp::new_exporter().http();
// TODO: opentelemetry::global::set_error_handler() with custom handler that
// bypasses default tracing layers, but logs regular looking log
@@ -134,13 +132,17 @@ where
opentelemetry_sdk::propagation::TraceContextPropagator::new(),
);
let tracer = opentelemetry_sdk::trace::TracerProvider::builder()
.with_batch_exporter(exporter, opentelemetry_sdk::runtime::Tokio)
.with_resource(opentelemetry_sdk::Resource::new(vec![KeyValue::new(
opentelemetry_semantic_conventions::resource::SERVICE_NAME,
service_name,
)]))
.build()
let tracer = opentelemetry_otlp::new_pipeline()
.tracing()
.with_exporter(exporter)
.with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource(
Resource::new(vec![KeyValue::new(
opentelemetry_semantic_conventions::resource::SERVICE_NAME,
service_name,
)]),
))
.install_batch(opentelemetry_sdk::runtime::Tokio)
.expect("could not initialize opentelemetry exporter")
.tracer("global");
tracing_opentelemetry::layer().with_tracer(tracer)

View File

@@ -26,7 +26,6 @@ git-version.workspace = true
hex = { workspace = true, features = ["serde"] }
humantime.workspace = true
hyper0 = { workspace = true, features = ["full"] }
inferno.workspace = true
itertools.workspace = true
fail.workspace = true
futures = { workspace = true }

View File

@@ -112,9 +112,9 @@ impl Serialize for Generation {
// We should never be asked to serialize a None. Structures
// that include an optional generation should convert None to an
// Option<Generation>::None
Err(serde::ser::Error::custom(format!(
"Tried to serialize invalid generation ({self:?})"
)))
Err(serde::ser::Error::custom(
"Tried to serialize invalid generation ({self})",
))
}
}
}

View File

@@ -15,7 +15,7 @@ use once_cell::sync::Lazy;
use regex::Regex;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tokio::sync::{mpsc, Mutex, Notify};
use tokio::sync::{mpsc, Mutex};
use tokio_stream::wrappers::ReceiverStream;
use tokio_util::io::ReaderStream;
use tracing::{debug, info, info_span, warn, Instrument};
@@ -350,53 +350,33 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A
};
let seconds = match parse_query_param(&req, "seconds")? {
None => 5,
Some(seconds @ 1..=60) => seconds,
Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-60 secs"))),
Some(seconds @ 1..=30) => seconds,
Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))),
};
let frequency_hz = match parse_query_param(&req, "frequency")? {
None => 99,
Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))),
Some(frequency) => frequency,
};
let force: bool = parse_query_param(&req, "force")?.unwrap_or_default();
// Only allow one profiler at a time.
static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
let _lock = PROFILE_LOCK
.try_lock()
.map_err(|_| ApiError::Conflict("profiler already running".into()))?;
// Take the profile.
static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
static PROFILE_CANCEL: Lazy<Notify> = Lazy::new(Notify::new);
let report = {
// Only allow one profiler at a time. If force is true, cancel a running profile (e.g. a
// Grafana continuous profile). We use a try_lock() loop when cancelling instead of waiting
// for a lock(), to avoid races where the notify isn't currently awaited.
let _lock = loop {
match PROFILE_LOCK.try_lock() {
Ok(lock) => break lock,
Err(_) if force => PROFILE_CANCEL.notify_waiters(),
Err(_) => {
return Err(ApiError::Conflict(
"profiler already running (use ?force=true to cancel it)".into(),
))
}
}
tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait
};
let report = tokio::task::spawn_blocking(move || {
let guard = ProfilerGuardBuilder::default()
.frequency(frequency_hz)
.blocklist(&["libc", "libgcc", "pthread", "vdso"])
.build()
.map_err(|err| ApiError::InternalServerError(err.into()))?;
tokio::select! {
_ = tokio::time::sleep(Duration::from_secs(seconds)) => {},
_ = PROFILE_CANCEL.notified() => {},
};
guard
.report()
.build()
.map_err(|err| ApiError::InternalServerError(err.into()))?
};
.build()?;
std::thread::sleep(Duration::from_secs(seconds));
guard.report().build()
})
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?;
// Return the report in the requested format.
match format {
@@ -437,7 +417,6 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
enum Format {
Jemalloc,
Pprof,
Svg,
}
// Parameters.
@@ -445,24 +424,9 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
None => Format::Pprof,
Some("jemalloc") => Format::Jemalloc,
Some("pprof") => Format::Pprof,
Some("svg") => Format::Svg,
Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
};
// Functions and mappings to strip when symbolizing pprof profiles. If true,
// also remove child frames.
static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
vec![
(Regex::new("^__rust").unwrap(), false),
(Regex::new("^_start$").unwrap(), false),
(Regex::new("^irallocx_prof").unwrap(), true),
(Regex::new("^prof_alloc_prep").unwrap(), true),
(Regex::new("^std::rt::lang_start").unwrap(), false),
(Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
]
});
const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"];
// Obtain profiler handle.
let mut prof_ctl = jemalloc_pprof::PROF_CTL
.as_ref()
@@ -500,9 +464,24 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
// Symbolize the profile.
// TODO: consider moving this upstream to jemalloc_pprof and avoiding the
// serialization roundtrip.
static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
// Functions to strip from profiles. If true, also remove child frames.
vec![
(Regex::new("^__rust").unwrap(), false),
(Regex::new("^_start$").unwrap(), false),
(Regex::new("^irallocx_prof").unwrap(), true),
(Regex::new("^prof_alloc_prep").unwrap(), true),
(Regex::new("^std::rt::lang_start").unwrap(), false),
(Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
]
});
let profile = pprof::decode(&bytes)?;
let profile = pprof::symbolize(profile)?;
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
let profile = pprof::strip_locations(
profile,
&["libc", "libgcc", "pthread", "vdso"],
&STRIP_FUNCTIONS,
);
pprof::encode(&profile)
})
.await
@@ -515,27 +494,6 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
.body(Body::from(data))
.map_err(|err| ApiError::InternalServerError(err.into()))
}
Format::Svg => {
let body = tokio::task::spawn_blocking(move || {
let bytes = prof_ctl.dump_pprof()?;
let profile = pprof::decode(&bytes)?;
let profile = pprof::symbolize(profile)?;
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
let mut opts = inferno::flamegraph::Options::default();
opts.title = "Heap inuse".to_string();
opts.count_name = "bytes".to_string();
pprof::flamegraph(profile, &mut opts)
})
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
Response::builder()
.status(200)
.header(CONTENT_TYPE, "image/svg+xml")
.body(Body::from(body))
.map_err(|err| ApiError::InternalServerError(err.into()))
}
}
}

View File

@@ -260,7 +260,7 @@ impl FromStr for Lsn {
{
let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
let right_num = u32::from_str_radix(right, 16).map_err(|_| LsnParseError)?;
Ok(Lsn(((left_num as u64) << 32) | right_num as u64))
Ok(Lsn((left_num as u64) << 32 | right_num as u64))
} else {
Err(LsnParseError)
}

View File

@@ -1,9 +1,8 @@
use anyhow::bail;
use flate2::write::{GzDecoder, GzEncoder};
use flate2::Compression;
use itertools::Itertools as _;
use once_cell::sync::Lazy;
use pprof::protos::{Function, Line, Location, Message as _, Profile};
use pprof::protos::{Function, Line, Message as _, Profile};
use regex::Regex;
use std::borrow::Cow;
@@ -189,59 +188,3 @@ pub fn strip_locations(
profile
}
/// Generates an SVG flamegraph from a symbolized pprof profile.
pub fn flamegraph(
profile: Profile,
opts: &mut inferno::flamegraph::Options,
) -> anyhow::Result<Vec<u8>> {
if profile.mapping.iter().any(|m| !m.has_functions) {
bail!("profile not symbolized");
}
// Index locations, functions, and strings.
let locations: HashMap<u64, Location> =
profile.location.into_iter().map(|l| (l.id, l)).collect();
let functions: HashMap<u64, Function> =
profile.function.into_iter().map(|f| (f.id, f)).collect();
let strings = profile.string_table;
// Resolve stacks as function names, and sum sample values per stack. Also reverse the stack,
// since inferno expects it bottom-up.
let mut stacks: HashMap<Vec<&str>, i64> = HashMap::new();
for sample in profile.sample {
let mut stack = Vec::with_capacity(sample.location_id.len());
for location in sample.location_id.into_iter().rev() {
let Some(location) = locations.get(&location) else {
bail!("missing location {location}");
};
for line in location.line.iter().rev() {
let Some(function) = functions.get(&line.function_id) else {
bail!("missing function {}", line.function_id);
};
let Some(name) = strings.get(function.name as usize) else {
bail!("missing string {}", function.name);
};
stack.push(name.as_str());
}
}
let Some(&value) = sample.value.first() else {
bail!("missing value");
};
*stacks.entry(stack).or_default() += value;
}
// Construct stack lines for inferno.
let lines = stacks
.into_iter()
.map(|(stack, value)| (stack.into_iter().join(";"), value))
.map(|(stack, value)| format!("{stack} {value}"))
.sorted()
.collect_vec();
// Construct the flamegraph.
let mut bytes = Vec::new();
let lines = lines.iter().map(|line| line.as_str());
inferno::flamegraph::from_lines(opts, lines, &mut bytes)?;
Ok(bytes)
}

View File

@@ -96,11 +96,7 @@ impl<T: Send> Sender<T> {
}
}
State::SenderWaitsForReceiverToConsume(_data) => {
// SAFETY: send is single threaded due to `&mut self` requirement,
// therefore register is not concurrent.
unsafe {
self.state.wake_sender.register(cx.waker());
}
// Really, we shouldn't be polled until receiver has consumed and wakes us.
Poll::Pending
}
State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)),
@@ -453,38 +449,4 @@ mod tests {
let err = recv_task.await.unwrap().expect_err("should error");
assert!(matches!(err, RecvError::SenderGone));
}
#[tokio::test(start_paused = true)]
async fn test_receiver_drop_while_waiting_for_receiver_to_consume_unblocks_sender() {
let (mut sender, receiver) = channel();
let state = receiver.state.clone();
sender.send((), |_, _| unreachable!()).await.unwrap();
assert!(matches!(&*state.value.lock().unwrap(), &State::HasData(_)));
let unmergeable = sender.send((), |_, _| Err(()));
let mut unmergeable = std::pin::pin!(unmergeable);
tokio::select! {
_ = tokio::time::sleep(FOREVER) => {},
_ = &mut unmergeable => {
panic!("unmergeable should not complete");
},
}
assert!(matches!(
&*state.value.lock().unwrap(),
&State::SenderWaitsForReceiverToConsume(_)
));
drop(receiver);
assert!(matches!(
&*state.value.lock().unwrap(),
&State::ReceiverGone
));
unmergeable.await.unwrap_err();
}
}

View File

@@ -95,14 +95,6 @@ impl InterpretedWalRecord {
&& self.metadata_record.is_none()
&& matches!(self.flush_uncommitted, FlushUncommittedRecords::No)
}
/// Checks if the WAL record is observed (i.e. contains only metadata
/// for observed values)
pub fn is_observed(&self) -> bool {
self.batch.is_observed()
&& self.metadata_record.is_none()
&& matches!(self.flush_uncommitted, FlushUncommittedRecords::No)
}
}
/// The interpreted part of the Postgres WAL record which requires metadata

View File

@@ -501,11 +501,6 @@ impl SerializedValueBatch {
!self.has_data() && self.metadata.is_empty()
}
/// Checks if the batch contains only observed values
pub fn is_observed(&self) -> bool {
!self.has_data() && !self.metadata.is_empty()
}
/// Checks if the batch contains data
///
/// Note that if this returns false, it may still contain observed values or

View File

@@ -60,7 +60,7 @@ impl Client {
) -> anyhow::Result<PagestreamClient> {
let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
.client
.copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}"))
.copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}"))
.await?;
let Client {
cancel_on_client_drop,

View File

@@ -2,7 +2,7 @@ use anyhow::Context;
use camino::Utf8PathBuf;
use pageserver_api::key::Key;
use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
use pageserver_api::models::PagestreamGetPageRequest;
use pageserver_api::shard::TenantShardId;
use tokio_util::sync::CancellationToken;
@@ -322,15 +322,12 @@ async fn main_impl(
.to_rel_block()
.expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
hdr: PagestreamRequest {
reqid: 0,
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
},
not_modified_since: r.timeline_lsn,
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
},
not_modified_since: r.timeline_lsn,
rel: rel_tag,
blkno: block_no,
}

View File

@@ -53,12 +53,12 @@ project_build_tag!(BUILD_TAG);
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21).
/// This adds roughly 3% overhead for allocations on average, which is acceptable considering
/// performance-sensitive code will avoid allocations as far as possible anyway.
#[allow(non_upper_case_globals)]
#[export_name = "malloc_conf"]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
// TODO: disabled because concurrent CPU profiles cause seg faults. See:
// https://github.com/neondatabase/neon/issues/10225.
//#[allow(non_upper_case_globals)]
//#[export_name = "malloc_conf"]
//pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
const PID_FILE_NAME: &str = "pageserver.pid";

View File

@@ -97,8 +97,8 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
use crate::DEFAULT_PG_VERSION;
use crate::{disk_usage_eviction_task, tenant};
use pageserver_api::models::{
StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
TimelineInfo,
CompactInfoResponse, StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest,
TimelineGcRequest, TimelineInfo,
};
use utils::{
auth::SwappableJwtAuth,
@@ -2052,7 +2052,15 @@ async fn timeline_compact_info_handler(
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let resp = tenant.get_scheduled_compaction_tasks(timeline_id);
let res = tenant.get_scheduled_compaction_tasks(timeline_id);
let mut resp = Vec::new();
for item in res {
resp.push(CompactInfoResponse {
compact_key_range: item.compact_key_range,
compact_lsn_range: item.compact_lsn_range,
sub_compaction: item.sub_compaction,
});
}
json_response(StatusCode::OK, resp)
}
.instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))

View File

@@ -91,6 +91,15 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_layers_visited_per_read_global",
"Number of layers visited to reconstruct one key",
vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
)
.expect("failed to define a metric")
});
pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_layers_visited_per_vectored_read_global",
@@ -1845,7 +1854,6 @@ pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
pub(crate) enum ComputeCommandKind {
PageStreamV3,
PageStreamV2,
Basebackup,
Fullbackup,
@@ -2329,15 +2337,13 @@ macro_rules! redo_bytes_histogram_count_buckets {
pub(crate) struct WalIngestMetrics {
pub(crate) bytes_received: IntCounter,
pub(crate) records_received: IntCounter,
pub(crate) records_observed: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
pub(crate) clear_vm_bits_unknown: IntCounterVec,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
WalIngestMetrics {
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
bytes_received: register_int_counter!(
"pageserver_wal_ingest_bytes_received",
"Bytes of WAL ingested from safekeepers",
@@ -2348,11 +2354,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
"Number of WAL records received from safekeepers"
)
.expect("failed to define a metric"),
records_observed: register_int_counter!(
"pageserver_wal_ingest_records_observed",
"Number of WAL records observed from safekeepers. These are metadata only records for shard 0."
)
.expect("failed to define a metric"),
records_committed: register_int_counter!(
"pageserver_wal_ingest_records_committed",
"Number of WAL records which resulted in writes to pageserver storage"
@@ -2374,7 +2375,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
&["entity"],
)
.expect("failed to define a metric"),
}
});
pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -3885,6 +3885,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
// histograms
[
&READ_NUM_LAYERS_VISITED,
&VEC_READ_NUM_LAYERS_VISITED,
&WAIT_LSN_TIME,
&WAL_REDO_TIME,

View File

@@ -17,7 +17,7 @@ use pageserver_api::models::{
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest,
PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse,
PagestreamProtocolVersion, PagestreamRequest,
PagestreamProtocolVersion,
};
use pageserver_api::shard::TenantShardId;
use postgres_backend::{
@@ -67,7 +67,7 @@ use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::{basebackup, timed_after_cancellation};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::reltag::SlruKind;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
@@ -537,23 +537,6 @@ impl From<WaitLsnError> for QueryError {
}
}
#[derive(thiserror::Error, Debug)]
struct BatchedPageStreamError {
req: PagestreamRequest,
err: PageStreamError,
}
impl std::fmt::Display for BatchedPageStreamError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.err.fmt(f)
}
}
struct BatchedGetPageRequest {
req: PagestreamGetPageRequest,
timer: SmgrOpTimer,
}
enum BatchedFeMessage {
Exists {
span: Span,
@@ -571,7 +554,7 @@ enum BatchedFeMessage {
span: Span,
shard: timeline::handle::Handle<TenantManagerTypes>,
effective_request_lsn: Lsn,
pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
pages: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>,
},
DbSize {
span: Span,
@@ -587,7 +570,7 @@ enum BatchedFeMessage {
},
RespondError {
span: Span,
error: BatchedPageStreamError,
error: PageStreamError,
},
}
@@ -612,15 +595,12 @@ impl BatchedFeMessage {
BatchedFeMessage::GetPage { shard, pages, .. } => (
shard,
pages.len(),
itertools::Either::Right(pages.iter_mut().map(|p| &mut p.timer)),
itertools::Either::Right(pages.iter_mut().map(|(_, _, timer)| timer)),
),
BatchedFeMessage::RespondError { .. } => return Ok(()),
};
let throttled = tokio::select! {
throttled = shard.pagestream_throttle.throttle(tokens) => { throttled }
_ = shard.cancel.cancelled() => {
return Err(QueryError::Shutdown);
}
_ = cancel.cancelled() => {
return Err(QueryError::Shutdown);
}
@@ -674,7 +654,6 @@ impl PageServerHandler {
)
}
#[allow(clippy::too_many_arguments)]
async fn pagestream_read_message<IO>(
pgb: &mut PostgresBackendReader<IO>,
tenant_id: TenantId,
@@ -682,7 +661,6 @@ impl PageServerHandler {
timeline_handles: &mut TimelineHandles,
cancel: &CancellationToken,
ctx: &RequestContext,
protocol_version: PagestreamProtocolVersion,
parent_span: Span,
) -> Result<Option<BatchedFeMessage>, QueryError>
where
@@ -717,12 +695,11 @@ impl PageServerHandler {
fail::fail_point!("ps::handle-pagerequest-message");
// parse request
let neon_fe_msg =
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
let batched_msg = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
let shard = timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Zero)
.instrument(span.clone()) // sets `shard_id` field
@@ -738,7 +715,7 @@ impl PageServerHandler {
}
}
PagestreamFeMessage::Nblocks(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
let shard = timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Zero)
.instrument(span.clone()) // sets `shard_id` field
@@ -754,7 +731,7 @@ impl PageServerHandler {
}
}
PagestreamFeMessage::DbSize(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn);
let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
let shard = timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Zero)
.instrument(span.clone()) // sets `shard_id` field
@@ -770,7 +747,7 @@ impl PageServerHandler {
}
}
PagestreamFeMessage::GetSlruSegment(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn);
let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
let shard = timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Zero)
.instrument(span.clone()) // sets `shard_id` field
@@ -785,23 +762,25 @@ impl PageServerHandler {
req,
}
}
PagestreamFeMessage::GetPage(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %req.hdr.request_lsn);
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
request_lsn,
not_modified_since,
rel,
blkno,
}) => {
let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %request_lsn);
macro_rules! respond_error {
($error:expr) => {{
let error = BatchedFeMessage::RespondError {
span,
error: BatchedPageStreamError {
req: req.hdr,
err: $error,
},
error: $error,
};
Ok(Some(error))
}};
}
let key = rel_block_to_key(req.rel, req.blkno);
let key = rel_block_to_key(rel, blkno);
let shard = match timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Page(key))
.instrument(span.clone()) // sets `shard_id` field
@@ -835,8 +814,8 @@ impl PageServerHandler {
let effective_request_lsn = match Self::wait_or_get_last_lsn(
&shard,
req.hdr.request_lsn,
req.hdr.not_modified_since,
request_lsn,
not_modified_since,
&shard.get_latest_gc_cutoff_lsn(),
ctx,
)
@@ -852,7 +831,7 @@ impl PageServerHandler {
span,
shard,
effective_request_lsn,
pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }],
pages: smallvec::smallvec![(rel, blkno, timer)],
}
}
};
@@ -931,7 +910,6 @@ impl PageServerHandler {
pgb_writer: &mut PostgresBackend<IO>,
batch: BatchedFeMessage,
cancel: &CancellationToken,
protocol_version: PagestreamProtocolVersion,
ctx: &RequestContext,
) -> Result<(), QueryError>
where
@@ -939,7 +917,7 @@ impl PageServerHandler {
{
// invoke handler function
let (handler_results, span): (
Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>>,
Vec<Result<(PagestreamBeMessage, SmgrOpTimer), PageStreamError>>,
_,
) = match batch {
BatchedFeMessage::Exists {
@@ -954,8 +932,7 @@ impl PageServerHandler {
.handle_get_rel_exists_request(&shard, &req, ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))
.map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
.map(|msg| (msg, timer))],
span,
)
}
@@ -971,8 +948,7 @@ impl PageServerHandler {
.handle_get_nblocks_request(&shard, &req, ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))
.map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
.map(|msg| (msg, timer))],
span,
)
}
@@ -1014,8 +990,7 @@ impl PageServerHandler {
.handle_db_size_request(&shard, &req, ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))
.map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
.map(|msg| (msg, timer))],
span,
)
}
@@ -1031,8 +1006,7 @@ impl PageServerHandler {
.handle_get_slru_segment_request(&shard, &req, ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))
.map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
.map(|msg| (msg, timer))],
span,
)
}
@@ -1048,7 +1022,7 @@ impl PageServerHandler {
// Other handler errors are sent back as an error message and we stay in pagestream protocol.
for handler_result in handler_results {
let (response_msg, timer) = match handler_result {
Err(e) => match &e.err {
Err(e) => match &e {
PageStreamError::Shutdown => {
// If we fail to fulfil a request during shutdown, which may be _because_ of
// shutdown, then do not send the error to the client. Instead just drop the
@@ -1067,14 +1041,13 @@ impl PageServerHandler {
// print the all details to the log with {:#}, but for the client the
// error message is enough. Do not log if shutting down, as the anyhow::Error
// here includes cancellation which is not an error.
let full = utils::error::report_compact_sources(&e.err);
let full = utils::error::report_compact_sources(&e);
span.in_scope(|| {
error!("error reading relation or page version: {full:#}")
});
(
PagestreamBeMessage::Error(PagestreamErrorResponse {
req: e.req,
message: e.err.to_string(),
message: e.to_string(),
}),
None, // TODO: measure errors
)
@@ -1087,9 +1060,7 @@ impl PageServerHandler {
// marshal & transmit response message
//
pgb_writer.write_message_noflush(&BeMessage::CopyData(
&response_msg.serialize(protocol_version),
))?;
pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
// We purposefully don't count flush time into the timer.
//
@@ -1152,7 +1123,7 @@ impl PageServerHandler {
pgb: &mut PostgresBackend<IO>,
tenant_id: TenantId,
timeline_id: TimelineId,
protocol_version: PagestreamProtocolVersion,
_protocol_version: PagestreamProtocolVersion,
ctx: RequestContext,
) -> Result<(), QueryError>
where
@@ -1192,7 +1163,6 @@ impl PageServerHandler {
timeline_handles,
request_span,
pipelining_config,
protocol_version,
&ctx,
)
.await
@@ -1205,7 +1175,6 @@ impl PageServerHandler {
timeline_id,
timeline_handles,
request_span,
protocol_version,
&ctx,
)
.await
@@ -1232,7 +1201,6 @@ impl PageServerHandler {
timeline_id: TimelineId,
mut timeline_handles: TimelineHandles,
request_span: Span,
protocol_version: PagestreamProtocolVersion,
ctx: &RequestContext,
) -> (
(PostgresBackendReader<IO>, TimelineHandles),
@@ -1250,7 +1218,6 @@ impl PageServerHandler {
&mut timeline_handles,
&cancel,
ctx,
protocol_version,
request_span.clone(),
)
.await;
@@ -1271,7 +1238,7 @@ impl PageServerHandler {
}
let err = self
.pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx)
.pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx)
.await;
match err {
Ok(()) => {}
@@ -1294,7 +1261,6 @@ impl PageServerHandler {
mut timeline_handles: TimelineHandles,
request_span: Span,
pipelining_config: PageServicePipeliningConfigPipelined,
protocol_version: PagestreamProtocolVersion,
ctx: &RequestContext,
) -> (
(PostgresBackendReader<IO>, TimelineHandles),
@@ -1392,7 +1358,6 @@ impl PageServerHandler {
&mut timeline_handles,
&cancel_batcher,
&ctx,
protocol_version,
request_span.clone(),
)
.await;
@@ -1438,14 +1403,8 @@ impl PageServerHandler {
batch
.throttle_and_record_start_processing(&self.cancel)
.await?;
self.pagesteam_handle_batched_message(
pgb_writer,
batch,
&cancel,
protocol_version,
&ctx,
)
.await?;
self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
.await?;
}
}
});
@@ -1619,8 +1578,8 @@ impl PageServerHandler {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
req.hdr.not_modified_since,
req.request_lsn,
req.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
@@ -1631,7 +1590,6 @@ impl PageServerHandler {
.await?;
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
req: *req,
exists,
}))
}
@@ -1646,8 +1604,8 @@ impl PageServerHandler {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
req.hdr.not_modified_since,
req.request_lsn,
req.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
@@ -1658,7 +1616,6 @@ impl PageServerHandler {
.await?;
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
req: *req,
n_blocks,
}))
}
@@ -1673,8 +1630,8 @@ impl PageServerHandler {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
req.hdr.not_modified_since,
req.request_lsn,
req.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
@@ -1686,7 +1643,6 @@ impl PageServerHandler {
let db_size = total_blocks as i64 * BLCKSZ as i64;
Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
req: *req,
db_size,
}))
}
@@ -1696,9 +1652,9 @@ impl PageServerHandler {
&mut self,
timeline: &Timeline,
effective_lsn: Lsn,
requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
requests: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>,
ctx: &RequestContext,
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), PageStreamError>> {
debug_assert_current_span_has_tenant_and_timeline_id();
timeline
@@ -1707,7 +1663,7 @@ impl PageServerHandler {
let results = timeline
.get_rel_page_at_lsn_batched(
requests.iter().map(|p| (&p.req.rel, &p.req.blkno)),
requests.iter().map(|(reltag, blkno, _)| (reltag, blkno)),
effective_lsn,
ctx,
)
@@ -1719,20 +1675,16 @@ impl PageServerHandler {
requests
.into_iter()
.zip(results.into_iter())
.map(|(req, res)| {
.map(|((_, _, timer), res)| {
res.map(|page| {
(
PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse {
req: req.req,
page,
}),
req.timer,
timer,
)
})
.map_err(|e| BatchedPageStreamError {
err: PageStreamError::from(e),
req: req.req.hdr,
})
.map_err(PageStreamError::from)
}),
)
}
@@ -1747,8 +1699,8 @@ impl PageServerHandler {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
req.hdr.not_modified_since,
req.request_lsn,
req.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
@@ -1759,7 +1711,7 @@ impl PageServerHandler {
let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?;
Ok(PagestreamBeMessage::GetSlruSegment(
PagestreamGetSlruSegmentResponse { req: *req, segment },
PagestreamGetSlruSegmentResponse { segment },
))
}
@@ -1954,7 +1906,6 @@ struct FullBackupCmd {
struct PageStreamCmd {
tenant_id: TenantId,
timeline_id: TimelineId,
protocol_version: PagestreamProtocolVersion,
}
/// `lease lsn tenant timeline lsn`
@@ -1975,7 +1926,7 @@ enum PageServiceCmd {
}
impl PageStreamCmd {
fn parse(query: &str, protocol_version: PagestreamProtocolVersion) -> anyhow::Result<Self> {
fn parse(query: &str) -> anyhow::Result<Self> {
let parameters = query.split_whitespace().collect_vec();
if parameters.len() != 2 {
bail!(
@@ -1990,7 +1941,6 @@ impl PageStreamCmd {
Ok(Self {
tenant_id,
timeline_id,
protocol_version,
})
}
}
@@ -2128,14 +2078,7 @@ impl PageServiceCmd {
bail!("cannot parse query: {query}")
};
match cmd.to_ascii_lowercase().as_str() {
"pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(
other,
PagestreamProtocolVersion::V2,
)?)),
"pagestream_v3" => Ok(Self::PageStream(PageStreamCmd::parse(
other,
PagestreamProtocolVersion::V3,
)?)),
"pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(other)?)),
"basebackup" => Ok(Self::BaseBackup(BaseBackupCmd::parse(other)?)),
"fullbackup" => Ok(Self::FullBackup(FullBackupCmd::parse(other)?)),
"lease" => {
@@ -2217,21 +2160,25 @@ where
PageServiceCmd::PageStream(PageStreamCmd {
tenant_id,
timeline_id,
protocol_version,
}) => {
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
let command_kind = match protocol_version {
PagestreamProtocolVersion::V2 => ComputeCommandKind::PageStreamV2,
PagestreamProtocolVersion::V3 => ComputeCommandKind::PageStreamV3,
};
COMPUTE_COMMANDS_COUNTERS.for_command(command_kind).inc();
self.handle_pagerequests(pgb, tenant_id, timeline_id, protocol_version, ctx)
.await?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::PageStreamV2)
.inc();
self.handle_pagerequests(
pgb,
tenant_id,
timeline_id,
PagestreamProtocolVersion::V2,
ctx,
)
.await?;
}
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
@@ -2410,8 +2357,7 @@ mod tests {
cmd,
PageServiceCmd::PageStream(PageStreamCmd {
tenant_id,
timeline_id,
protocol_version: PagestreamProtocolVersion::V2,
timeline_id
})
);
let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id}")).unwrap();

View File

@@ -627,7 +627,7 @@ impl Timeline {
// cannot overflow, high and low are both smaller than u64::MAX / 2
let mid = (high + low) / 2;
let cmp = match self
let cmp = self
.is_latest_commit_timestamp_ge_than(
search_timestamp,
Lsn(mid * 8),
@@ -635,16 +635,7 @@ impl Timeline {
&mut found_larger,
ctx,
)
.await
{
Ok(res) => res,
Err(PageReconstructError::MissingKey(e)) => {
warn!("Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. Last error: {:#}", e);
// Return that we didn't find any requests smaller than the LSN, and logging the error.
return Ok(LsnForTimestamp::Past(min_lsn));
}
Err(e) => return Err(e),
};
.await?;
if cmp {
high = mid;
@@ -652,7 +643,6 @@ impl Timeline {
low = mid + 1;
}
}
// If `found_smaller == true`, `low = t + 1` where `t` is the target LSN,
// so the LSN of the last commit record before or at `search_timestamp`.
// Remove one from `low` to get `t`.

View File

@@ -21,7 +21,6 @@ use enumset::EnumSet;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use pageserver_api::models;
use pageserver_api::models::CompactInfoResponse;
use pageserver_api::models::LsnLease;
use pageserver_api::models::TimelineArchivalState;
use pageserver_api::models::TimelineState;
@@ -38,17 +37,20 @@ use remote_timeline_client::manifest::{
};
use remote_timeline_client::UploadQueueNotReadyError;
use std::collections::BTreeMap;
use std::collections::VecDeque;
use std::fmt;
use std::future::Future;
use std::sync::atomic::AtomicBool;
use std::sync::Weak;
use std::time::SystemTime;
use storage_broker::BrokerClientChannel;
use timeline::compaction::GcCompactionQueue;
use timeline::compaction::GcCompactJob;
use timeline::compaction::ScheduledCompactionTask;
use timeline::import_pgdata;
use timeline::offload::offload_timeline;
use timeline::offload::OffloadError;
use timeline::CompactFlags;
use timeline::CompactOptions;
use timeline::CompactionError;
use timeline::ShutdownMode;
use tokio::io::BufReader;
use tokio::sync::watch;
@@ -344,8 +346,10 @@ pub struct Tenant {
/// Overhead of mutex is acceptable because compaction is done with a multi-second period.
compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
/// Scheduled gc-compaction tasks.
scheduled_compaction_tasks: std::sync::Mutex<HashMap<TimelineId, Arc<GcCompactionQueue>>>,
/// Scheduled compaction tasks. Currently, this can only be populated by triggering
/// a manual gc-compaction from the manual compaction API.
scheduled_compaction_tasks:
std::sync::Mutex<HashMap<TimelineId, VecDeque<ScheduledCompactionTask>>>,
/// If the tenant is in Activating state, notify this to encourage it
/// to proceed to Active as soon as possible, rather than waiting for lazy
@@ -2035,7 +2039,7 @@ impl Tenant {
) -> Result<Arc<Timeline>, TimelineArchivalError> {
info!("unoffloading timeline");
// We activate the timeline below manually, so this must be called on an active tenant.
// We activate the timeline below manually, so this must be called on an active timeline.
// We expect callers of this function to ensure this.
match self.current_state() {
TenantState::Activating { .. }
@@ -2992,35 +2996,113 @@ impl Tenant {
if has_pending_l0_compaction_task {
Some(true)
} else {
let queue = {
let guard = self.scheduled_compaction_tasks.lock().unwrap();
guard.get(timeline_id).cloned()
let mut has_pending_scheduled_compaction_task;
let next_scheduled_compaction_task = {
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) {
if !tline_pending_tasks.is_empty() {
info!(
"{} tasks left in the compaction schedule queue",
tline_pending_tasks.len()
);
}
let next_task = tline_pending_tasks.pop_front();
has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty();
next_task
} else {
has_pending_scheduled_compaction_task = false;
None
}
};
if let Some(queue) = queue {
let has_pending_tasks = queue
.iteration(cancel, ctx, &self.gc_block, timeline)
.await?;
Some(has_pending_tasks)
} else {
Some(false)
if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task
{
if !next_scheduled_compaction_task
.options
.flags
.contains(CompactFlags::EnhancedGcBottomMostCompaction)
{
warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
} else if next_scheduled_compaction_task.options.sub_compaction {
info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
let jobs: Vec<GcCompactJob> = timeline
.gc_compaction_split_jobs(
GcCompactJob::from_compact_options(
next_scheduled_compaction_task.options.clone(),
),
next_scheduled_compaction_task
.options
.sub_compaction_max_job_size_mb,
)
.await
.map_err(CompactionError::Other)?;
if jobs.is_empty() {
info!("no jobs to run, skipping scheduled compaction task");
} else {
has_pending_scheduled_compaction_task = true;
let jobs_len = jobs.len();
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
let tline_pending_tasks = guard.entry(*timeline_id).or_default();
for (idx, job) in jobs.into_iter().enumerate() {
// Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
// until we do further refactors to allow directly call `compact_with_gc`.
let mut flags: EnumSet<CompactFlags> = EnumSet::default();
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
if job.dry_run {
flags |= CompactFlags::DryRun;
}
let options = CompactOptions {
flags,
sub_compaction: false,
compact_key_range: Some(job.compact_key_range.into()),
compact_lsn_range: Some(job.compact_lsn_range.into()),
sub_compaction_max_job_size_mb: None,
};
tline_pending_tasks.push_back(if idx == jobs_len - 1 {
ScheduledCompactionTask {
options,
// The last job in the queue sends the signal and releases the gc guard
result_tx: next_scheduled_compaction_task
.result_tx
.take(),
gc_block: next_scheduled_compaction_task
.gc_block
.take(),
}
} else {
ScheduledCompactionTask {
options,
result_tx: None,
gc_block: None,
}
});
}
info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
}
} else {
let _ = timeline
.compact_with_options(
cancel,
next_scheduled_compaction_task.options,
ctx,
)
.instrument(info_span!("scheduled_compact_timeline", %timeline_id))
.await?;
if let Some(tx) = next_scheduled_compaction_task.result_tx.take() {
// TODO: we can send compaction statistics in the future
tx.send(()).ok();
}
}
}
Some(has_pending_scheduled_compaction_task)
}
} else {
None
};
has_pending_task |= pending_task_left.unwrap_or(false);
if pending_task_left == Some(false) && *can_offload {
pausable_failpoint!("before-timeline-auto-offload");
match offload_timeline(self, timeline)
offload_timeline(self, timeline)
.instrument(info_span!("offload_timeline", %timeline_id))
.await
{
Err(OffloadError::NotArchived) => {
// Ignore this, we likely raced with unarchival
Ok(())
}
other => other,
}?;
.await?;
}
}
@@ -3033,32 +3115,34 @@ impl Tenant {
}
/// Cancel scheduled compaction tasks
pub(crate) fn cancel_scheduled_compaction(&self, timeline_id: TimelineId) {
pub(crate) fn cancel_scheduled_compaction(
&self,
timeline_id: TimelineId,
) -> Vec<ScheduledCompactionTask> {
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
if let Some(q) = guard.get_mut(&timeline_id) {
q.cancel_scheduled();
if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) {
let current_tline_pending_tasks = std::mem::take(tline_pending_tasks);
current_tline_pending_tasks.into_iter().collect()
} else {
Vec::new()
}
}
pub(crate) fn get_scheduled_compaction_tasks(
&self,
timeline_id: TimelineId,
) -> Vec<CompactInfoResponse> {
let res = {
let guard = self.scheduled_compaction_tasks.lock().unwrap();
guard.get(&timeline_id).map(|q| q.remaining_jobs())
};
let Some((running, remaining)) = res else {
return Vec::new();
};
let mut result = Vec::new();
if let Some((id, running)) = running {
result.extend(running.into_compact_info_resp(id, true));
}
for (id, job) in remaining {
result.extend(job.into_compact_info_resp(id, false));
}
result
) -> Vec<CompactOptions> {
use itertools::Itertools;
let guard = self.scheduled_compaction_tasks.lock().unwrap();
guard
.get(&timeline_id)
.map(|tline_pending_tasks| {
tline_pending_tasks
.iter()
.map(|x| x.options.clone())
.collect_vec()
})
.unwrap_or_default()
}
/// Schedule a compaction task for a timeline.
@@ -3067,12 +3151,20 @@ impl Tenant {
timeline_id: TimelineId,
options: CompactOptions,
) -> anyhow::Result<tokio::sync::oneshot::Receiver<()>> {
let gc_guard = match self.gc_block.start().await {
Ok(guard) => guard,
Err(e) => {
bail!("cannot run gc-compaction because gc is blocked: {}", e);
}
};
let (tx, rx) = tokio::sync::oneshot::channel();
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
let q = guard
.entry(timeline_id)
.or_insert_with(|| Arc::new(GcCompactionQueue::new()));
q.schedule_manual_compaction(options, Some(tx));
let tline_pending_tasks = guard.entry(timeline_id).or_default();
tline_pending_tasks.push_back(ScheduledCompactionTask {
options,
result_tx: Some(tx),
gc_block: Some(gc_guard),
});
Ok(rx)
}
@@ -5682,7 +5774,7 @@ mod tests {
use bytes::{Bytes, BytesMut};
use hex_literal::hex;
use itertools::Itertools;
use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX};
use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
use pageserver_api::value::Value;
@@ -7741,18 +7833,7 @@ mod tests {
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
let base_key_overwrite = Key::from_hex("620000000033333333444444445500000003").unwrap();
let base_inherited_key = Key::from_hex("610000000033333333444444445500000000").unwrap();
let base_inherited_key_child =
Key::from_hex("610000000033333333444444445500000001").unwrap();
let base_inherited_key_nonexist =
Key::from_hex("610000000033333333444444445500000002").unwrap();
let base_inherited_key_overwrite =
Key::from_hex("610000000033333333444444445500000003").unwrap();
assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
assert_eq!(base_inherited_key.field1, RELATION_SIZE_PREFIX);
let tline = tenant
.create_test_timeline_with_layers(
@@ -7761,18 +7842,7 @@ mod tests {
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // delta layers
vec![(
Lsn(0x20),
vec![
(base_inherited_key, test_img("metadata inherited key 1")),
(
base_inherited_key_overwrite,
test_img("metadata key overwrite 1a"),
),
(base_key, test_img("metadata key 1")),
(base_key_overwrite, test_img("metadata key overwrite 1b")),
],
)], // image layers
vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
)
.await?;
@@ -7786,18 +7856,7 @@ mod tests {
Vec::new(), // delta layers
vec![(
Lsn(0x30),
vec![
(
base_inherited_key_child,
test_img("metadata inherited key 2"),
),
(
base_inherited_key_overwrite,
test_img("metadata key overwrite 2a"),
),
(base_key_child, test_img("metadata key 2")),
(base_key_overwrite, test_img("metadata key overwrite 2b")),
],
vec![(base_key_child, test_img("metadata key 2"))],
)], // image layers
Lsn(0x30),
)
@@ -7819,26 +7878,6 @@ mod tests {
get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?,
None
);
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key_overwrite, lsn, &ctx).await?,
Some(test_img("metadata key overwrite 1b"))
);
assert_eq!(
get_vectored_impl_wrapper(&tline, base_inherited_key, lsn, &ctx).await?,
Some(test_img("metadata inherited key 1"))
);
assert_eq!(
get_vectored_impl_wrapper(&tline, base_inherited_key_child, lsn, &ctx).await?,
None
);
assert_eq!(
get_vectored_impl_wrapper(&tline, base_inherited_key_nonexist, lsn, &ctx).await?,
None
);
assert_eq!(
get_vectored_impl_wrapper(&tline, base_inherited_key_overwrite, lsn, &ctx).await?,
Some(test_img("metadata key overwrite 1a"))
);
// test vectored get on child timeline
assert_eq!(
@@ -7853,82 +7892,6 @@ mod tests {
get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?,
None
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_inherited_key, lsn, &ctx).await?,
Some(test_img("metadata inherited key 1"))
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_inherited_key_child, lsn, &ctx).await?,
Some(test_img("metadata inherited key 2"))
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_inherited_key_nonexist, lsn, &ctx).await?,
None
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_key_overwrite, lsn, &ctx).await?,
Some(test_img("metadata key overwrite 2b"))
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_inherited_key_overwrite, lsn, &ctx).await?,
Some(test_img("metadata key overwrite 2a"))
);
// test vectored scan on parent timeline
let mut reconstruct_state = ValuesReconstructState::new();
let res = tline
.get_vectored_impl(
KeySpace::single(Key::metadata_key_range()),
lsn,
&mut reconstruct_state,
&ctx,
)
.await?;
assert_eq!(
res.into_iter()
.map(|(k, v)| (k, v.unwrap()))
.collect::<Vec<_>>(),
vec![
(base_inherited_key, test_img("metadata inherited key 1")),
(
base_inherited_key_overwrite,
test_img("metadata key overwrite 1a")
),
(base_key, test_img("metadata key 1")),
(base_key_overwrite, test_img("metadata key overwrite 1b")),
]
);
// test vectored scan on child timeline
let mut reconstruct_state = ValuesReconstructState::new();
let res = child
.get_vectored_impl(
KeySpace::single(Key::metadata_key_range()),
lsn,
&mut reconstruct_state,
&ctx,
)
.await?;
assert_eq!(
res.into_iter()
.map(|(k, v)| (k, v.unwrap()))
.collect::<Vec<_>>(),
vec![
(base_inherited_key, test_img("metadata inherited key 1")),
(
base_inherited_key_child,
test_img("metadata inherited key 2")
),
(
base_inherited_key_overwrite,
test_img("metadata key overwrite 2a")
),
(base_key_child, test_img("metadata key 2")),
(base_key_overwrite, test_img("metadata key overwrite 2b")),
]
);
Ok(())
}

View File

@@ -11,7 +11,7 @@
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
use pageserver_api::models::CompactionAlgorithmSettings;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::models::{self, TenantConfigPatch};
use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig};
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
use serde::de::IntoDeserializer;
use serde::{Deserialize, Serialize};
@@ -597,7 +597,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
.map(humantime),
heatmap_period: value.heatmap_period.map(humantime),
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
lsn_lease_length: value.lsn_lease_length.map(humantime),
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),

View File

@@ -84,17 +84,17 @@ impl Value {
fn to_u64(self) -> u64 {
let b = &self.0;
((b[0] as u64) << 32)
| ((b[1] as u64) << 24)
| ((b[2] as u64) << 16)
| ((b[3] as u64) << 8)
(b[0] as u64) << 32
| (b[1] as u64) << 24
| (b[2] as u64) << 16
| (b[3] as u64) << 8
| b[4] as u64
}
fn to_blknum(self) -> u32 {
let b = &self.0;
assert!(b[0] == 0x80);
((b[1] as u32) << 24) | ((b[2] as u32) << 16) | ((b[3] as u32) << 8) | b[4] as u32
(b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32
}
}

View File

@@ -304,15 +304,6 @@ pub enum WaitCompletionError {
#[derive(Debug, thiserror::Error)]
#[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")]
pub struct UploadQueueNotReadyError;
#[derive(Debug, thiserror::Error)]
pub enum ShutdownIfArchivedError {
#[error(transparent)]
NotInitialized(NotInitialized),
#[error("timeline is not archived")]
NotArchived,
}
/// Behavioral modes that enable seamless live migration.
///
/// See docs/rfcs/028-pageserver-migration.md to understand how these fit in.
@@ -825,55 +816,6 @@ impl RemoteTimelineClient {
Ok(need_wait)
}
/// Shuts the timeline client down, but only if the timeline is archived.
///
/// This function and [`Self::schedule_index_upload_for_timeline_archival_state`] use the
/// same lock to prevent races between unarchival and offloading: unarchival requires the
/// upload queue to be initialized, and leaves behind an upload queue where either dirty
/// or clean has archived_at of `None`. offloading leaves behind an uninitialized upload
/// queue.
pub(crate) async fn shutdown_if_archived(
self: &Arc<Self>,
) -> Result<(), ShutdownIfArchivedError> {
{
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard
.initialized_mut()
.map_err(ShutdownIfArchivedError::NotInitialized)?;
match (
upload_queue.dirty.archived_at.is_none(),
upload_queue.clean.0.archived_at.is_none(),
) {
// The expected case: the timeline is archived and we don't want to unarchive
(false, false) => {}
(true, false) => {
tracing::info!("can't shut down timeline: timeline slated for unarchival");
return Err(ShutdownIfArchivedError::NotArchived);
}
(dirty_archived, true) => {
tracing::info!(%dirty_archived, "can't shut down timeline: timeline not archived in remote storage");
return Err(ShutdownIfArchivedError::NotArchived);
}
}
// Set the shutting_down flag while the guard from the archival check is held.
// This prevents a race with unarchival, as initialized_mut will not return
// an upload queue from this point.
// Also launch the queued tasks like shutdown() does.
if !upload_queue.shutting_down {
upload_queue.shutting_down = true;
upload_queue.queued_operations.push_back(UploadOp::Shutdown);
// this operation is not counted similar to Barrier
self.launch_queued_tasks(upload_queue);
}
}
self.shutdown().await;
Ok(())
}
/// Launch an index-file upload operation in the background, setting `import_pgdata` field.
pub(crate) fn schedule_index_upload_for_import_pgdata_state_update(
self: &Arc<Self>,

View File

@@ -12,7 +12,7 @@ pub mod merge_iterator;
use crate::context::{AccessStatsBehavior, RequestContext};
use bytes::Bytes;
use pageserver_api::key::Key;
use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE};
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
use pageserver_api::record::NeonWalRecord;
use pageserver_api::value::Value;
@@ -209,7 +209,7 @@ impl ValuesReconstructState {
.keys
.entry(*key)
.or_insert(Ok(VectoredValueReconstructState::default()));
let is_sparse_key = key.is_sparse();
let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key);
if let Ok(state) = state {
let key_done = match state.situation {
ValueReconstructSituation::Complete => {

View File

@@ -112,8 +112,8 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
///
/// Layout:
/// - 1 bit: `will_init`
/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len`
/// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos`
/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len`
/// - [`MAX_SUPPORTED_POS_BITS`]: `pos`
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct IndexEntry(u64);

View File

@@ -27,7 +27,7 @@ use pageserver_api::{
config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
key::{
KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
SPARSE_RANGE,
NON_INHERITED_SPARSE_RANGE,
},
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
models::{
@@ -3221,7 +3221,7 @@ impl Timeline {
// We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
// stalling compaction.
keyspace.remove_overlapping_with(&KeySpace {
ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()],
ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE],
});
// Keyspace is fully retrieved
@@ -3242,11 +3242,7 @@ impl Timeline {
// keys from `keyspace`, we expect there to be no overlap between it and the image covered key
// space. If that's not the case, we had at least one key encounter a gap in the image layer
// and stop the search as a result of that.
let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
// Do not fire missing key error for sparse keys.
removed.remove_overlapping_with(&KeySpace {
ranges: vec![SPARSE_RANGE],
});
let removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
if !removed.is_empty() {
break Some(removed);
}
@@ -3261,21 +3257,6 @@ impl Timeline {
timeline = &*timeline_owned;
};
// Remove sparse keys from the keyspace so that it doesn't fire errors.
let missing_keyspace = if let Some(missing_keyspace) = missing_keyspace {
let mut missing_keyspace = missing_keyspace;
missing_keyspace.remove_overlapping_with(&KeySpace {
ranges: vec![SPARSE_RANGE],
});
if missing_keyspace.is_empty() {
None
} else {
Some(missing_keyspace)
}
} else {
None
};
if let Some(missing_keyspace) = missing_keyspace {
return Err(GetVectoredError::MissingKey(MissingKeyError {
key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */

View File

@@ -4,7 +4,7 @@
//!
//! The old legacy algorithm is implemented directly in `timeline.rs`.
use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::ops::{Deref, Range};
use std::sync::Arc;
@@ -16,12 +16,10 @@ use super::{
use anyhow::{anyhow, bail, Context};
use bytes::Bytes;
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::key::KEY_SIZE;
use pageserver_api::keyspace::ShardedRange;
use pageserver_api::models::CompactInfoResponse;
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
use serde::Serialize;
use tokio_util::sync::CancellationToken;
@@ -32,7 +30,6 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}
use crate::page_cache;
use crate::statvfs::Statvfs;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::gc_block::GcBlock;
use crate::tenant::remote_timeline_client::WaitCompletionError;
use crate::tenant::storage_layer::batch_split_writer::{
BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
@@ -66,284 +63,16 @@ use super::CompactionError;
/// Maximum number of deltas before generating an image layer in bottom-most compaction.
const COMPACTION_DELTA_THRESHOLD: usize = 5;
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub struct GcCompactionJobId(pub usize);
impl std::fmt::Display for GcCompactionJobId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
#[derive(Debug, Clone)]
pub enum GcCompactionQueueItem {
Manual(CompactOptions),
SubCompactionJob(CompactOptions),
#[allow(dead_code)]
UpdateL2Lsn(Lsn),
Notify(GcCompactionJobId),
}
impl GcCompactionQueueItem {
pub fn into_compact_info_resp(
self,
id: GcCompactionJobId,
running: bool,
) -> Option<CompactInfoResponse> {
match self {
GcCompactionQueueItem::Manual(options) => Some(CompactInfoResponse {
compact_key_range: options.compact_key_range,
compact_lsn_range: options.compact_lsn_range,
sub_compaction: options.sub_compaction,
running,
job_id: id.0,
}),
GcCompactionQueueItem::SubCompactionJob(options) => Some(CompactInfoResponse {
compact_key_range: options.compact_key_range,
compact_lsn_range: options.compact_lsn_range,
sub_compaction: options.sub_compaction,
running,
job_id: id.0,
}),
GcCompactionQueueItem::UpdateL2Lsn(_) => None,
GcCompactionQueueItem::Notify(_) => None,
}
}
}
struct GcCompactionQueueInner {
running: Option<(GcCompactionJobId, GcCompactionQueueItem)>,
queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>,
notify: HashMap<GcCompactionJobId, tokio::sync::oneshot::Sender<()>>,
gc_guards: HashMap<GcCompactionJobId, gc_block::Guard>,
last_id: GcCompactionJobId,
}
impl GcCompactionQueueInner {
fn next_id(&mut self) -> GcCompactionJobId {
let id = self.last_id;
self.last_id = GcCompactionJobId(id.0 + 1);
id
}
}
/// A structure to store gc_compaction jobs.
pub struct GcCompactionQueue {
/// All items in the queue, and the currently-running job.
inner: std::sync::Mutex<GcCompactionQueueInner>,
/// Ensure only one thread is consuming the queue.
consumer_lock: tokio::sync::Mutex<()>,
}
impl GcCompactionQueue {
pub fn new() -> Self {
GcCompactionQueue {
inner: std::sync::Mutex::new(GcCompactionQueueInner {
running: None,
queued: VecDeque::new(),
notify: HashMap::new(),
gc_guards: HashMap::new(),
last_id: GcCompactionJobId(0),
}),
consumer_lock: tokio::sync::Mutex::new(()),
}
}
pub fn cancel_scheduled(&self) {
let mut guard = self.inner.lock().unwrap();
guard.queued.clear();
guard.notify.clear();
guard.gc_guards.clear();
}
/// Schedule a manual compaction job.
pub fn schedule_manual_compaction(
&self,
options: CompactOptions,
notify: Option<tokio::sync::oneshot::Sender<()>>,
) -> GcCompactionJobId {
let mut guard = self.inner.lock().unwrap();
let id = guard.next_id();
guard
.queued
.push_back((id, GcCompactionQueueItem::Manual(options)));
if let Some(notify) = notify {
guard.notify.insert(id, notify);
}
info!("scheduled compaction job id={}", id);
id
}
/// Trigger an auto compaction.
#[allow(dead_code)]
pub fn trigger_auto_compaction(&self, _: &Arc<Timeline>) {}
/// Notify the caller the job has finished and unblock GC.
fn notify_and_unblock(&self, id: GcCompactionJobId) {
info!("compaction job id={} finished", id);
let mut guard = self.inner.lock().unwrap();
if let Some(blocking) = guard.gc_guards.remove(&id) {
drop(blocking)
}
if let Some(tx) = guard.notify.remove(&id) {
let _ = tx.send(());
}
}
async fn handle_sub_compaction(
&self,
id: GcCompactionJobId,
options: CompactOptions,
timeline: &Arc<Timeline>,
gc_block: &GcBlock,
) -> Result<(), CompactionError> {
info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
let jobs: Vec<GcCompactJob> = timeline
.gc_compaction_split_jobs(
GcCompactJob::from_compact_options(options.clone()),
options.sub_compaction_max_job_size_mb,
)
.await
.map_err(CompactionError::Other)?;
if jobs.is_empty() {
info!("no jobs to run, skipping scheduled compaction task");
self.notify_and_unblock(id);
} else {
let gc_guard = match gc_block.start().await {
Ok(guard) => guard,
Err(e) => {
return Err(CompactionError::Other(anyhow!(
"cannot run gc-compaction because gc is blocked: {}",
e
)));
}
};
let jobs_len = jobs.len();
let mut pending_tasks = Vec::new();
for job in jobs {
// Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
// until we do further refactors to allow directly call `compact_with_gc`.
let mut flags: EnumSet<CompactFlags> = EnumSet::default();
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
if job.dry_run {
flags |= CompactFlags::DryRun;
}
let options = CompactOptions {
flags,
sub_compaction: false,
compact_key_range: Some(job.compact_key_range.into()),
compact_lsn_range: Some(job.compact_lsn_range.into()),
sub_compaction_max_job_size_mb: None,
};
pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options));
}
pending_tasks.push(GcCompactionQueueItem::Notify(id));
{
let mut guard = self.inner.lock().unwrap();
guard.gc_guards.insert(id, gc_guard);
let mut tasks = Vec::new();
for task in pending_tasks {
let id = guard.next_id();
tasks.push((id, task));
}
tasks.reverse();
for item in tasks {
guard.queued.push_front(item);
}
}
info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
}
Ok(())
}
/// Take a job from the queue and process it. Returns if there are still pending tasks.
pub async fn iteration(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
gc_block: &GcBlock,
timeline: &Arc<Timeline>,
) -> Result<bool, CompactionError> {
let _one_op_at_a_time_guard = self.consumer_lock.lock().await;
let has_pending_tasks;
let (id, item) = {
let mut guard = self.inner.lock().unwrap();
let Some((id, item)) = guard.queued.pop_front() else {
return Ok(false);
};
guard.running = Some((id, item.clone()));
has_pending_tasks = !guard.queued.is_empty();
(id, item)
};
match item {
GcCompactionQueueItem::Manual(options) => {
if !options
.flags
.contains(CompactFlags::EnhancedGcBottomMostCompaction)
{
warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options);
} else if options.sub_compaction {
self.handle_sub_compaction(id, options, timeline, gc_block)
.await?;
} else {
let gc_guard = match gc_block.start().await {
Ok(guard) => guard,
Err(e) => {
return Err(CompactionError::Other(anyhow!(
"cannot run gc-compaction because gc is blocked: {}",
e
)));
}
};
{
let mut guard = self.inner.lock().unwrap();
guard.gc_guards.insert(id, gc_guard);
}
let _ = timeline
.compact_with_options(cancel, options, ctx)
.instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
.await?;
self.notify_and_unblock(id);
}
}
GcCompactionQueueItem::SubCompactionJob(options) => {
let _ = timeline
.compact_with_options(cancel, options, ctx)
.instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
.await?;
}
GcCompactionQueueItem::Notify(id) => {
self.notify_and_unblock(id);
}
GcCompactionQueueItem::UpdateL2Lsn(_) => {
unreachable!()
}
}
{
let mut guard = self.inner.lock().unwrap();
guard.running = None;
}
Ok(has_pending_tasks)
}
#[allow(clippy::type_complexity)]
pub fn remaining_jobs(
&self,
) -> (
Option<(GcCompactionJobId, GcCompactionQueueItem)>,
VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>,
) {
let guard = self.inner.lock().unwrap();
(guard.running.clone(), guard.queued.clone())
}
#[allow(dead_code)]
pub fn remaining_jobs_num(&self) -> usize {
let guard = self.inner.lock().unwrap();
guard.queued.len() + if guard.running.is_some() { 1 } else { 0 }
}
/// A scheduled compaction task.
pub(crate) struct ScheduledCompactionTask {
/// It's unfortunate that we need to store a compact options struct here because the only outer
/// API we can call here is `compact_with_options` which does a few setup calls before starting the
/// actual compaction job... We should refactor this to store `GcCompactionJob` in the future.
pub options: CompactOptions,
/// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
/// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard.
pub gc_block: Option<gc_block::Guard>,
}
/// A job description for the gc-compaction job. This structure describes the rectangle range that the job will

View File

@@ -194,9 +194,7 @@ impl DeleteTimelineFlow {
super::debug_assert_current_span_has_tenant_and_timeline_id();
let allow_offloaded_children = false;
let set_stopping = true;
let (timeline, mut guard) =
Self::prepare(tenant, timeline_id, allow_offloaded_children, set_stopping)?;
let (timeline, mut guard) = Self::prepare(tenant, timeline_id, allow_offloaded_children)?;
guard.mark_in_progress()?;
@@ -336,7 +334,6 @@ impl DeleteTimelineFlow {
tenant: &Tenant,
timeline_id: TimelineId,
allow_offloaded_children: bool,
set_stopping: bool,
) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
// Note the interaction between this guard and deletion guard.
// Here we attempt to lock deletion guard when we're holding a lock on timelines.
@@ -392,10 +389,8 @@ impl DeleteTimelineFlow {
}
};
if set_stopping {
if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
timeline.set_state(TimelineState::Stopping);
}
if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
timeline.set_state(TimelineState::Stopping);
}
Ok((timeline, delete_lock_guard))

View File

@@ -1,11 +1,10 @@
use std::sync::Arc;
use pageserver_api::models::{TenantState, TimelineState};
use pageserver_api::models::TenantState;
use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
use super::Timeline;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};
#[derive(thiserror::Error, Debug)]
@@ -37,29 +36,28 @@ pub(crate) async fn offload_timeline(
tracing::info!("offloading archived timeline");
let allow_offloaded_children = true;
let set_stopping = false;
let (timeline, guard) = DeleteTimelineFlow::prepare(
tenant,
timeline.timeline_id,
allow_offloaded_children,
set_stopping,
)
.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
let (timeline, guard) =
DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children)
.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
let TimelineOrOffloaded::Timeline(timeline) = timeline else {
tracing::error!("timeline already offloaded, but given timeline object");
return Ok(());
};
match timeline.remote_client.shutdown_if_archived().await {
Ok(()) => {}
Err(ShutdownIfArchivedError::NotInitialized(_)) => {
// Either the timeline is being deleted, the operation is being retried, or we are shutting down.
// Don't return cancelled here to keep it idempotent.
let is_archived = timeline.is_archived();
match is_archived {
Some(true) => (),
Some(false) => {
tracing::warn!("tried offloading a non-archived timeline");
return Err(OffloadError::NotArchived);
}
None => {
// This is legal: calls to this function can race with the timeline shutting down
tracing::info!("tried offloading a timeline whose remote storage is not initialized");
return Err(OffloadError::Cancelled);
}
Err(ShutdownIfArchivedError::NotArchived) => return Err(OffloadError::NotArchived),
}
timeline.set_state(TimelineState::Stopping);
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
timeline.shutdown(super::ShutdownMode::Reload).await;

View File

@@ -319,11 +319,27 @@ pub(super) async fn handle_walreceiver_connection(
return Ok(());
}
async fn commit(
modification: &mut DatadirModification<'_>,
uncommitted: &mut u64,
filtered: &mut u64,
ctx: &RequestContext,
) -> anyhow::Result<()> {
WAL_INGEST
.records_committed
.inc_by(*uncommitted - *filtered);
modification.commit(ctx).await?;
*uncommitted = 0;
*filtered = 0;
Ok(())
}
let status_update = match replication_message {
ReplicationMessage::RawInterpretedWalRecords(raw) => {
WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64);
let mut uncommitted_records = 0;
let mut filtered_records = 0;
// This is the end LSN of the raw WAL from which the records
// were interpreted.
@@ -364,23 +380,31 @@ pub(super) async fn handle_walreceiver_connection(
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
&& uncommitted_records > 0
{
modification.commit(&ctx).await?;
uncommitted_records = 0;
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
}
let local_next_record_lsn = interpreted.next_record_lsn;
if interpreted.is_observed() {
WAL_INGEST.records_observed.inc();
}
walingest
let ingested = walingest
.ingest_record(interpreted, &mut modification, &ctx)
.await
.with_context(|| {
format!("could not ingest record at {local_next_record_lsn}")
})?;
if !ingested {
tracing::debug!(
"ingest: filtered out record @ LSN {local_next_record_lsn}"
);
WAL_INGEST.records_filtered.inc();
filtered_records += 1;
}
uncommitted_records += 1;
// FIXME: this cannot be made pausable_failpoint without fixing the
@@ -394,8 +418,13 @@ pub(super) async fn handle_walreceiver_connection(
|| modification.approx_pending_bytes()
> DatadirModification::MAX_PENDING_BYTES
{
modification.commit(&ctx).await?;
uncommitted_records = 0;
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
}
}
@@ -403,7 +432,7 @@ pub(super) async fn handle_walreceiver_connection(
// need to advance last record LSN on all shards. If we've not ingested the latest
// record, then set the LSN of the modification past it. This way all shards
// advance their last record LSN at the same time.
let needs_last_record_lsn_advance = match next_record_lsn {
let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) {
Some(lsn) if lsn > modification.get_lsn() => {
modification.set_lsn(lsn).unwrap();
true
@@ -413,7 +442,13 @@ pub(super) async fn handle_walreceiver_connection(
if uncommitted_records > 0 || needs_last_record_lsn_advance {
// Commit any uncommitted records
modification.commit(&ctx).await?;
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
}
if !caught_up && streaming_lsn >= end_of_wal {
@@ -434,21 +469,6 @@ pub(super) async fn handle_walreceiver_connection(
}
ReplicationMessage::XLogData(xlog_data) => {
async fn commit(
modification: &mut DatadirModification<'_>,
uncommitted: &mut u64,
filtered: &mut u64,
ctx: &RequestContext,
) -> anyhow::Result<()> {
WAL_INGEST
.records_committed
.inc_by(*uncommitted - *filtered);
modification.commit(ctx).await?;
*uncommitted = 0;
*filtered = 0;
Ok(())
}
// Pass the WAL data to the decoder, and see if we can decode
// more records as a result.
let data = xlog_data.data();

View File

@@ -308,7 +308,7 @@ impl WalIngest {
epoch -= 1;
}
Ok(((epoch as u64) << 32) | xid as u64)
Ok((epoch as u64) << 32 | xid as u64)
}
async fn ingest_clear_vm_bits(

26
pgxn/hnsw/Makefile Normal file
View File

@@ -0,0 +1,26 @@
EXTENSION = hnsw
EXTVERSION = 0.1.0
MODULE_big = hnsw
DATA = $(wildcard *--*.sql)
OBJS = hnsw.o hnswalg.o
TESTS = $(wildcard test/sql/*.sql)
REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS))
REGRESS_OPTS = --inputdir=test --load-extension=hnsw
# For auto-vectorization:
# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html
PG_CFLAGS += -O3
PG_CXXFLAGS += -O3 -std=c++11
PG_LDFLAGS += -lstdc++
all: $(EXTENSION)--$(EXTVERSION).sql
PG_CONFIG ?= pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
dist:
mkdir -p dist
git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master

25
pgxn/hnsw/README.md Normal file
View File

@@ -0,0 +1,25 @@
# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors
This ANN extension of Postgres is based
on [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw),
the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper:
[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html),
<br>
Dmitry Baranchuk, Artem Babenko, Yury Malkov
# Postgres extension
HNSW index is hold in memory (built on demand) and it's maxial size is limited
by `maxelements` index parameter. Another required parameter is nubmer of dimensions (if it is not specified in column type).
Optional parameter `ef` specifies number of neighbors which are considered during index construction and search (corresponds `efConstruction` and `efSearch` parameters
described in the article).
# Example of usage:
```
create extension hnsw;
create table embeddings(id integer primary key, payload real[]);
create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32);
select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100;
```

29
pgxn/hnsw/hnsw--0.1.0.sql Normal file
View File

@@ -0,0 +1,29 @@
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION hnsw" to load this file. \quit
-- functions
CREATE FUNCTION l2_distance(real[], real[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
-- operators
CREATE OPERATOR <-> (
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance,
COMMUTATOR = '<->'
);
-- access method
CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler
AS 'MODULE_PATHNAME' LANGUAGE C;
CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler;
COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method';
-- opclasses
CREATE OPERATOR CLASS knn_ops
DEFAULT FOR TYPE real[] USING hnsw AS
OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops;

590
pgxn/hnsw/hnsw.c Normal file
View File

@@ -0,0 +1,590 @@
#include "postgres.h"
#include "access/amapi.h"
#include "access/generic_xlog.h"
#include "access/relation.h"
#include "access/reloptions.h"
#include "access/tableam.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
#include "nodes/execnodes.h"
#include "storage/bufmgr.h"
#include "utils/guc.h"
#include "utils/selfuncs.h"
#include <math.h>
#include <float.h>
#include "hnsw.h"
PG_MODULE_MAGIC;
typedef struct {
int32 vl_len_; /* varlena header (do not touch directly!) */
int dims;
int maxelements;
int efConstruction;
int efSearch;
int M;
} HnswOptions;
static relopt_kind hnsw_relopt_kind;
typedef struct {
HierarchicalNSW* hnsw;
size_t curr;
size_t n_results;
ItemPointer results;
} HnswScanOpaqueData;
typedef HnswScanOpaqueData* HnswScanOpaque;
typedef struct {
Oid relid;
uint32 status;
HierarchicalNSW* hnsw;
} HnswHashEntry;
#define SH_PREFIX hnsw_index
#define SH_ELEMENT_TYPE HnswHashEntry
#define SH_KEY_TYPE Oid
#define SH_KEY relid
#define SH_STORE_HASH
#define SH_GET_HASH(tb, a) ((a)->relid)
#define SH_HASH_KEY(tb, key) (key)
#define SH_EQUAL(tb, a, b) ((a) == (b))
#define SH_SCOPE static inline
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"
#define INDEX_HASH_SIZE 11
#define DEFAULT_EF_SEARCH 64
PGDLLEXPORT void _PG_init(void);
static hnsw_index_hash *hnsw_indexes;
/*
* Initialize index options and variables
*/
void
_PG_init(void)
{
hnsw_relopt_kind = add_reloption_kind();
add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions",
0, 0, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements",
0, 0, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex",
100, 0, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction",
16, 1, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search",
64, 1, INT_MAX, AccessExclusiveLock);
hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL);
}
static void
hnsw_build_callback(Relation index, ItemPointer tid, Datum *values,
bool *isnull, bool tupleIsAlive, void *state)
{
HierarchicalNSW* hnsw = (HierarchicalNSW*) state;
ArrayType* array;
int n_items;
label_t label = 0;
/* Skip nulls */
if (isnull[0])
return;
array = DatumGetArrayTypeP(values[0]);
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
if (n_items != hnsw_dimensions(hnsw))
{
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
n_items, hnsw_dimensions(hnsw));
}
memcpy(&label, tid, sizeof(*tid));
hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label);
}
static void
hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel)
{
IndexInfo* indexInfo = BuildIndexInfo(indexRel);
Assert(indexInfo->ii_NumIndexAttrs == 1);
table_index_build_scan(heapRel, indexRel, indexInfo,
true, true, hnsw_build_callback, (void *) hnsw, NULL);
}
#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
static void
hnsw_check_available_memory(Size requested)
{
size_t total;
if (sysctlbyname("hw.memsize", NULL, &total, NULL, 0) < 0)
elog(ERROR, "Failed to get amount of RAM: %m");
if ((Size)NBuffers*BLCKSZ + requested >= total)
elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available",
requested, total - (Size)NBuffers*BLCKSZ);
}
#else
#include <sys/sysinfo.h>
static void
hnsw_check_available_memory(Size requested)
{
struct sysinfo si;
Size total;
if (sysinfo(&si) < 0)
elog(ERROR, "Failed to get amount of RAM: %m");
total = si.totalram*si.mem_unit;
if ((Size)NBuffers*BLCKSZ + requested >= total)
elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available",
requested, total - (Size)NBuffers*BLCKSZ);
}
#endif
static HierarchicalNSW*
hnsw_get_index(Relation indexRel, Relation heapRel)
{
HierarchicalNSW* hnsw;
Oid indexoid = RelationGetRelid(indexRel);
HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid);
if (entry == NULL)
{
size_t dims, maxelements;
size_t M;
size_t maxM;
size_t size_links_level0;
size_t size_data_per_element;
size_t data_size;
dsm_handle handle = indexoid << 1; /* make it even */
void* impl_private = NULL;
void* mapped_address = NULL;
Size mapped_size = 0;
Size shmem_size;
bool exists = true;
bool found;
HnswOptions *opts = (HnswOptions *) indexRel->rd_options;
if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) {
elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified");
}
dims = opts->dims;
maxelements = opts->maxelements;
M = opts->M;
maxM = M * 2;
data_size = dims * sizeof(coord_t);
size_links_level0 = (maxM + 1) * sizeof(idx_t);
size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
shmem_size = hnsw_sizeof() + maxelements * size_data_per_element;
hnsw_check_available_memory(shmem_size);
/* first try to attach to existed index */
if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
&mapped_address, &mapped_size, DEBUG1))
{
/* index doesn't exists: try to create it */
if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private,
&mapped_address, &mapped_size, DEBUG1))
{
/* We can do it under shared lock, so some other backend may
* try to initialize index. If create is failed because index already
* created by somebody else, then try to attach to it once again
*/
if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
&mapped_address, &mapped_size, ERROR))
{
return NULL;
}
}
else
{
exists = false;
}
}
Assert(mapped_size == shmem_size);
hnsw = (HierarchicalNSW*)mapped_address;
if (!exists)
{
hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction);
hnsw_populate(hnsw, indexRel, heapRel);
}
entry = hnsw_index_insert(hnsw_indexes, indexoid, &found);
Assert(!found);
entry->hnsw = hnsw;
}
else
{
hnsw = entry->hnsw;
}
return hnsw;
}
/*
* Start or restart an index scan
*/
static IndexScanDesc
hnsw_beginscan(Relation index, int nkeys, int norderbys)
{
IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys);
HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData));
Relation heap = relation_open(index->rd_index->indrelid, NoLock);
so->hnsw = hnsw_get_index(index, heap);
relation_close(heap, NoLock);
so->curr = 0;
so->n_results = 0;
so->results = NULL;
scan->opaque = so;
return scan;
}
/*
* Start or restart an index scan
*/
static void
hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys)
{
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
if (so->results)
{
pfree(so->results);
so->results = NULL;
}
so->curr = 0;
if (orderbys && scan->numberOfOrderBys > 0)
memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData));
}
/*
* Fetch the next tuple in the given scan
*/
static bool
hnsw_gettuple(IndexScanDesc scan, ScanDirection dir)
{
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
/*
* Index can be used to scan backward, but Postgres doesn't support
* backward scan on operators
*/
Assert(ScanDirectionIsForward(dir));
if (so->curr == 0)
{
Datum value;
ArrayType* array;
int n_items;
size_t n_results;
label_t* results;
HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options;
size_t efSearch = opts ? opts->efSearch : DEFAULT_EF_SEARCH;
/* Safety check */
if (scan->orderByData == NULL)
elog(ERROR, "cannot scan HNSW index without order");
/* No items will match if null */
if (scan->orderByData->sk_flags & SK_ISNULL)
return false;
value = scan->orderByData->sk_argument;
array = DatumGetArrayTypeP(value);
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
if (n_items != hnsw_dimensions(so->hnsw))
{
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
n_items, hnsw_dimensions(so->hnsw));
}
if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results))
elog(ERROR, "HNSW index search failed");
so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData));
so->n_results = n_results;
for (size_t i = 0; i < n_results; i++)
{
memcpy(&so->results[i], &results[i], sizeof(so->results[i]));
}
free(results);
}
if (so->curr >= so->n_results)
{
return false;
}
else
{
scan->xs_heaptid = so->results[so->curr++];
scan->xs_recheckorderby = false;
return true;
}
}
/*
* End a scan and release resources
*/
static void
hnsw_endscan(IndexScanDesc scan)
{
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
if (so->results)
pfree(so->results);
pfree(so);
scan->opaque = NULL;
}
/*
* Estimate the cost of an index scan
*/
static void
hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count,
Cost *indexStartupCost, Cost *indexTotalCost,
Selectivity *indexSelectivity, double *indexCorrelation
,double *indexPages
)
{
GenericCosts costs;
/* Never use index without order */
if (path->indexorderbys == NULL)
{
*indexStartupCost = DBL_MAX;
*indexTotalCost = DBL_MAX;
*indexSelectivity = 0;
*indexCorrelation = 0;
*indexPages = 0;
return;
}
MemSet(&costs, 0, sizeof(costs));
genericcostestimate(root, path, loop_count, &costs);
/* Startup cost and total cost are same */
*indexStartupCost = costs.indexTotalCost;
*indexTotalCost = costs.indexTotalCost;
*indexSelectivity = costs.indexSelectivity;
*indexCorrelation = costs.indexCorrelation;
*indexPages = costs.numIndexPages;
}
/*
* Parse and validate the reloptions
*/
static bytea *
hnsw_options(Datum reloptions, bool validate)
{
static const relopt_parse_elt tab[] = {
{"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)},
{"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)},
{"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)},
{"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)},
{"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)}
};
return (bytea *) build_reloptions(reloptions, validate,
hnsw_relopt_kind,
sizeof(HnswOptions),
tab, lengthof(tab));
}
/*
* Validate catalog entries for the specified operator class
*/
static bool
hnsw_validate(Oid opclassoid)
{
return true;
}
/*
* Build the index for a logged table
*/
static IndexBuildResult *
hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo)
{
HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
result->heap_tuples = result->index_tuples = hnsw_count(hnsw);
return result;
}
/*
* Insert a tuple into the index
*/
static bool
hnsw_insert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid,
Relation heap, IndexUniqueCheck checkUnique,
bool indexUnchanged,
IndexInfo *indexInfo)
{
HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
Datum value;
ArrayType* array;
int n_items;
label_t label = 0;
/* Skip nulls */
if (isnull[0])
return false;
/* Detoast value */
value = PointerGetDatum(PG_DETOAST_DATUM(values[0]));
array = DatumGetArrayTypeP(value);
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
if (n_items != hnsw_dimensions(hnsw))
{
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
n_items, hnsw_dimensions(hnsw));
}
memcpy(&label, heap_tid, sizeof(*heap_tid));
if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label))
elog(ERROR, "HNSW index insert failed");
return true;
}
/*
* Build the index for an unlogged table
*/
static void
hnsw_buildempty(Relation index)
{
/* index will be constructed on dema nd when accessed */
}
/*
* Clean up after a VACUUM operation
*/
static IndexBulkDeleteResult *
hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
Relation rel = info->index;
if (stats == NULL)
return NULL;
stats->num_pages = RelationGetNumberOfBlocks(rel);
return stats;
}
/*
* Bulk delete tuples from the index
*/
static IndexBulkDeleteResult *
hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state)
{
if (stats == NULL)
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
return stats;
}
/*
* Define index handler
*
* See https://www.postgresql.org/docs/current/index-api.html
*/
PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler);
Datum
hnsw_handler(PG_FUNCTION_ARGS)
{
IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
amroutine->amstrategies = 0;
amroutine->amsupport = 0;
amroutine->amoptsprocnum = 0;
amroutine->amcanorder = false;
amroutine->amcanorderbyop = true;
amroutine->amcanbackward = false; /* can change direction mid-scan */
amroutine->amcanunique = false;
amroutine->amcanmulticol = false;
amroutine->amoptionalkey = true;
amroutine->amsearcharray = false;
amroutine->amsearchnulls = false;
amroutine->amstorage = false;
amroutine->amclusterable = false;
amroutine->ampredlocks = false;
amroutine->amcanparallel = false;
amroutine->amcaninclude = false;
amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */
amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL;
amroutine->amkeytype = InvalidOid;
/* Interface functions */
amroutine->ambuild = hnsw_build;
amroutine->ambuildempty = hnsw_buildempty;
amroutine->aminsert = hnsw_insert;
amroutine->ambulkdelete = hnsw_bulkdelete;
amroutine->amvacuumcleanup = hnsw_vacuumcleanup;
amroutine->amcanreturn = NULL; /* tuple not included in heapsort */
amroutine->amcostestimate = hnsw_costestimate;
amroutine->amoptions = hnsw_options;
amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */
amroutine->ambuildphasename = NULL;
amroutine->amvalidate = hnsw_validate;
amroutine->amadjustmembers = NULL;
amroutine->ambeginscan = hnsw_beginscan;
amroutine->amrescan = hnsw_rescan;
amroutine->amgettuple = hnsw_gettuple;
amroutine->amgetbitmap = NULL;
amroutine->amendscan = hnsw_endscan;
amroutine->ammarkpos = NULL;
amroutine->amrestrpos = NULL;
/* Interface functions to support parallel index scans */
amroutine->amestimateparallelscan = NULL;
amroutine->aminitparallelscan = NULL;
amroutine->amparallelrescan = NULL;
PG_RETURN_POINTER(amroutine);
}
/*
* Get the L2 distance between vectors
*/
PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance);
Datum
l2_distance(PG_FUNCTION_ARGS)
{
ArrayType *a = PG_GETARG_ARRAYTYPE_P(0);
ArrayType *b = PG_GETARG_ARRAYTYPE_P(1);
int a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a));
int b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b));
dist_t distance = 0.0;
dist_t diff;
coord_t *ax = (coord_t*)ARR_DATA_PTR(a);
coord_t *bx = (coord_t*)ARR_DATA_PTR(b);
if (a_dim != b_dim)
{
ereport(ERROR,
(errcode(ERRCODE_DATA_EXCEPTION),
errmsg("different array dimensions %d and %d", a_dim, b_dim)));
}
for (int i = 0; i < a_dim; i++)
{
diff = ax[i] - bx[i];
distance += diff * diff;
}
PG_RETURN_FLOAT4((dist_t)sqrt(distance));
}

4
pgxn/hnsw/hnsw.control Normal file
View File

@@ -0,0 +1,4 @@
comment = '** Deprecated ** Please use pg_embedding instead'
default_version = '0.1.0'
module_pathname = '$libdir/hnsw'
relocatable = true

15
pgxn/hnsw/hnsw.h Normal file
View File

@@ -0,0 +1,15 @@
#pragma once
typedef float coord_t;
typedef float dist_t;
typedef uint32_t idx_t;
typedef uint64_t label_t;
typedef struct HierarchicalNSW HierarchicalNSW;
bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results);
bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label);
void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
int hnsw_dimensions(HierarchicalNSW* hnsw);
size_t hnsw_count(HierarchicalNSW* hnsw);
size_t hnsw_sizeof(void);

379
pgxn/hnsw/hnswalg.cpp Normal file
View File

@@ -0,0 +1,379 @@
#include "hnswalg.h"
#if defined(__GNUC__)
#define PORTABLE_ALIGN32 __attribute__((aligned(32)))
#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint)
#else
#define PORTABLE_ALIGN32 __declspec(align(32))
#define PREFETCH(addr,hint)
#endif
HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_)
{
dim = dim_;
data_size = dim * sizeof(coord_t);
efConstruction = efConstruction_;
maxelements = maxelements_;
M = M_;
maxM = maxM_;
size_links_level0 = (maxM + 1) * sizeof(idx_t);
size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
offset_data = size_links_level0;
offset_label = offset_data + data_size;
enterpoint_node = 0;
cur_element_count = 0;
#ifdef __x86_64__
use_avx2 = __builtin_cpu_supports("avx2");
#endif
}
std::priority_queue<std::pair<dist_t, idx_t>> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef)
{
std::vector<uint32_t> visited;
visited.resize((cur_element_count + 31) >> 5);
std::priority_queue<std::pair<dist_t, idx_t >> topResults;
std::priority_queue<std::pair<dist_t, idx_t >> candidateSet;
dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node));
topResults.emplace(dist, enterpoint_node);
candidateSet.emplace(-dist, enterpoint_node);
visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31);
dist_t lowerBound = dist;
while (!candidateSet.empty())
{
std::pair<dist_t, idx_t> curr_el_pair = candidateSet.top();
if (-curr_el_pair.first > lowerBound)
break;
candidateSet.pop();
idx_t curNodeNum = curr_el_pair.second;
idx_t* data = get_linklist0(curNodeNum);
size_t size = *data++;
PREFETCH(getDataByInternalId(*data), 0);
for (size_t j = 0; j < size; ++j) {
size_t tnum = *(data + j);
PREFETCH(getDataByInternalId(*(data + j + 1)), 0);
if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) {
visited[tnum >> 5] |= 1 << (tnum & 31);
dist = fstdistfunc(point, getDataByInternalId(tnum));
if (topResults.top().first > dist || topResults.size() < ef) {
candidateSet.emplace(-dist, tnum);
PREFETCH(get_linklist0(candidateSet.top().second), 0);
topResults.emplace(dist, tnum);
if (topResults.size() > ef)
topResults.pop();
lowerBound = topResults.top().first;
}
}
}
}
return topResults;
}
void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN)
{
if (topResults.size() < NN)
return;
std::priority_queue<std::pair<dist_t, idx_t>> resultSet;
std::vector<std::pair<dist_t, idx_t>> returnlist;
while (topResults.size() > 0) {
resultSet.emplace(-topResults.top().first, topResults.top().second);
topResults.pop();
}
while (resultSet.size()) {
if (returnlist.size() >= NN)
break;
std::pair<dist_t, idx_t> curen = resultSet.top();
dist_t dist_to_query = -curen.first;
resultSet.pop();
bool good = true;
for (std::pair<dist_t, idx_t> curen2 : returnlist) {
dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second),
getDataByInternalId(curen.second));
if (curdist < dist_to_query) {
good = false;
break;
}
}
if (good) returnlist.push_back(curen);
}
for (std::pair<dist_t, idx_t> elem : returnlist)
topResults.emplace(-elem.first, elem.second);
}
void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c,
std::priority_queue<std::pair<dist_t, idx_t>> topResults)
{
getNeighborsByHeuristic(topResults, M);
std::vector<idx_t> res;
res.reserve(M);
while (topResults.size() > 0) {
res.push_back(topResults.top().second);
topResults.pop();
}
{
idx_t* data = get_linklist0(cur_c);
if (*data)
throw std::runtime_error("Should be blank");
*data++ = res.size();
for (size_t idx = 0; idx < res.size(); idx++) {
if (data[idx])
throw std::runtime_error("Should be blank");
data[idx] = res[idx];
}
}
for (size_t idx = 0; idx < res.size(); idx++) {
if (res[idx] == cur_c)
throw std::runtime_error("Connection to the same element");
size_t resMmax = maxM;
idx_t *ll_other = get_linklist0(res[idx]);
idx_t sz_link_list_other = *ll_other;
if (sz_link_list_other > resMmax || sz_link_list_other < 0)
throw std::runtime_error("Bad sz_link_list_other");
if (sz_link_list_other < resMmax) {
idx_t *data = ll_other + 1;
data[sz_link_list_other] = cur_c;
*ll_other = sz_link_list_other + 1;
} else {
// finding the "weakest" element to replace it with the new one
idx_t *data = ll_other + 1;
dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx]));
// Heuristic:
std::priority_queue<std::pair<dist_t, idx_t>> candidates;
candidates.emplace(d_max, cur_c);
for (size_t j = 0; j < sz_link_list_other; j++)
candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]);
getNeighborsByHeuristic(candidates, resMmax);
size_t indx = 0;
while (!candidates.empty()) {
data[indx] = candidates.top().second;
candidates.pop();
indx++;
}
*ll_other = indx;
}
}
}
void HierarchicalNSW::addPoint(const coord_t *point, label_t label)
{
if (cur_element_count >= maxelements) {
throw std::runtime_error("The number of elements exceeds the specified limit");
}
idx_t cur_c = cur_element_count++;
memset((char *) get_linklist0(cur_c), 0, size_data_per_element);
memcpy(getDataByInternalId(cur_c), point, data_size);
memcpy(getExternalLabel(cur_c), &label, sizeof label);
// Do nothing for the first element
if (cur_c != 0) {
std::priority_queue <std::pair<dist_t, idx_t>> topResults = searchBaseLayer(point, efConstruction);
mutuallyConnectNewElement(point, cur_c, topResults);
}
};
std::priority_queue<std::pair<dist_t, label_t>> HierarchicalNSW::searchKnn(const coord_t *query, size_t k)
{
std::priority_queue<std::pair<dist_t, label_t>> topResults;
auto topCandidates = searchBaseLayer(query, k);
while (topCandidates.size() > k) {
topCandidates.pop();
}
while (!topCandidates.empty()) {
std::pair<dist_t, idx_t> rez = topCandidates.top();
label_t label;
memcpy(&label, getExternalLabel(rez.second), sizeof(label));
topResults.push(std::pair<dist_t, label_t>(rez.first, label));
topCandidates.pop();
}
return topResults;
};
dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n)
{
dist_t distance = 0.0;
for (size_t i = 0; i < n; i++)
{
dist_t diff = x[i] - y[i];
distance += diff * diff;
}
return distance;
}
#ifdef __x86_64__
#include <immintrin.h>
__attribute__((target("avx2")))
dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n)
{
const size_t TmpResSz = sizeof(__m256) / sizeof(float);
float PORTABLE_ALIGN32 TmpRes[TmpResSz];
size_t qty16 = n / 16;
const float *pEnd1 = x + (qty16 * 16);
__m256 diff, v1, v2;
__m256 sum = _mm256_set1_ps(0);
while (x < pEnd1) {
v1 = _mm256_loadu_ps(x);
x += 8;
v2 = _mm256_loadu_ps(y);
y += 8;
diff = _mm256_sub_ps(v1, v2);
sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
v1 = _mm256_loadu_ps(x);
x += 8;
v2 = _mm256_loadu_ps(y);
y += 8;
diff = _mm256_sub_ps(v1, v2);
sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
}
_mm256_store_ps(TmpRes, sum);
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
return (res);
}
dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n)
{
const size_t TmpResSz = sizeof(__m128) / sizeof(float);
float PORTABLE_ALIGN32 TmpRes[TmpResSz];
size_t qty16 = n / 16;
const float *pEnd1 = x + (qty16 * 16);
__m128 diff, v1, v2;
__m128 sum = _mm_set1_ps(0);
while (x < pEnd1) {
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
}
_mm_store_ps(TmpRes, sum);
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
return res;
}
#endif
dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y)
{
#ifndef __x86_64__
return fstdistfunc_scalar(x, y, dim);
#else
if(use_avx2)
return fstdistfunc_avx2(x, y, dim);
return fstdistfunc_sse(x, y, dim);
#endif
}
bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results)
{
try
{
auto result = hnsw->searchKnn(point, efSearch);
size_t nResults = result.size();
*results = (label_t*)malloc(nResults*sizeof(label_t));
for (size_t i = nResults; i-- != 0;)
{
(*results)[i] = result.top().second;
result.pop();
}
*n_results = nResults;
return true;
}
catch (std::exception& x)
{
return false;
}
}
bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label)
{
try
{
hnsw->addPoint(point, label);
return true;
}
catch (std::exception& x)
{
fprintf(stderr, "Catch %s\n", x.what());
return false;
}
}
void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction)
{
new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction);
}
int hnsw_dimensions(HierarchicalNSW* hnsw)
{
return (int)hnsw->dim;
}
size_t hnsw_count(HierarchicalNSW* hnsw)
{
return hnsw->cur_element_count;
}
size_t hnsw_sizeof(void)
{
return sizeof(HierarchicalNSW);
}

69
pgxn/hnsw/hnswalg.h Normal file
View File

@@ -0,0 +1,69 @@
#pragma once
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <unordered_map>
#include <unordered_set>
#include <map>
#include <cmath>
#include <queue>
#include <stdexcept>
extern "C" {
#include "hnsw.h"
}
struct HierarchicalNSW
{
size_t maxelements;
size_t cur_element_count;
idx_t enterpoint_node;
size_t dim;
size_t data_size;
size_t offset_data;
size_t offset_label;
size_t size_data_per_element;
size_t M;
size_t maxM;
size_t size_links_level0;
size_t efConstruction;
#ifdef __x86_64__
bool use_avx2;
#endif
char data_level0_memory[0]; // varying size
public:
HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
~HierarchicalNSW();
inline coord_t *getDataByInternalId(idx_t internal_id) const {
return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data];
}
inline idx_t *get_linklist0(idx_t internal_id) const {
return (idx_t*)&data_level0_memory[internal_id * size_data_per_element];
}
inline label_t *getExternalLabel(idx_t internal_id) const {
return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label];
}
std::priority_queue<std::pair<dist_t, idx_t>> searchBaseLayer(const coord_t *x, size_t ef);
void getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN);
void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue<std::pair<dist_t, idx_t>> topResults);
void addPoint(const coord_t *point, label_t label);
std::priority_queue<std::pair<dist_t, label_t>> searchKnn(const coord_t *query_data, size_t k);
dist_t fstdistfunc(const coord_t *x, const coord_t *y);
};

View File

@@ -0,0 +1,28 @@
SET enable_seqscan = off;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);
explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
QUERY PLAN
--------------------------------------------------------------------
Index Scan using t_val_idx on t (cost=4.02..8.06 rows=3 width=36)
Order By: (val <-> '{3,3,3}'::real[])
(2 rows)
SELECT * FROM t ORDER BY val <-> array[3,3,3];
val
---------
{1,2,3}
{1,2,4}
{1,1,1}
{0,0,0}
(4 rows)
SELECT COUNT(*) FROM t;
count
-------
5
(1 row)
DROP TABLE t;

View File

@@ -0,0 +1,13 @@
SET enable_seqscan = off;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);
explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
SELECT * FROM t ORDER BY val <-> array[3,3,3];
SELECT COUNT(*) FROM t;
DROP TABLE t;

View File

@@ -556,9 +556,6 @@ pageserver_connect(shardno_t shard_no, int elevel)
switch (neon_protocol_version)
{
case 3:
pagestream_query = psprintf("pagestream_v3 %s %s", neon_tenant, neon_timeline);
break;
case 2:
pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
break;
@@ -1138,9 +1135,9 @@ pg_init_libpagestore(void)
"Version of compute<->page server protocol",
NULL,
&neon_protocol_version,
2, /* use protocol version 2 */
2, /* min */
3, /* max */
2, /* use protocol version 2 */
2, /* min */
2, /* max */
PGC_SU_BACKEND,
0, /* no flags required */
NULL, NULL, NULL);

View File

@@ -44,15 +44,10 @@ typedef enum
T_NeonGetSlruSegmentResponse,
} NeonMessageTag;
typedef uint64 NeonRequestId;
/* base struct for c-style inheritance */
typedef struct
{
NeonMessageTag tag;
NeonRequestId reqid;
XLogRecPtr lsn;
XLogRecPtr not_modified_since;
} NeonMessage;
#define messageTag(m) (((const NeonMessage *)(m))->tag)
@@ -72,7 +67,6 @@ typedef enum {
SLRU_MULTIXACT_OFFSETS
} SlruKind;
/*--
* supertype of all the Neon*Request structs below.
*
@@ -93,37 +87,37 @@ typedef enum {
*
* These structs describe the V2 of these requests. (The old now-defunct V1
* protocol contained just one LSN and a boolean 'latest' flag.)
*
* V3 version of protocol adds request ID to all requests. This request ID is also included in response
* as well as other fields from requests, which allows to verify that we receive response for our request.
* We copy fields from request to response to make checking more reliable: request ID is formed from process ID
* and local counter, so in principle there can be duplicated requests IDs if process PID is reused.
*/
typedef NeonMessage NeonRequest;
typedef struct
{
NeonMessageTag tag;
XLogRecPtr lsn;
XLogRecPtr not_modified_since;
} NeonRequest;
typedef struct
{
NeonRequest hdr;
NeonRequest req;
NRelFileInfo rinfo;
ForkNumber forknum;
} NeonExistsRequest;
typedef struct
{
NeonRequest hdr;
NeonRequest req;
NRelFileInfo rinfo;
ForkNumber forknum;
} NeonNblocksRequest;
typedef struct
{
NeonRequest hdr;
NeonRequest req;
Oid dbNode;
} NeonDbSizeRequest;
typedef struct
{
NeonRequest hdr;
NeonRequest req;
NRelFileInfo rinfo;
ForkNumber forknum;
BlockNumber blkno;
@@ -131,29 +125,32 @@ typedef struct
typedef struct
{
NeonRequest hdr;
SlruKind kind;
int segno;
NeonRequest req;
SlruKind kind;
int segno;
} NeonGetSlruSegmentRequest;
/* supertype of all the Neon*Response structs below */
typedef NeonMessage NeonResponse;
typedef struct
{
NeonMessageTag tag;
} NeonResponse;
typedef struct
{
NeonExistsRequest req;
NeonMessageTag tag;
bool exists;
} NeonExistsResponse;
typedef struct
{
NeonNblocksRequest req;
NeonMessageTag tag;
uint32 n_blocks;
} NeonNblocksResponse;
typedef struct
{
NeonGetPageRequest req;
NeonMessageTag tag;
char page[FLEXIBLE_ARRAY_MEMBER];
} NeonGetPageResponse;
@@ -161,21 +158,21 @@ typedef struct
typedef struct
{
NeonDbSizeRequest req;
NeonMessageTag tag;
int64 db_size;
} NeonDbSizeResponse;
typedef struct
{
NeonResponse req;
NeonMessageTag tag;
char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error
* message */
} NeonErrorResponse;
typedef struct
{
NeonGetSlruSegmentRequest req;
int n_blocks;
NeonMessageTag tag;
int n_blocks;
char data[BLCKSZ * SLRU_PAGES_PER_SEGMENT];
} NeonGetSlruSegmentResponse;

View File

@@ -120,9 +120,6 @@ static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block
static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum);
static uint32 local_request_counter;
#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter)
/*
* Prefetch implementation:
*
@@ -191,11 +188,15 @@ typedef struct PrefetchRequest
uint8 status; /* see PrefetchStatus for valid values */
uint8 flags; /* see PrefetchRequestFlags */
neon_request_lsns request_lsns;
NeonRequestId reqid;
NeonResponse *response; /* may be null */
uint64 my_ring_index;
} PrefetchRequest;
StaticAssertDecl(sizeof(PrefetchRequest) == 64,
"We prefer to have a power-of-2 size for this struct. Please"
" try to find an alternative solution before reaching to"
" increase the expected size here");
/* prefetch buffer lookup hash table */
typedef struct PrfHashEntry
@@ -364,7 +365,6 @@ compact_prefetch_buffers(void)
target_slot->shard_no = source_slot->shard_no;
target_slot->status = source_slot->status;
target_slot->response = source_slot->response;
target_slot->reqid = source_slot->reqid;
target_slot->request_lsns = source_slot->request_lsns;
target_slot->my_ring_index = empty_ring_index;
@@ -798,8 +798,7 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
uint64 mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index;
NeonGetPageRequest request = {
.hdr.tag = T_NeonGetPageRequest,
.hdr.reqid = GENERATE_REQUEST_ID(),
.req.tag = T_NeonGetPageRequest,
/* lsn and not_modified_since are filled in below */
.rinfo = BufTagGetNRelFileInfo(slot->buftag),
.forknum = slot->buftag.forkNum,
@@ -808,16 +807,14 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
Assert(mySlotNo == MyPState->ring_unused);
slot->reqid = request.hdr.reqid;
if (force_request_lsns)
slot->request_lsns = *force_request_lsns;
else
neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
slot->buftag.forkNum, slot->buftag.blockNum,
&slot->request_lsns, 1, NULL);
request.hdr.lsn = slot->request_lsns.request_lsn;
request.hdr.not_modified_since = slot->request_lsns.not_modified_since;
request.req.lsn = slot->request_lsns.request_lsn;
request.req.not_modified_since = slot->request_lsns.not_modified_since;
Assert(slot->response == NULL);
Assert(slot->my_ring_index == MyPState->ring_unused);
@@ -1105,12 +1102,6 @@ Retry:
return min_ring_index;
}
static bool
equal_requests(NeonRequest* a, NeonRequest* b)
{
return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since;
}
/*
* Note: this function can get canceled and use a long jump to the next catch
@@ -1193,10 +1184,6 @@ nm_pack_request(NeonRequest *msg)
initStringInfo(&s);
pq_sendbyte(&s, msg->tag);
if (neon_protocol_version >= 3)
{
pq_sendint64(&s, msg->reqid);
}
pq_sendint64(&s, msg->lsn);
pq_sendint64(&s, msg->not_modified_since);
@@ -1274,16 +1261,8 @@ NeonResponse *
nm_unpack_response(StringInfo s)
{
NeonMessageTag tag = pq_getmsgbyte(s);
NeonResponse resp_hdr = {0}; /* make valgrind happy */
NeonResponse *resp = NULL;
resp_hdr.tag = tag;
if (neon_protocol_version >= 3)
{
resp_hdr.reqid = pq_getmsgint64(s);
resp_hdr.lsn = pq_getmsgint64(s);
resp_hdr.not_modified_since = pq_getmsgint64(s);
}
switch (tag)
{
/* pagestore -> pagestore_client */
@@ -1291,14 +1270,7 @@ nm_unpack_response(StringInfo s)
{
NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse));
if (neon_protocol_version >= 3)
{
NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
msg_resp->req.forknum = pq_getmsgbyte(s);
}
msg_resp->req.hdr = resp_hdr;
msg_resp->tag = tag;
msg_resp->exists = pq_getmsgbyte(s);
pq_getmsgend(s);
@@ -1310,14 +1282,7 @@ nm_unpack_response(StringInfo s)
{
NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse));
if (neon_protocol_version >= 3)
{
NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
msg_resp->req.forknum = pq_getmsgbyte(s);
}
msg_resp->req.hdr = resp_hdr;
msg_resp->tag = tag;
msg_resp->n_blocks = pq_getmsgint(s, 4);
pq_getmsgend(s);
@@ -1330,20 +1295,12 @@ nm_unpack_response(StringInfo s)
NeonGetPageResponse *msg_resp;
msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE);
if (neon_protocol_version >= 3)
{
NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
msg_resp->req.forknum = pq_getmsgbyte(s);
msg_resp->req.blkno = pq_getmsgint(s, 4);
}
msg_resp->req.hdr = resp_hdr;
msg_resp->tag = tag;
/* XXX: should be varlena */
memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
pq_getmsgend(s);
Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse);
Assert(msg_resp->tag == T_NeonGetPageResponse);
resp = (NeonResponse *) msg_resp;
break;
@@ -1353,11 +1310,7 @@ nm_unpack_response(StringInfo s)
{
NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse));
if (neon_protocol_version >= 3)
{
msg_resp->req.dbNode = pq_getmsgint(s, 4);
}
msg_resp->req.hdr = resp_hdr;
msg_resp->tag = tag;
msg_resp->db_size = pq_getmsgint64(s);
pq_getmsgend(s);
@@ -1375,7 +1328,7 @@ nm_unpack_response(StringInfo s)
msglen = strlen(msgtext);
msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1);
msg_resp->req = resp_hdr;
msg_resp->tag = tag;
memcpy(msg_resp->message, msgtext, msglen + 1);
pq_getmsgend(s);
@@ -1386,17 +1339,9 @@ nm_unpack_response(StringInfo s)
case T_NeonGetSlruSegmentResponse:
{
NeonGetSlruSegmentResponse *msg_resp;
int n_blocks;
msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse));
if (neon_protocol_version >= 3)
{
msg_resp->req.kind = pq_getmsgbyte(s);
msg_resp->req.segno = pq_getmsgint(s, 4);
}
msg_resp->req.hdr = resp_hdr;
n_blocks = pq_getmsgint(s, 4);
int n_blocks = pq_getmsgint(s, 4);
msg_resp = palloc(sizeof(NeonGetSlruSegmentResponse));
msg_resp->tag = tag;
msg_resp->n_blocks = n_blocks;
memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ);
pq_getmsgend(s);
@@ -1441,8 +1386,8 @@ nm_to_string(NeonMessage *msg)
appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\"");
appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
appendStringInfoChar(&s, '}');
break;
}
@@ -1454,8 +1399,8 @@ nm_to_string(NeonMessage *msg)
appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\"");
appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
appendStringInfoChar(&s, '}');
break;
}
@@ -1468,8 +1413,8 @@ nm_to_string(NeonMessage *msg)
appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
appendStringInfoChar(&s, '}');
break;
}
@@ -1479,8 +1424,8 @@ nm_to_string(NeonMessage *msg)
appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\"");
appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
appendStringInfoChar(&s, '}');
break;
}
@@ -1491,8 +1436,8 @@ nm_to_string(NeonMessage *msg)
appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\"");
appendStringInfo(&s, ", \"kind\": %u", msg_req->kind);
appendStringInfo(&s, ", \"segno\": %u", msg_req->segno);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
appendStringInfoChar(&s, '}');
break;
}
@@ -2367,64 +2312,39 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL);
{
NeonExistsRequest request = {
.hdr.tag = T_NeonExistsRequest,
.hdr.reqid = GENERATE_REQUEST_ID(),
.hdr.lsn = request_lsns.request_lsn,
.hdr.not_modified_since = request_lsns.not_modified_since,
.req.tag = T_NeonExistsRequest,
.req.lsn = request_lsns.request_lsn,
.req.not_modified_since = request_lsns.not_modified_since,
.rinfo = InfoFromSMgrRel(reln),
.forknum = forkNum
};
resp = page_server_request(&request);
switch (resp->tag)
{
case T_NeonExistsResponse:
{
NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp;
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr) ||
!RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) ||
exists_resp->req.forknum != request.forknum)
{
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum,
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum);
}
}
exists = exists_resp->exists;
break;
}
case T_NeonErrorResponse:
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr))
{
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
}
}
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
resp->reqid,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
}
pfree(resp);
}
switch (resp->tag)
{
case T_NeonExistsResponse:
exists = ((NeonExistsResponse *) resp)->exists;
break;
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
}
pfree(resp);
return exists;
}
@@ -3032,43 +2952,15 @@ Retry:
switch (resp->tag)
{
case T_NeonGetPageResponse:
{
NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp;
if (neon_protocol_version >= 3)
{
if (resp->reqid != slot->reqid ||
resp->lsn != slot->request_lsns.request_lsn ||
resp->not_modified_since != slot->request_lsns.not_modified_since ||
!RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) ||
getpage_resp->req.forknum != forkNum ||
getpage_resp->req.blkno != base_blockno + i)
{
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno,
slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i);
}
}
memcpy(buffer, getpage_resp->page, BLCKSZ);
memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ);
lfc_write(rinfo, forkNum, blockno, buffer);
break;
}
case T_NeonErrorResponse:
if (neon_protocol_version >= 3)
{
if (resp->reqid != slot->reqid ||
resp->lsn != slot->request_lsns.request_lsn ||
resp->not_modified_since != slot->request_lsns.not_modified_since)
{
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since));
}
}
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo),
errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
slot->shard_no, blockno, RelFileInfoFmt(rinfo),
forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
@@ -3551,72 +3443,47 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
{
NeonNblocksRequest request = {
.hdr.tag = T_NeonNblocksRequest,
.hdr.reqid = GENERATE_REQUEST_ID(),
.hdr.lsn = request_lsns.request_lsn,
.hdr.not_modified_since = request_lsns.not_modified_since,
.req.tag = T_NeonNblocksRequest,
.req.lsn = request_lsns.request_lsn,
.req.not_modified_since = request_lsns.not_modified_since,
.rinfo = InfoFromSMgrRel(reln),
.forknum = forknum,
};
resp = page_server_request(&request);
switch (resp->tag)
{
case T_NeonNblocksResponse:
{
NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp;
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr) ||
!RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) ||
relsize_resp->req.forknum != forknum)
{
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum,
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum);
}
}
n_blocks = relsize_resp->n_blocks;
break;
}
case T_NeonErrorResponse:
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr))
{
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
}
}
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
resp->reqid,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
}
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
n_blocks);
pfree(resp);
}
switch (resp->tag)
{
case T_NeonNblocksResponse:
n_blocks = ((NeonNblocksResponse *) resp)->n_blocks;
break;
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
}
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
n_blocks);
pfree(resp);
return n_blocks;
}
@@ -3636,64 +3503,40 @@ neon_dbsize(Oid dbNode)
{
NeonDbSizeRequest request = {
.hdr.tag = T_NeonDbSizeRequest,
.hdr.reqid = GENERATE_REQUEST_ID(),
.hdr.lsn = request_lsns.request_lsn,
.hdr.not_modified_since = request_lsns.not_modified_since,
.req.tag = T_NeonDbSizeRequest,
.req.lsn = request_lsns.request_lsn,
.req.not_modified_since = request_lsns.not_modified_since,
.dbNode = dbNode,
};
resp = page_server_request(&request);
switch (resp->tag)
{
case T_NeonDbSizeResponse:
{
NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp;
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr) ||
dbsize_resp->req.dbNode != dbNode)
{
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode,
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode);
}
}
db_size = dbsize_resp->db_size;
break;
}
case T_NeonErrorResponse:
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr))
{
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
}
}
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X",
resp->reqid,
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
}
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
pfree(resp);
}
switch (resp->tag)
{
case T_NeonDbSizeResponse:
db_size = ((NeonDbSizeResponse *) resp)->db_size;
break;
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
}
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
pfree(resp);
return db_size;
}
@@ -4025,17 +3868,16 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
return -1;
request = (NeonGetSlruSegmentRequest) {
.hdr.tag = T_NeonGetSlruSegmentRequest,
.hdr.reqid = GENERATE_REQUEST_ID(),
.hdr.lsn = request_lsn,
.hdr.not_modified_since = not_modified_since,
.req.tag = T_NeonGetSlruSegmentRequest,
.req.lsn = request_lsn,
.req.not_modified_since = not_modified_since,
.kind = kind,
.segno = segno
};
do
{
while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no));
while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no));
consume_prefetch_responses();
@@ -4045,38 +3887,14 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
switch (resp->tag)
{
case T_NeonGetSlruSegmentResponse:
{
NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp;
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr) ||
slru_resp->req.kind != kind ||
slru_resp->req.segno != segno)
{
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno,
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, segno);
}
}
n_blocks = slru_resp->n_blocks;
memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ);
n_blocks = ((NeonGetSlruSegmentResponse *) resp)->n_blocks;
memcpy(buffer, ((NeonGetSlruSegmentResponse *) resp)->data, n_blocks*BLCKSZ);
break;
}
case T_NeonErrorResponse:
if (neon_protocol_version >= 3)
{
if (!equal_requests(resp, &request.hdr))
{
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
}
}
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %d at lsn %X/%08X",
resp->reqid,
errmsg(NEON_TAG "could not read SLRU %d segment %d at lsn %X/%08X",
kind,
segno,
LSN_FORMAT_ARGS(request_lsn)),
@@ -4215,9 +4033,8 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
NeonResponse *response;
NeonNblocksResponse *nbresponse;
NeonNblocksRequest request = {
.hdr = (NeonRequest) {
.req = (NeonRequest) {
.tag = T_NeonNblocksRequest,
.reqid = GENERATE_REQUEST_ID(),
.lsn = end_recptr,
.not_modified_since = end_recptr,
},

8
poetry.lock generated
View File

@@ -2028,13 +2028,13 @@ openapi-schema-validator = ">=0.4.2,<0.5.0"
[[package]]
name = "packaging"
version = "24.2"
version = "23.0"
description = "Core utilities for Python packages"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.7"
files = [
{file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"},
{file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"},
{file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"},
{file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"},
]
[[package]]

View File

@@ -106,7 +106,6 @@ jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] }
signature = "2"
ecdsa = "0.16"
p256 = { version = "0.13", features = ["jwk"] }
ed25519-dalek = { version = "2", default-features = false, features = ["rand_core"] }
rsa = "0.9"
workspace_hack.workspace = true
@@ -121,4 +120,4 @@ rcgen.workspace = true
rstest.workspace = true
walkdir.workspace = true
rand_distr = "0.4"
tokio-postgres = { workspace = true, features = ["with-serde_json-1"] }
tokio-postgres.workspace = true

View File

@@ -1,5 +1,3 @@
use std::fmt;
use async_trait::async_trait;
use postgres_client::config::SslMode;
use pq_proto::BeMessage as Be;
@@ -7,19 +5,15 @@ use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, info_span};
use super::{ComputeCredentialKeys, ControlPlaneApi};
use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo};
use super::ComputeCredentialKeys;
use crate::auth::IpPattern;
use crate::cache::Cached;
use crate::config::AuthenticationConfig;
use crate::context::RequestContext;
use crate::control_plane::client::cplane_proxy_v1;
use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
use crate::error::{ReportableError, UserFacingError};
use crate::proxy::connect_compute::ComputeConnectBackend;
use crate::proxy::NeonOptions;
use crate::stream::PqStream;
use crate::types::RoleName;
use crate::{auth, compute, waiters};
#[derive(Debug, Error)]
@@ -37,13 +31,6 @@ pub(crate) enum ConsoleRedirectError {
#[derive(Debug)]
pub struct ConsoleRedirectBackend {
console_uri: reqwest::Url,
api: cplane_proxy_v1::NeonControlPlaneClient,
}
impl fmt::Debug for cplane_proxy_v1::NeonControlPlaneClient {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "NeonControlPlaneClient")
}
}
impl UserFacingError for ConsoleRedirectError {
@@ -84,24 +71,9 @@ pub(crate) fn new_psql_session_id() -> String {
hex::encode(rand::random::<[u8; 8]>())
}
#[async_trait]
impl BackendIpAllowlist for ConsoleRedirectBackend {
async fn get_allowed_ips(
&self,
ctx: &RequestContext,
user_info: &ComputeUserInfo,
) -> auth::Result<Vec<auth::IpPattern>> {
self.api
.get_allowed_ips_and_secret(ctx, user_info)
.await
.map(|(ips, _)| ips.0.clone())
.map_err(|e| e.into())
}
}
impl ConsoleRedirectBackend {
pub fn new(console_uri: reqwest::Url, api: cplane_proxy_v1::NeonControlPlaneClient) -> Self {
Self { console_uri, api }
pub fn new(console_uri: reqwest::Url) -> Self {
Self { console_uri }
}
pub(crate) async fn authenticate(
@@ -109,16 +81,10 @@ impl ConsoleRedirectBackend {
ctx: &RequestContext,
auth_config: &'static AuthenticationConfig,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<(
ConsoleRedirectNodeInfo,
ComputeUserInfo,
Option<Vec<IpPattern>>,
)> {
) -> auth::Result<(ConsoleRedirectNodeInfo, Option<Vec<IpPattern>>)> {
authenticate(ctx, auth_config, &self.console_uri, client)
.await
.map(|(node_info, user_info, ip_allowlist)| {
(ConsoleRedirectNodeInfo(node_info), user_info, ip_allowlist)
})
.map(|(node_info, ip_allowlist)| (ConsoleRedirectNodeInfo(node_info), ip_allowlist))
}
}
@@ -143,7 +109,7 @@ async fn authenticate(
auth_config: &'static AuthenticationConfig,
link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<(NodeInfo, ComputeUserInfo, Option<Vec<IpPattern>>)> {
) -> auth::Result<(NodeInfo, Option<Vec<IpPattern>>)> {
ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect);
// registering waiter can fail if we get unlucky with rng.
@@ -198,15 +164,8 @@ async fn authenticate(
let mut config = compute::ConnCfg::new(db_info.host.to_string(), db_info.port);
config.dbname(&db_info.dbname).user(&db_info.user);
let user: RoleName = db_info.user.into();
let user_info = ComputeUserInfo {
endpoint: db_info.aux.endpoint_id.as_str().into(),
user: user.clone(),
options: NeonOptions::default(),
};
ctx.set_dbname(db_info.dbname.into());
ctx.set_user(user);
ctx.set_user(db_info.user.into());
ctx.set_project(db_info.aux.clone());
info!("woken up a compute node");
@@ -229,7 +188,6 @@ async fn authenticate(
config,
aux: db_info.aux,
},
user_info,
db_info.allowed_ips,
))
}

View File

@@ -44,18 +44,6 @@ pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
) -> impl Future<Output = Result<Vec<AuthRule>, FetchAuthRulesError>> + Send;
}
#[derive(Clone)]
pub(crate) struct StaticAuthRules(pub Vec<AuthRule>);
impl FetchAuthRules for StaticAuthRules {
async fn fetch_auth_rules(
&self,
_ctx: &RequestContext,
_endpoint: EndpointId,
) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
Ok(self.0.clone())
}
}
#[derive(Error, Debug)]
pub(crate) enum FetchAuthRulesError {
#[error(transparent)]
@@ -65,7 +53,7 @@ pub(crate) enum FetchAuthRulesError {
RoleJwksNotConfigured,
}
#[derive(Clone, Debug, PartialEq)]
#[derive(Clone)]
pub(crate) struct AuthRule {
pub(crate) id: String,
pub(crate) jwks_url: url::Url,

Some files were not shown because too many files have changed in this diff Show More