mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-05 11:40:37 +00:00
Compare commits
76 Commits
problame/f
...
conrad/mem
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cfb2e3c178 | ||
|
|
0a34084ba5 | ||
|
|
b33047df7e | ||
|
|
1c5477619f | ||
|
|
40f5b3e8df | ||
|
|
791b5d736b | ||
|
|
96bcfba79e | ||
|
|
8e95455aef | ||
|
|
f3ef60d236 | ||
|
|
8f627ea0ab | ||
|
|
6a353c33e3 | ||
|
|
64d0008389 | ||
|
|
53a05e8ccb | ||
|
|
62c0152e6b | ||
|
|
7fef4435c1 | ||
|
|
43fd5b218b | ||
|
|
29ee273d78 | ||
|
|
8b0f2efa57 | ||
|
|
b309cbc6e9 | ||
|
|
f0c0733a64 | ||
|
|
8862e7c4bf | ||
|
|
b7fc5a2fe0 | ||
|
|
4559ba79b6 | ||
|
|
5dd24c7ad8 | ||
|
|
f2828bbe19 | ||
|
|
fb796229bf | ||
|
|
267fb49908 | ||
|
|
e2982ed3ec | ||
|
|
9e154a8130 | ||
|
|
79d72c94e8 | ||
|
|
80e5771c67 | ||
|
|
1178f6fe7c | ||
|
|
8b18d8b31b | ||
|
|
3e4cbaed67 | ||
|
|
c71aea0223 | ||
|
|
87915df2fa | ||
|
|
caca08fe78 | ||
|
|
0c99f16c60 | ||
|
|
dd7fff655a | ||
|
|
809633903d | ||
|
|
5c934efb29 | ||
|
|
5c9c3b3317 | ||
|
|
921a4f2009 | ||
|
|
eb93c3e3c6 | ||
|
|
7a7ab2a1d1 | ||
|
|
ff526a1051 | ||
|
|
9a2456bea5 | ||
|
|
a456e818af | ||
|
|
3e6fdb0aa6 | ||
|
|
f8d3f86f58 | ||
|
|
f67a8a173e | ||
|
|
2288efae66 | ||
|
|
4fedcbc0ac | ||
|
|
eb830fa547 | ||
|
|
a203f9829a | ||
|
|
42ab34dc36 | ||
|
|
30b877074c | ||
|
|
f18cc808f0 | ||
|
|
d14d8271b8 | ||
|
|
fecb707b19 | ||
|
|
296c9190b2 | ||
|
|
a5fe67f361 | ||
|
|
ee7bb1a667 | ||
|
|
9bba31bf68 | ||
|
|
380d167b7c | ||
|
|
cb991fba42 | ||
|
|
4566b12a22 | ||
|
|
63ca084696 | ||
|
|
379259bdd7 | ||
|
|
3300207523 | ||
|
|
a0a7733b5a | ||
|
|
f4245403b3 | ||
|
|
a8db7ebffb | ||
|
|
154f6dc59c | ||
|
|
15f633922a | ||
|
|
c34d36d8a2 |
@@ -30,6 +30,7 @@ workspace-members = [
|
||||
"vm_monitor",
|
||||
# All of these exist in libs and are not usually built independently.
|
||||
# Putting workspace hack there adds a bottleneck for cargo builds.
|
||||
"alloc-metrics",
|
||||
"compute_api",
|
||||
"consumption_metrics",
|
||||
"desim",
|
||||
|
||||
@@ -27,4 +27,4 @@
|
||||
!storage_controller/
|
||||
!vendor/postgres-*/
|
||||
!workspace_hack/
|
||||
!build_tools/patches
|
||||
!build-tools/patches
|
||||
|
||||
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -31,6 +31,7 @@ config-variables:
|
||||
- NEON_PROD_AWS_ACCOUNT_ID
|
||||
- PGREGRESS_PG16_PROJECT_ID
|
||||
- PGREGRESS_PG17_PROJECT_ID
|
||||
- PREWARM_PGBENCH_SIZE
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||
- REMOTE_STORAGE_AZURE_REGION
|
||||
- SLACK_CICD_CHANNEL_ID
|
||||
|
||||
@@ -176,7 +176,13 @@ runs:
|
||||
fi
|
||||
|
||||
if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
# We don't use code coverage for regression tests (the step is disabled),
|
||||
# so there's no need to collect it.
|
||||
# Ref https://github.com/neondatabase/neon/issues/4540
|
||||
# cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
cov_prefix=()
|
||||
# Explicitly set LLVM_PROFILE_FILE to /dev/null to avoid writing *.profraw files
|
||||
export LLVM_PROFILE_FILE=/dev/null
|
||||
else
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
@@ -150,7 +150,7 @@ jobs:
|
||||
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
|
||||
use-fallback: false
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
@@ -162,7 +162,7 @@ jobs:
|
||||
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
|
||||
use-fallback: false
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
@@ -174,7 +174,7 @@ jobs:
|
||||
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
|
||||
use-fallback: false
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
|
||||
|
||||
- name: Cache postgres v17 build
|
||||
id: cache_pg_17
|
||||
@@ -186,7 +186,7 @@ jobs:
|
||||
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
|
||||
use-fallback: false
|
||||
path: pg_install/v17
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
|
||||
|
||||
- name: Build all
|
||||
# Note: the Makefile picks up BUILD_TYPE and CARGO_PROFILE from the env variables
|
||||
|
||||
72
.github/workflows/benchmarking.yml
vendored
72
.github/workflows/benchmarking.yml
vendored
@@ -219,6 +219,7 @@ jobs:
|
||||
--ignore test_runner/performance/test_cumulative_statistics_persistence.py
|
||||
--ignore test_runner/performance/test_perf_many_relations.py
|
||||
--ignore test_runner/performance/test_perf_oltp_large_tenant.py
|
||||
--ignore test_runner/performance/test_lfc_prewarm.py
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -410,6 +411,77 @@ jobs:
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
prewarm-test:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
env:
|
||||
PGBENCH_SIZE: ${{ vars.PREWARM_PGBENCH_SIZE }}
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 17
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: "neon-staging"
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
|
||||
credentials:
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Run prewarm benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance/test_lfc_prewarm.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 5400
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
generate-matrices:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
|
||||
|
||||
@@ -72,7 +72,7 @@ jobs:
|
||||
ARCHS: ${{ inputs.archs || '["x64","arm64"]' }}
|
||||
DEBIANS: ${{ inputs.debians || '["bullseye","bookworm"]' }}
|
||||
IMAGE_TAG: |
|
||||
${{ hashFiles('build-tools.Dockerfile',
|
||||
${{ hashFiles('build-tools/Dockerfile',
|
||||
'.github/workflows/build-build-tools-image.yml') }}
|
||||
run: |
|
||||
echo "archs=${ARCHS}" | tee -a ${GITHUB_OUTPUT}
|
||||
@@ -144,7 +144,7 @@ jobs:
|
||||
|
||||
- uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0
|
||||
with:
|
||||
file: build-tools.Dockerfile
|
||||
file: build-tools/Dockerfile
|
||||
context: .
|
||||
provenance: false
|
||||
push: true
|
||||
|
||||
43
.github/workflows/build_and_test.yml
vendored
43
.github/workflows/build_and_test.yml
vendored
@@ -87,22 +87,27 @@ jobs:
|
||||
uses: ./.github/workflows/build-build-tools-image.yml
|
||||
secrets: inherit
|
||||
|
||||
lint-openapi-spec:
|
||||
runs-on: ubuntu-22.04
|
||||
needs: [ meta, check-permissions ]
|
||||
lint-yamls:
|
||||
needs: [ meta, check-permissions, build-build-tools-image ]
|
||||
# We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
- uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- run: make -C compute manifest-schema-validation
|
||||
- run: make lint-openapi-spec
|
||||
|
||||
check-codestyle-python:
|
||||
@@ -217,28 +222,6 @@ jobs:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
secrets: inherit
|
||||
|
||||
validate-compute-manifest:
|
||||
runs-on: ubuntu-22.04
|
||||
needs: [ meta, check-permissions ]
|
||||
# We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Set up Node.js
|
||||
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
|
||||
with:
|
||||
node-version: '24'
|
||||
|
||||
- name: Validate manifest against schema
|
||||
run: |
|
||||
make -C compute manifest-schema-validation
|
||||
|
||||
build-and-test-locally:
|
||||
needs: [ meta, build-build-tools-image ]
|
||||
# We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -29,3 +29,6 @@ docker-compose/docker-compose-parallel.yml
|
||||
|
||||
# pgindent typedef lists
|
||||
*.list
|
||||
|
||||
# Node
|
||||
**/node_modules/
|
||||
|
||||
8
.gitmodules
vendored
8
.gitmodules
vendored
@@ -1,16 +1,16 @@
|
||||
[submodule "vendor/postgres-v14"]
|
||||
path = vendor/postgres-v14
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
url = ../postgres.git
|
||||
branch = REL_14_STABLE_neon
|
||||
[submodule "vendor/postgres-v15"]
|
||||
path = vendor/postgres-v15
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
url = ../postgres.git
|
||||
branch = REL_15_STABLE_neon
|
||||
[submodule "vendor/postgres-v16"]
|
||||
path = vendor/postgres-v16
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
url = ../postgres.git
|
||||
branch = REL_16_STABLE_neon
|
||||
[submodule "vendor/postgres-v17"]
|
||||
path = vendor/postgres-v17
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
url = ../postgres.git
|
||||
branch = REL_17_STABLE_neon
|
||||
|
||||
151
Cargo.lock
generated
151
Cargo.lock
generated
@@ -61,6 +61,17 @@ dependencies = [
|
||||
"equator",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "alloc-metrics"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"criterion",
|
||||
"measured",
|
||||
"metrics",
|
||||
"thread_local",
|
||||
"tikv-jemallocator",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "allocator-api2"
|
||||
version = "0.2.16"
|
||||
@@ -1330,6 +1341,7 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap",
|
||||
"compute_api",
|
||||
"fail",
|
||||
"flate2",
|
||||
"futures",
|
||||
"hostname-validator",
|
||||
@@ -1338,7 +1350,6 @@ dependencies = [
|
||||
"itertools 0.10.5",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"neon_failpoint",
|
||||
"nix 0.30.1",
|
||||
"notify",
|
||||
"num_cpus",
|
||||
@@ -1872,6 +1883,7 @@ dependencies = [
|
||||
"diesel_derives",
|
||||
"itoa",
|
||||
"serde_json",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2533,6 +2545,18 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"r-efi",
|
||||
"wasi 0.14.2+wasi-0.2.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gettid"
|
||||
version = "0.1.3"
|
||||
@@ -2891,13 +2915,13 @@ dependencies = [
|
||||
"arc-swap",
|
||||
"bytes",
|
||||
"camino",
|
||||
"fail",
|
||||
"futures",
|
||||
"hyper 0.14.30",
|
||||
"itertools 0.10.5",
|
||||
"jemalloc_pprof",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"neon_failpoint",
|
||||
"once_cell",
|
||||
"pprof",
|
||||
"regex",
|
||||
@@ -3606,9 +3630,9 @@ checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.10"
|
||||
version = "0.4.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16"
|
||||
checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"scopeguard",
|
||||
@@ -3758,7 +3782,7 @@ dependencies = [
|
||||
"procfs",
|
||||
"prometheus",
|
||||
"rand 0.8.5",
|
||||
"rand_distr",
|
||||
"rand_distr 0.4.3",
|
||||
"twox-hash",
|
||||
]
|
||||
|
||||
@@ -3846,29 +3870,17 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
|
||||
name = "neon-shmem"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"lock_api",
|
||||
"nix 0.30.1",
|
||||
"rand 0.9.1",
|
||||
"rand_distr 0.5.1",
|
||||
"rustc-hash 2.1.1",
|
||||
"tempfile",
|
||||
"thiserror 1.0.69",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "neon_failpoint"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"either",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"serde",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "never-say-never"
|
||||
version = "6.6.666"
|
||||
@@ -4313,6 +4325,7 @@ dependencies = [
|
||||
"pageserver_client",
|
||||
"pageserver_client_grpc",
|
||||
"pageserver_page_api",
|
||||
"pprof",
|
||||
"rand 0.8.5",
|
||||
"reqwest",
|
||||
"serde",
|
||||
@@ -4374,6 +4387,7 @@ dependencies = [
|
||||
"either",
|
||||
"enum-map",
|
||||
"enumset",
|
||||
"fail",
|
||||
"futures",
|
||||
"hashlink",
|
||||
"hex",
|
||||
@@ -4388,7 +4402,6 @@ dependencies = [
|
||||
"jsonwebtoken",
|
||||
"md5",
|
||||
"metrics",
|
||||
"neon_failpoint",
|
||||
"nix 0.30.1",
|
||||
"num-traits",
|
||||
"num_cpus",
|
||||
@@ -5299,6 +5312,7 @@ name = "proxy"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"alloc-metrics",
|
||||
"anyhow",
|
||||
"arc-swap",
|
||||
"assert-json-diff",
|
||||
@@ -5306,6 +5320,7 @@ dependencies = [
|
||||
"async-trait",
|
||||
"atomic-take",
|
||||
"aws-config",
|
||||
"aws-credential-types",
|
||||
"aws-sdk-iam",
|
||||
"aws-sigv4",
|
||||
"base64 0.22.1",
|
||||
@@ -5345,6 +5360,7 @@ dependencies = [
|
||||
"itoa",
|
||||
"jose-jwa",
|
||||
"jose-jwk",
|
||||
"json",
|
||||
"lasso",
|
||||
"measured",
|
||||
"metrics",
|
||||
@@ -5361,7 +5377,7 @@ dependencies = [
|
||||
"postgres_backend",
|
||||
"pq_proto",
|
||||
"rand 0.8.5",
|
||||
"rand_distr",
|
||||
"rand_distr 0.4.3",
|
||||
"rcgen",
|
||||
"redis",
|
||||
"regex",
|
||||
@@ -5372,7 +5388,7 @@ dependencies = [
|
||||
"reqwest-tracing",
|
||||
"rsa",
|
||||
"rstest",
|
||||
"rustc-hash 1.1.0",
|
||||
"rustc-hash 2.1.1",
|
||||
"rustls 0.23.27",
|
||||
"rustls-native-certs 0.8.0",
|
||||
"rustls-pemfile 2.1.1",
|
||||
@@ -5465,6 +5481,12 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "r-efi"
|
||||
version = "5.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.7.3"
|
||||
@@ -5489,6 +5511,16 @@ dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
|
||||
dependencies = [
|
||||
"rand_chacha 0.9.0",
|
||||
"rand_core 0.9.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.2.2"
|
||||
@@ -5509,6 +5541,16 @@ dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.9.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.5.1"
|
||||
@@ -5527,6 +5569,15 @@ dependencies = [
|
||||
"getrandom 0.2.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
|
||||
dependencies = [
|
||||
"getrandom 0.3.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_distr"
|
||||
version = "0.4.3"
|
||||
@@ -5537,6 +5588,16 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_distr"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
"rand 0.9.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_hc"
|
||||
version = "0.2.0"
|
||||
@@ -6208,6 +6269,7 @@ dependencies = [
|
||||
"criterion",
|
||||
"desim",
|
||||
"env_logger",
|
||||
"fail",
|
||||
"futures",
|
||||
"hex",
|
||||
"http 1.1.0",
|
||||
@@ -6217,7 +6279,7 @@ dependencies = [
|
||||
"itertools 0.10.5",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"neon_failpoint",
|
||||
"nix 0.30.1",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"parking_lot 0.12.1",
|
||||
@@ -6225,6 +6287,7 @@ dependencies = [
|
||||
"postgres-protocol",
|
||||
"postgres_backend",
|
||||
"postgres_ffi",
|
||||
"postgres_ffi_types",
|
||||
"postgres_versioninfo",
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
@@ -6269,7 +6332,7 @@ dependencies = [
|
||||
"anyhow",
|
||||
"const_format",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"postgres_ffi_types",
|
||||
"postgres_versioninfo",
|
||||
"pq_proto",
|
||||
"serde",
|
||||
@@ -6904,6 +6967,7 @@ dependencies = [
|
||||
"diesel",
|
||||
"diesel-async",
|
||||
"diesel_migrations",
|
||||
"fail",
|
||||
"futures",
|
||||
"governor",
|
||||
"hex",
|
||||
@@ -6916,7 +6980,6 @@ dependencies = [
|
||||
"lasso",
|
||||
"measured",
|
||||
"metrics",
|
||||
"neon_failpoint",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
@@ -6945,6 +7008,7 @@ dependencies = [
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"utils",
|
||||
"uuid",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
@@ -7008,6 +7072,7 @@ dependencies = [
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"reqwest",
|
||||
"safekeeper_api",
|
||||
"serde_json",
|
||||
"storage_controller_client",
|
||||
"tokio",
|
||||
@@ -7279,12 +7344,10 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "thread_local"
|
||||
version = "1.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152"
|
||||
version = "1.1.9"
|
||||
source = "git+https://github.com/conradludgate/thread_local-rs?branch=no-tls-destructor-get#f9ca3d375745c14a632ae3ffe6a7a646dc8421a0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7577,6 +7640,7 @@ dependencies = [
|
||||
"futures-core",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -8181,7 +8245,7 @@ dependencies = [
|
||||
"const_format",
|
||||
"criterion",
|
||||
"diatomic-waker",
|
||||
"either",
|
||||
"fail",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
@@ -8189,7 +8253,6 @@ dependencies = [
|
||||
"humantime",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"neon_failpoint",
|
||||
"nix 0.30.1",
|
||||
"once_cell",
|
||||
"pem",
|
||||
@@ -8217,6 +8280,7 @@ dependencies = [
|
||||
"tracing-error",
|
||||
"tracing-subscriber",
|
||||
"tracing-utils",
|
||||
"uuid",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
@@ -8359,6 +8423,15 @@ version = "0.11.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.14.2+wasi-0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
|
||||
dependencies = [
|
||||
"wit-bindgen-rt",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasite"
|
||||
version = "0.1.0"
|
||||
@@ -8716,6 +8789,15 @@ dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen-rt"
|
||||
version = "0.39.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
|
||||
dependencies = [
|
||||
"bitflags 2.8.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "workspace_hack"
|
||||
version = "0.1.0"
|
||||
@@ -8818,7 +8900,6 @@ dependencies = [
|
||||
"tracing-log",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"uuid",
|
||||
"zeroize",
|
||||
"zstd",
|
||||
"zstd-safe",
|
||||
|
||||
13
Cargo.toml
13
Cargo.toml
@@ -21,7 +21,6 @@ members = [
|
||||
"workspace_hack",
|
||||
"libs/compute_api",
|
||||
"libs/http-utils",
|
||||
"libs/neon_failpoint",
|
||||
"libs/pageserver_api",
|
||||
"libs/postgres_ffi",
|
||||
"libs/postgres_ffi_types",
|
||||
@@ -98,6 +97,7 @@ diatomic-waker = { version = "0.2.3" }
|
||||
either = "1.8"
|
||||
enum-map = "2.4.2"
|
||||
enumset = "1.0.12"
|
||||
fail = "0.5.0"
|
||||
fallible-iterator = "0.2"
|
||||
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
|
||||
futures = "0.3"
|
||||
@@ -130,6 +130,7 @@ jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] }
|
||||
jsonwebtoken = "9"
|
||||
lasso = "0.7"
|
||||
libc = "0.2"
|
||||
lock_api = "0.4.13"
|
||||
md5 = "0.7.0"
|
||||
measured = { version = "0.0.22", features=["lasso"] }
|
||||
measured-process = { version = "0.0.22" }
|
||||
@@ -165,7 +166,7 @@ reqwest-middleware = "0.4"
|
||||
reqwest-retry = "0.7"
|
||||
routerify = "3"
|
||||
rpds = "0.13"
|
||||
rustc-hash = "1.1.0"
|
||||
rustc-hash = "2.1.1"
|
||||
rustls = { version = "0.23.16", default-features = false }
|
||||
rustls-pemfile = "2"
|
||||
rustls-pki-types = "1.11"
|
||||
@@ -194,6 +195,7 @@ sync_wrapper = "0.1.2"
|
||||
tar = "0.4"
|
||||
test-context = "0.3"
|
||||
thiserror = "1.0"
|
||||
thread_local = "1.1.9"
|
||||
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
|
||||
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
|
||||
tokio = { version = "1.43.1", features = ["macros"] }
|
||||
@@ -201,7 +203,7 @@ tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.g
|
||||
tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.12.0"
|
||||
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
|
||||
tokio-stream = "0.1"
|
||||
tokio-stream = { version = "0.1", features = ["sync"] }
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
|
||||
toml = "0.8"
|
||||
@@ -252,13 +254,13 @@ azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git"
|
||||
azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
|
||||
## Local libraries
|
||||
alloc-metrics = { version = "0.1", path = "./libs/alloc-metrics/" }
|
||||
compute_api = { version = "0.1", path = "./libs/compute_api/" }
|
||||
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
|
||||
desim = { version = "0.1", path = "./libs/desim" }
|
||||
endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
|
||||
http-utils = { version = "0.1", path = "./libs/http-utils/" }
|
||||
metrics = { version = "0.1", path = "./libs/metrics/" }
|
||||
neon_failpoint = { version = "0.1", path = "./libs/neon_failpoint/" }
|
||||
neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
|
||||
pageserver = { path = "./pageserver" }
|
||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
||||
@@ -302,6 +304,9 @@ tonic-build = "0.13.1"
|
||||
# Needed to get `tokio-postgres-rustls` to depend on our fork.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
|
||||
# Needed to fix a bug in alloc-metrics
|
||||
thread_local = { git = "https://github.com/conradludgate/thread_local-rs", branch = "no-tls-destructor-get" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
[profile.release]
|
||||
|
||||
14
Makefile
14
Makefile
@@ -2,7 +2,7 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
|
||||
# Where to install Postgres, default is ./pg_install, maybe useful for package
|
||||
# managers.
|
||||
POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/
|
||||
POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install
|
||||
|
||||
# Supported PostgreSQL versions
|
||||
POSTGRES_VERSIONS = v17 v16 v15 v14
|
||||
@@ -14,7 +14,7 @@ POSTGRES_VERSIONS = v17 v16 v15 v14
|
||||
# it is derived from BUILD_TYPE.
|
||||
|
||||
# All intermediate build artifacts are stored here.
|
||||
BUILD_DIR := build
|
||||
BUILD_DIR := $(ROOT_PROJECT_DIR)/build
|
||||
|
||||
ICU_PREFIX_DIR := /usr/local/icu
|
||||
|
||||
@@ -212,7 +212,7 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
|
||||
FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \
|
||||
INDENT=$(BUILD_DIR)/v17/src/tools/pg_bsd_indent/pg_bsd_indent \
|
||||
PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \
|
||||
-C $(BUILD_DIR)/neon-v17 \
|
||||
-C $(BUILD_DIR)/pgxn-v17/neon \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
|
||||
|
||||
|
||||
@@ -220,11 +220,15 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
|
||||
setup-pre-commit-hook:
|
||||
ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
|
||||
|
||||
build-tools/node_modules: build-tools/package.json
|
||||
cd build-tools && $(if $(CI),npm ci,npm install)
|
||||
touch build-tools/node_modules
|
||||
|
||||
.PHONY: lint-openapi-spec
|
||||
lint-openapi-spec:
|
||||
lint-openapi-spec: build-tools/node_modules
|
||||
# operation-2xx-response: pageserver timeline delete returns 404 on success
|
||||
find . -iname "openapi_spec.y*ml" -exec\
|
||||
docker run --rm -v ${PWD}:/spec ghcr.io/redocly/cli:1.34.4\
|
||||
npx --prefix=build-tools/ redocly\
|
||||
--skip-rule=operation-operationId --skip-rule=operation-summary --extends=minimal\
|
||||
--skip-rule=no-server-example.com --skip-rule=operation-2xx-response\
|
||||
lint {} \+
|
||||
|
||||
@@ -35,7 +35,7 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
|
||||
echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
|
||||
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
|
||||
|
||||
COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
|
||||
COPY build-tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
|
||||
|
||||
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
|
||||
set -e && \
|
||||
@@ -188,6 +188,12 @@ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# Install node
|
||||
ENV NODE_VERSION=24
|
||||
RUN curl -fsSL https://deb.nodesource.com/setup_${NODE_VERSION}.x | bash - \
|
||||
&& apt install -y nodejs \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# Install docker
|
||||
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
|
||||
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION} stable" > /etc/apt/sources.list.d/docker.list \
|
||||
@@ -311,14 +317,14 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
. "$HOME/.cargo/env" && \
|
||||
cargo --version && rustup --version && \
|
||||
rustup component add llvm-tools rustfmt clippy && \
|
||||
cargo install rustfilt --version ${RUSTFILT_VERSION} --locked && \
|
||||
cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} --locked && \
|
||||
cargo install cargo-deny --version ${CARGO_DENY_VERSION} --locked && \
|
||||
cargo install cargo-hack --version ${CARGO_HACK_VERSION} --locked && \
|
||||
cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} --locked && \
|
||||
cargo install cargo-chef --version ${CARGO_CHEF_VERSION} --locked && \
|
||||
cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} --locked \
|
||||
--features postgres-bundled --no-default-features && \
|
||||
cargo install rustfilt --locked --version ${RUSTFILT_VERSION} && \
|
||||
cargo install cargo-hakari --locked --version ${CARGO_HAKARI_VERSION} && \
|
||||
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
|
||||
cargo install cargo-hack --locked --version ${CARGO_HACK_VERSION} && \
|
||||
cargo install cargo-nextest --locked --version ${CARGO_NEXTEST_VERSION} && \
|
||||
cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \
|
||||
cargo install diesel_cli --locked --version ${CARGO_DIESEL_CLI_VERSION} \
|
||||
--features postgres-bundled --no-default-features && \
|
||||
rm -rf /home/nonroot/.cargo/registry && \
|
||||
rm -rf /home/nonroot/.cargo/git
|
||||
|
||||
3189
build-tools/package-lock.json
generated
Normal file
3189
build-tools/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
8
build-tools/package.json
Normal file
8
build-tools/package.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"name": "build-tools",
|
||||
"private": true,
|
||||
"devDependencies": {
|
||||
"@redocly/cli": "1.34.4",
|
||||
"@sourcemeta/jsonschema": "10.0.0"
|
||||
}
|
||||
}
|
||||
@@ -50,9 +50,9 @@ jsonnetfmt-format:
|
||||
jsonnetfmt --in-place $(jsonnet_files)
|
||||
|
||||
.PHONY: manifest-schema-validation
|
||||
manifest-schema-validation: node_modules
|
||||
node_modules/.bin/jsonschema validate -d https://json-schema.org/draft/2020-12/schema manifest.schema.json manifest.yaml
|
||||
manifest-schema-validation: ../build-tools/node_modules
|
||||
npx --prefix=../build-tools/ jsonschema validate -d https://json-schema.org/draft/2020-12/schema manifest.schema.json manifest.yaml
|
||||
|
||||
node_modules: package.json
|
||||
npm install
|
||||
touch node_modules
|
||||
../build-tools/node_modules: ../build-tools/package.json
|
||||
cd ../build-tools && $(if $(CI),npm ci,npm install)
|
||||
touch ../build-tools/node_modules
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
#
|
||||
# build-tools: This contains Rust compiler toolchain and other tools needed at compile
|
||||
# time. This is also used for the storage builds. This image is defined in
|
||||
# build-tools.Dockerfile.
|
||||
# build-tools/Dockerfile.
|
||||
#
|
||||
# build-deps: Contains C compiler, other build tools, and compile-time dependencies
|
||||
# needed to compile PostgreSQL and most extensions. (Some extensions need
|
||||
@@ -115,7 +115,7 @@ ARG EXTENSIONS=all
|
||||
FROM $BASE_IMAGE_SHA AS build-deps
|
||||
ARG DEBIAN_VERSION
|
||||
|
||||
# Keep in sync with build-tools.Dockerfile
|
||||
# Keep in sync with build-tools/Dockerfile
|
||||
ENV PROTOC_VERSION=25.1
|
||||
|
||||
# Use strict mode for bash to catch errors early
|
||||
@@ -170,7 +170,29 @@ RUN case $DEBIAN_VERSION in \
|
||||
FROM build-deps AS pg-build
|
||||
ARG PG_VERSION
|
||||
COPY vendor/postgres-${PG_VERSION:?} postgres
|
||||
COPY compute/patches/postgres_fdw.patch .
|
||||
COPY compute/patches/pg_stat_statements_pg14-16.patch .
|
||||
COPY compute/patches/pg_stat_statements_pg17.patch .
|
||||
RUN cd postgres && \
|
||||
# Apply patches to some contrib extensions
|
||||
# For example, we need to grant EXECUTE on pg_stat_statements_reset() to {privileged_role_name}.
|
||||
# In vanilla Postgres this function is limited to Postgres role superuser.
|
||||
# In Neon we have {privileged_role_name} role that is not a superuser but replaces superuser in some cases.
|
||||
# We could add the additional grant statements to the Postgres repository but it would be hard to maintain,
|
||||
# whenever we need to pick up a new Postgres version and we want to limit the changes in our Postgres fork,
|
||||
# so we do it here.
|
||||
case "${PG_VERSION}" in \
|
||||
"v14" | "v15" | "v16") \
|
||||
patch -p1 < /pg_stat_statements_pg14-16.patch; \
|
||||
;; \
|
||||
"v17") \
|
||||
patch -p1 < /pg_stat_statements_pg17.patch; \
|
||||
;; \
|
||||
*) \
|
||||
# Fail explicitly so that we do not forget to migrate the patches to the next major version
|
||||
echo "No contrib patches for this PostgreSQL version" && exit 1;; \
|
||||
esac && \
|
||||
patch -p1 < /postgres_fdw.patch && \
|
||||
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \
|
||||
--with-icu --with-libxml --with-libxslt --with-lz4" && \
|
||||
if [ "${PG_VERSION:?}" != "v14" ]; then \
|
||||
@@ -184,8 +206,6 @@ RUN cd postgres && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgres_fdw.control && \
|
||||
file=/usr/local/pgsql/share/extension/postgres_fdw--1.0.sql && [ -e $file ] && \
|
||||
echo 'GRANT USAGE ON FOREIGN DATA WRAPPER postgres_fdw TO neon_superuser;' >> $file && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
|
||||
@@ -195,34 +215,7 @@ RUN cd postgres && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
|
||||
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
|
||||
# In vanilla postgres this function is limited to Postgres role superuser.
|
||||
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
|
||||
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
|
||||
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
|
||||
# so we do it here.
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
# Note that there are no downgrade scripts for pg_stat_statements, so we \
|
||||
# don't have to modify any downgrade paths or (much) older versions: we only \
|
||||
# have to make sure every creation of the pg_stat_statements_reset function \
|
||||
# also adds execute permissions to the neon_superuser.
|
||||
case $filename in \
|
||||
pg_stat_statements--1.4.sql) \
|
||||
# pg_stat_statements_reset is first created with 1.4
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
|
||||
;; \
|
||||
pg_stat_statements--1.6--1.7.sql) \
|
||||
# Then with the 1.6-1.7 migration it is re-created with a new signature, thus add the permissions back
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
|
||||
;; \
|
||||
pg_stat_statements--1.10--1.11.sql) \
|
||||
# Then with the 1.10-1.11 migration it is re-created with a new signature again, thus add the permissions back
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO neon_superuser;' >> $file; \
|
||||
;; \
|
||||
esac; \
|
||||
done;
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
|
||||
|
||||
# Set PATH for all the subsequent build steps
|
||||
ENV PATH="/usr/local/pgsql/bin:$PATH"
|
||||
@@ -1524,7 +1517,7 @@ WORKDIR /ext-src
|
||||
COPY compute/patches/pg_duckdb_v031.patch .
|
||||
COPY compute/patches/duckdb_v120.patch .
|
||||
# pg_duckdb build requires source dir to be a git repo to get submodules
|
||||
# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
|
||||
# allow {privileged_role_name} to execute some functions that in pg_duckdb are available to superuser only:
|
||||
# - extension management function duckdb.install_extension()
|
||||
# - access to duckdb.extensions table and its sequence
|
||||
RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
|
||||
@@ -1790,7 +1783,7 @@ RUN set -e \
|
||||
#########################################################################################
|
||||
FROM build-deps AS exporters
|
||||
ARG TARGETARCH
|
||||
# Keep sql_exporter version same as in build-tools.Dockerfile and
|
||||
# Keep sql_exporter version same as in build-tools/Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py
|
||||
# See comment at the top of the file regarding `echo`, `-e` and `\n`
|
||||
RUN if [ "$TARGETARCH" = "amd64" ]; then\
|
||||
|
||||
@@ -1,7 +0,0 @@
|
||||
{
|
||||
"name": "neon-compute",
|
||||
"private": true,
|
||||
"dependencies": {
|
||||
"@sourcemeta/jsonschema": "9.3.4"
|
||||
}
|
||||
}
|
||||
@@ -1,22 +1,26 @@
|
||||
diff --git a/sql/anon.sql b/sql/anon.sql
|
||||
index 0cdc769..b450327 100644
|
||||
index 0cdc769..5eab1d6 100644
|
||||
--- a/sql/anon.sql
|
||||
+++ b/sql/anon.sql
|
||||
@@ -1141,3 +1141,15 @@ $$
|
||||
@@ -1141,3 +1141,19 @@ $$
|
||||
-- TODO : https://en.wikipedia.org/wiki/L-diversity
|
||||
|
||||
-- TODO : https://en.wikipedia.org/wiki/T-closeness
|
||||
+
|
||||
+-- NEON Patches
|
||||
+
|
||||
+GRANT ALL ON SCHEMA anon to neon_superuser;
|
||||
+GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser;
|
||||
+
|
||||
+DO $$
|
||||
+DECLARE
|
||||
+ privileged_role_name text;
|
||||
+BEGIN
|
||||
+ IF current_setting('server_version_num')::int >= 150000 THEN
|
||||
+ GRANT SET ON PARAMETER anon.transparent_dynamic_masking TO neon_superuser;
|
||||
+ END IF;
|
||||
+ privileged_role_name := current_setting('neon.privileged_role_name');
|
||||
+
|
||||
+ EXECUTE format('GRANT ALL ON SCHEMA anon to %I', privileged_role_name);
|
||||
+ EXECUTE format('GRANT ALL ON ALL TABLES IN SCHEMA anon TO %I', privileged_role_name);
|
||||
+
|
||||
+ IF current_setting('server_version_num')::int >= 150000 THEN
|
||||
+ EXECUTE format('GRANT SET ON PARAMETER anon.transparent_dynamic_masking TO %I', privileged_role_name);
|
||||
+ END IF;
|
||||
+END $$;
|
||||
diff --git a/sql/init.sql b/sql/init.sql
|
||||
index 7da6553..9b6164b 100644
|
||||
|
||||
@@ -21,13 +21,21 @@ index 3235cc8..6b892bc 100644
|
||||
include Makefile.global
|
||||
|
||||
diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql
|
||||
index d777d76..af60106 100644
|
||||
index d777d76..3b54396 100644
|
||||
--- a/sql/pg_duckdb--0.2.0--0.3.0.sql
|
||||
+++ b/sql/pg_duckdb--0.2.0--0.3.0.sql
|
||||
@@ -1056,3 +1056,6 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC;
|
||||
@@ -1056,3 +1056,14 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC;
|
||||
GRANT ALL ON FUNCTION duckdb.cache_info() TO PUBLIC;
|
||||
GRANT ALL ON FUNCTION duckdb.cache_delete(TEXT) TO PUBLIC;
|
||||
GRANT ALL ON PROCEDURE duckdb.recycle_ddb() TO PUBLIC;
|
||||
+GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser;
|
||||
+GRANT ALL ON TABLE duckdb.extensions TO neon_superuser;
|
||||
+GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO neon_superuser;
|
||||
+
|
||||
+DO $$
|
||||
+DECLARE
|
||||
+ privileged_role_name text;
|
||||
+BEGIN
|
||||
+ privileged_role_name := current_setting('neon.privileged_role_name');
|
||||
+
|
||||
+ EXECUTE format('GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO %I', privileged_role_name);
|
||||
+ EXECUTE format('GRANT ALL ON TABLE duckdb.extensions TO %I', privileged_role_name);
|
||||
+ EXECUTE format('GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO %I', privileged_role_name);
|
||||
+END $$;
|
||||
|
||||
34
compute/patches/pg_stat_statements_pg14-16.patch
Normal file
34
compute/patches/pg_stat_statements_pg14-16.patch
Normal file
@@ -0,0 +1,34 @@
|
||||
diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
|
||||
index 58cdf600fce..8be57a996f6 100644
|
||||
--- a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
|
||||
+++ b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
|
||||
@@ -46,3 +46,12 @@ GRANT SELECT ON pg_stat_statements TO PUBLIC;
|
||||
|
||||
-- Don't want this to be available to non-superusers.
|
||||
REVOKE ALL ON FUNCTION pg_stat_statements_reset() FROM PUBLIC;
|
||||
+
|
||||
+DO $$
|
||||
+DECLARE
|
||||
+ privileged_role_name text;
|
||||
+BEGIN
|
||||
+ privileged_role_name := current_setting('neon.privileged_role_name');
|
||||
+
|
||||
+ EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO %I', privileged_role_name);
|
||||
+END $$;
|
||||
diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
|
||||
index 6fc3fed4c93..256345a8f79 100644
|
||||
--- a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
|
||||
+++ b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
|
||||
@@ -20,3 +20,12 @@ LANGUAGE C STRICT PARALLEL SAFE;
|
||||
|
||||
-- Don't want this to be available to non-superusers.
|
||||
REVOKE ALL ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) FROM PUBLIC;
|
||||
+
|
||||
+DO $$
|
||||
+DECLARE
|
||||
+ privileged_role_name text;
|
||||
+BEGIN
|
||||
+ privileged_role_name := current_setting('neon.privileged_role_name');
|
||||
+
|
||||
+ EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO %I', privileged_role_name);
|
||||
+END $$;
|
||||
52
compute/patches/pg_stat_statements_pg17.patch
Normal file
52
compute/patches/pg_stat_statements_pg17.patch
Normal file
@@ -0,0 +1,52 @@
|
||||
diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql b/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql
|
||||
index 0bb2c397711..32764db1d8b 100644
|
||||
--- a/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql
|
||||
+++ b/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql
|
||||
@@ -80,3 +80,12 @@ LANGUAGE C STRICT PARALLEL SAFE;
|
||||
|
||||
-- Don't want this to be available to non-superusers.
|
||||
REVOKE ALL ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) FROM PUBLIC;
|
||||
+
|
||||
+DO $$
|
||||
+DECLARE
|
||||
+ privileged_role_name text;
|
||||
+BEGIN
|
||||
+ privileged_role_name := current_setting('neon.privileged_role_name');
|
||||
+
|
||||
+ EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO %I', privileged_role_name);
|
||||
+END $$;
|
||||
\ No newline at end of file
|
||||
diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
|
||||
index 58cdf600fce..8be57a996f6 100644
|
||||
--- a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
|
||||
+++ b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
|
||||
@@ -46,3 +46,12 @@ GRANT SELECT ON pg_stat_statements TO PUBLIC;
|
||||
|
||||
-- Don't want this to be available to non-superusers.
|
||||
REVOKE ALL ON FUNCTION pg_stat_statements_reset() FROM PUBLIC;
|
||||
+
|
||||
+DO $$
|
||||
+DECLARE
|
||||
+ privileged_role_name text;
|
||||
+BEGIN
|
||||
+ privileged_role_name := current_setting('neon.privileged_role_name');
|
||||
+
|
||||
+ EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO %I', privileged_role_name);
|
||||
+END $$;
|
||||
diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
|
||||
index 6fc3fed4c93..256345a8f79 100644
|
||||
--- a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
|
||||
+++ b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
|
||||
@@ -20,3 +20,12 @@ LANGUAGE C STRICT PARALLEL SAFE;
|
||||
|
||||
-- Don't want this to be available to non-superusers.
|
||||
REVOKE ALL ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) FROM PUBLIC;
|
||||
+
|
||||
+DO $$
|
||||
+DECLARE
|
||||
+ privileged_role_name text;
|
||||
+BEGIN
|
||||
+ privileged_role_name := current_setting('neon.privileged_role_name');
|
||||
+
|
||||
+ EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO %I', privileged_role_name);
|
||||
+END $$;
|
||||
17
compute/patches/postgres_fdw.patch
Normal file
17
compute/patches/postgres_fdw.patch
Normal file
@@ -0,0 +1,17 @@
|
||||
diff --git a/contrib/postgres_fdw/postgres_fdw--1.0.sql b/contrib/postgres_fdw/postgres_fdw--1.0.sql
|
||||
index a0f0fc1bf45..ee077f2eea6 100644
|
||||
--- a/contrib/postgres_fdw/postgres_fdw--1.0.sql
|
||||
+++ b/contrib/postgres_fdw/postgres_fdw--1.0.sql
|
||||
@@ -16,3 +16,12 @@ LANGUAGE C STRICT;
|
||||
CREATE FOREIGN DATA WRAPPER postgres_fdw
|
||||
HANDLER postgres_fdw_handler
|
||||
VALIDATOR postgres_fdw_validator;
|
||||
+
|
||||
+DO $$
|
||||
+DECLARE
|
||||
+ privileged_role_name text;
|
||||
+BEGIN
|
||||
+ privileged_role_name := current_setting('neon.privileged_role_name');
|
||||
+
|
||||
+ EXECUTE format('GRANT USAGE ON FOREIGN DATA WRAPPER postgres_fdw TO %I', privileged_role_name);
|
||||
+END $$;
|
||||
@@ -7,7 +7,7 @@ license.workspace = true
|
||||
[features]
|
||||
default = []
|
||||
# Enables test specific features.
|
||||
testing = ["neon_failpoint/testing"]
|
||||
testing = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
async-compression.workspace = true
|
||||
@@ -23,7 +23,7 @@ camino.workspace = true
|
||||
chrono.workspace = true
|
||||
cfg-if.workspace = true
|
||||
clap.workspace = true
|
||||
neon_failpoint.workspace = true
|
||||
fail.workspace = true
|
||||
flate2.workspace = true
|
||||
futures.workspace = true
|
||||
http.workspace = true
|
||||
|
||||
@@ -87,6 +87,14 @@ struct Cli {
|
||||
#[arg(short = 'C', long, value_name = "DATABASE_URL")]
|
||||
pub connstr: String,
|
||||
|
||||
#[arg(
|
||||
long,
|
||||
default_value = "neon_superuser",
|
||||
value_name = "PRIVILEGED_ROLE_NAME",
|
||||
value_parser = Self::parse_privileged_role_name
|
||||
)]
|
||||
pub privileged_role_name: String,
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
#[arg(long, default_value = "neon-postgres")]
|
||||
pub cgroup: String,
|
||||
@@ -149,12 +157,27 @@ impl Cli {
|
||||
|
||||
Ok(url)
|
||||
}
|
||||
|
||||
/// For simplicity, we do not escape `privileged_role_name` anywhere in the code.
|
||||
/// Since it's a system role, which we fully control, that's fine. Still, let's
|
||||
/// validate it to avoid any surprises.
|
||||
fn parse_privileged_role_name(value: &str) -> Result<String> {
|
||||
use regex::Regex;
|
||||
|
||||
let pattern = Regex::new(r"^[a-z_]+$").unwrap();
|
||||
|
||||
if !pattern.is_match(value) {
|
||||
bail!("--privileged-role-name can only contain lowercase letters and underscores")
|
||||
}
|
||||
|
||||
Ok(value.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
failpoint_support::init().unwrap();
|
||||
let scenario = failpoint_support::init();
|
||||
|
||||
// For historical reasons, the main thread that processes the config and launches postgres
|
||||
// is synchronous, but we always have this tokio runtime available and we "enter" it so
|
||||
@@ -178,6 +201,7 @@ fn main() -> Result<()> {
|
||||
ComputeNodeParams {
|
||||
compute_id: cli.compute_id,
|
||||
connstr,
|
||||
privileged_role_name: cli.privileged_role_name.clone(),
|
||||
pgdata: cli.pgdata.clone(),
|
||||
pgbin: cli.pgbin.clone(),
|
||||
pgversion: get_pg_version_string(&cli.pgbin),
|
||||
@@ -201,6 +225,8 @@ fn main() -> Result<()> {
|
||||
|
||||
let exit_code = compute_node.run()?;
|
||||
|
||||
scenario.teardown();
|
||||
|
||||
deinit_and_exit(exit_code);
|
||||
}
|
||||
|
||||
@@ -325,4 +351,49 @@ mod test {
|
||||
])
|
||||
.expect_err("URL parameters are not allowed");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn verify_privileged_role_name() {
|
||||
// Valid name
|
||||
let cli = Cli::parse_from([
|
||||
"compute_ctl",
|
||||
"--pgdata=test",
|
||||
"--connstr=test",
|
||||
"--compute-id=test",
|
||||
"--privileged-role-name",
|
||||
"my_superuser",
|
||||
]);
|
||||
assert_eq!(cli.privileged_role_name, "my_superuser");
|
||||
|
||||
// Invalid names
|
||||
Cli::try_parse_from([
|
||||
"compute_ctl",
|
||||
"--pgdata=test",
|
||||
"--connstr=test",
|
||||
"--compute-id=test",
|
||||
"--privileged-role-name",
|
||||
"NeonSuperuser",
|
||||
])
|
||||
.expect_err("uppercase letters are not allowed");
|
||||
|
||||
Cli::try_parse_from([
|
||||
"compute_ctl",
|
||||
"--pgdata=test",
|
||||
"--connstr=test",
|
||||
"--compute-id=test",
|
||||
"--privileged-role-name",
|
||||
"$'neon_superuser",
|
||||
])
|
||||
.expect_err("special characters are not allowed");
|
||||
|
||||
Cli::try_parse_from([
|
||||
"compute_ctl",
|
||||
"--pgdata=test",
|
||||
"--connstr=test",
|
||||
"--compute-id=test",
|
||||
"--privileged-role-name",
|
||||
"",
|
||||
])
|
||||
.expect_err("empty name is not allowed");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -74,12 +74,20 @@ const DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL: u64 = 3600;
|
||||
|
||||
/// Static configuration params that don't change after startup. These mostly
|
||||
/// come from the CLI args, or are derived from them.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ComputeNodeParams {
|
||||
/// The ID of the compute
|
||||
pub compute_id: String,
|
||||
// Url type maintains proper escaping
|
||||
|
||||
/// Url type maintains proper escaping
|
||||
pub connstr: url::Url,
|
||||
|
||||
/// The name of the 'weak' superuser role, which we give to the users.
|
||||
/// It follows the allow list approach, i.e., we take a standard role
|
||||
/// and grant it extra permissions with explicit GRANTs here and there,
|
||||
/// and core patches.
|
||||
pub privileged_role_name: String,
|
||||
|
||||
pub resize_swap_on_bind: bool,
|
||||
pub set_disk_quota_for_fs: Option<String>,
|
||||
|
||||
@@ -1040,6 +1048,8 @@ impl ComputeNode {
|
||||
PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
|
||||
};
|
||||
|
||||
self.fix_zenith_signal_neon_signal()?;
|
||||
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.pageserver_connect_micros =
|
||||
connected.duration_since(started).as_micros() as u64;
|
||||
@@ -1049,6 +1059,27 @@ impl ComputeNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Copy the Zenith signal file to the Neon signal file location.
|
||||
/// This makes Compute compatible with older PageServers that don't yet
|
||||
/// know about the Zenith->Neon rename.
|
||||
fn fix_zenith_signal_neon_signal(&self) -> Result<()> {
|
||||
let datadir = Path::new(&self.params.pgdata);
|
||||
|
||||
let neonsig = datadir.join("neon.signal");
|
||||
|
||||
if neonsig.is_file() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let zenithsig = datadir.join("zenith.signal");
|
||||
|
||||
if zenithsig.is_file() {
|
||||
fs::copy(zenithsig, neonsig)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
|
||||
/// the connection was established, and the (compressed) size of the basebackup.
|
||||
fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
|
||||
@@ -1263,9 +1294,7 @@ impl ComputeNode {
|
||||
|
||||
// In case of error, log and fail the check, but don't crash.
|
||||
// We're playing it safe because these errors could be transient
|
||||
// and we don't yet retry. Also being careful here allows us to
|
||||
// be backwards compatible with safekeepers that don't have the
|
||||
// TIMELINE_STATUS API yet.
|
||||
// and we don't yet retry.
|
||||
if responses.len() < quorum {
|
||||
error!(
|
||||
"failed sync safekeepers check {:?} {:?} {:?}",
|
||||
@@ -1368,6 +1397,7 @@ impl ComputeNode {
|
||||
self.create_pgdata()?;
|
||||
config::write_postgres_conf(
|
||||
pgdata_path,
|
||||
&self.params,
|
||||
&pspec.spec,
|
||||
self.params.internal_http_port,
|
||||
tls_config,
|
||||
@@ -1716,6 +1746,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
// Run migrations separately to not hold up cold starts
|
||||
let params = self.params.clone();
|
||||
tokio::spawn(async move {
|
||||
let mut conf = conf.as_ref().clone();
|
||||
conf.application_name("compute_ctl:migrations");
|
||||
@@ -1727,7 +1758,7 @@ impl ComputeNode {
|
||||
eprintln!("connection error: {e}");
|
||||
}
|
||||
});
|
||||
if let Err(e) = handle_migrations(&mut client).await {
|
||||
if let Err(e) = handle_migrations(params, &mut client).await {
|
||||
error!("Failed to run migrations: {}", e);
|
||||
}
|
||||
}
|
||||
@@ -1806,6 +1837,7 @@ impl ComputeNode {
|
||||
let pgdata_path = Path::new(&self.params.pgdata);
|
||||
config::write_postgres_conf(
|
||||
pgdata_path,
|
||||
&self.params,
|
||||
&spec,
|
||||
self.params.internal_http_port,
|
||||
tls_config,
|
||||
@@ -2418,14 +2450,31 @@ LIMIT 100",
|
||||
pub fn spawn_lfc_offload_task(self: &Arc<Self>, interval: Duration) {
|
||||
self.terminate_lfc_offload_task();
|
||||
let secs = interval.as_secs();
|
||||
info!("spawning lfc offload worker with {secs}s interval");
|
||||
let this = self.clone();
|
||||
|
||||
info!("spawning LFC offload worker with {secs}s interval");
|
||||
let handle = spawn(async move {
|
||||
let mut interval = time::interval(interval);
|
||||
interval.tick().await; // returns immediately
|
||||
loop {
|
||||
interval.tick().await;
|
||||
this.offload_lfc_async().await;
|
||||
|
||||
let prewarm_state = this.state.lock().unwrap().lfc_prewarm_state.clone();
|
||||
// Do not offload LFC state if we are currently prewarming or any issue occurred.
|
||||
// If we did that, we might overwrite the LFC state in endpoint storage with some
|
||||
// incomplete state. Imagine a situation:
|
||||
// 1. Endpoint started with `autoprewarm: true`
|
||||
// 2. While prewarming is not completed, we upload the new incomplete state
|
||||
// 3. Compute gets interrupted and restarts
|
||||
// 4. We start again and try to prewarm with the state from 2. instead of the previous complete state
|
||||
if matches!(
|
||||
prewarm_state,
|
||||
LfcPrewarmState::Completed
|
||||
| LfcPrewarmState::NotPrewarmed
|
||||
| LfcPrewarmState::Skipped
|
||||
) {
|
||||
this.offload_lfc_async().await;
|
||||
}
|
||||
}
|
||||
});
|
||||
*self.lfc_offload_task.lock().unwrap() = Some(handle);
|
||||
@@ -2464,7 +2513,7 @@ pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> {
|
||||
serde_json::to_string(&extensions).expect("failed to serialize extensions list")
|
||||
);
|
||||
}
|
||||
Err(err) => error!("could not get installed extensions: {err:?}"),
|
||||
Err(err) => error!("could not get installed extensions: {err}"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -89,7 +89,7 @@ impl ComputeNode {
|
||||
self.state.lock().unwrap().lfc_offload_state.clone()
|
||||
}
|
||||
|
||||
/// If there is a prewarm request ongoing, return false, true otherwise
|
||||
/// If there is a prewarm request ongoing, return `false`, `true` otherwise.
|
||||
pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
|
||||
{
|
||||
let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
|
||||
@@ -101,15 +101,25 @@ impl ComputeNode {
|
||||
|
||||
let cloned = self.clone();
|
||||
spawn(async move {
|
||||
let Err(err) = cloned.prewarm_impl(from_endpoint).await else {
|
||||
cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed;
|
||||
return;
|
||||
};
|
||||
crate::metrics::LFC_PREWARM_ERRORS.inc();
|
||||
error!(%err, "prewarming lfc");
|
||||
cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed {
|
||||
error: err.to_string(),
|
||||
let state = match cloned.prewarm_impl(from_endpoint).await {
|
||||
Ok(true) => LfcPrewarmState::Completed,
|
||||
Ok(false) => {
|
||||
info!(
|
||||
"skipping LFC prewarm because LFC state is not found in endpoint storage"
|
||||
);
|
||||
LfcPrewarmState::Skipped
|
||||
}
|
||||
Err(err) => {
|
||||
crate::metrics::LFC_PREWARM_ERRORS.inc();
|
||||
error!(%err, "could not prewarm LFC");
|
||||
|
||||
LfcPrewarmState::Failed {
|
||||
error: err.to_string(),
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
cloned.state.lock().unwrap().lfc_prewarm_state = state;
|
||||
});
|
||||
true
|
||||
}
|
||||
@@ -120,15 +130,21 @@ impl ComputeNode {
|
||||
EndpointStoragePair::from_spec_and_endpoint(state.pspec.as_ref().unwrap(), from_endpoint)
|
||||
}
|
||||
|
||||
async fn prewarm_impl(&self, from_endpoint: Option<String>) -> Result<()> {
|
||||
/// Request LFC state from endpoint storage and load corresponding pages into Postgres.
|
||||
/// Returns a result with `false` if the LFC state is not found in endpoint storage.
|
||||
async fn prewarm_impl(&self, from_endpoint: Option<String>) -> Result<bool> {
|
||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?;
|
||||
info!(%url, "requesting LFC state from endpoint storage");
|
||||
|
||||
info!(%url, "requesting LFC state from endpoint storage");
|
||||
let request = Client::new().get(&url).bearer_auth(token);
|
||||
let res = request.send().await.context("querying endpoint storage")?;
|
||||
let status = res.status();
|
||||
if status != StatusCode::OK {
|
||||
bail!("{status} querying endpoint storage")
|
||||
match status {
|
||||
StatusCode::OK => (),
|
||||
StatusCode::NOT_FOUND => {
|
||||
return Ok(false);
|
||||
}
|
||||
_ => bail!("{status} querying endpoint storage"),
|
||||
}
|
||||
|
||||
let mut uncompressed = Vec::new();
|
||||
@@ -141,7 +157,8 @@ impl ComputeNode {
|
||||
.await
|
||||
.context("decoding LFC state")?;
|
||||
let uncompressed_len = uncompressed.len();
|
||||
info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into postgres");
|
||||
|
||||
info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into Postgres");
|
||||
|
||||
ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
||||
.await
|
||||
@@ -149,7 +166,9 @@ impl ComputeNode {
|
||||
.query_one("select neon.prewarm_local_cache($1)", &[&uncompressed])
|
||||
.await
|
||||
.context("loading LFC state into postgres")
|
||||
.map(|_| ())
|
||||
.map(|_| ())?;
|
||||
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// If there is an offload request ongoing, return `false`, `true` otherwise.
|
||||
@@ -177,12 +196,14 @@ impl ComputeNode {
|
||||
|
||||
async fn offload_lfc_with_state_update(&self) {
|
||||
crate::metrics::LFC_OFFLOADS.inc();
|
||||
|
||||
let Err(err) = self.offload_lfc_impl().await else {
|
||||
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
|
||||
return;
|
||||
};
|
||||
|
||||
crate::metrics::LFC_OFFLOAD_ERRORS.inc();
|
||||
error!(%err, "offloading lfc");
|
||||
error!(%err, "could not offload LFC state to endpoint storage");
|
||||
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
|
||||
error: err.to_string(),
|
||||
};
|
||||
@@ -190,7 +211,7 @@ impl ComputeNode {
|
||||
|
||||
async fn offload_lfc_impl(&self) -> Result<()> {
|
||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
|
||||
info!(%url, "requesting LFC state from postgres");
|
||||
info!(%url, "requesting LFC state from Postgres");
|
||||
|
||||
let mut compressed = Vec::new();
|
||||
ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
||||
@@ -205,13 +226,17 @@ impl ComputeNode {
|
||||
.read_to_end(&mut compressed)
|
||||
.await
|
||||
.context("compressing LFC state")?;
|
||||
|
||||
let compressed_len = compressed.len();
|
||||
info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage");
|
||||
|
||||
let request = Client::new().put(url).bearer_auth(token).body(compressed);
|
||||
match request.send().await {
|
||||
Ok(res) if res.status() == StatusCode::OK => Ok(()),
|
||||
Ok(res) => bail!("Error writing to endpoint storage: {}", res.status()),
|
||||
Ok(res) => bail!(
|
||||
"Request to endpoint storage failed with status: {}",
|
||||
res.status()
|
||||
),
|
||||
Err(err) => Err(err).context("writing to endpoint storage"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ use std::path::Path;
|
||||
use compute_api::responses::TlsConfig;
|
||||
use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
|
||||
|
||||
use crate::compute::ComputeNodeParams;
|
||||
use crate::pg_helpers::{
|
||||
GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
|
||||
};
|
||||
@@ -41,6 +42,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
|
||||
/// Create or completely rewrite configuration file specified by `path`
|
||||
pub fn write_postgres_conf(
|
||||
pgdata_path: &Path,
|
||||
params: &ComputeNodeParams,
|
||||
spec: &ComputeSpec,
|
||||
extension_server_port: u16,
|
||||
tls_config: &Option<TlsConfig>,
|
||||
@@ -54,14 +56,15 @@ pub fn write_postgres_conf(
|
||||
writeln!(file, "{conf}")?;
|
||||
}
|
||||
|
||||
// Stripe size GUC should be defined prior to connection string
|
||||
if let Some(stripe_size) = spec.shard_stripe_size {
|
||||
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
||||
}
|
||||
// Add options for connecting to storage
|
||||
writeln!(file, "# Neon storage settings")?;
|
||||
if let Some(s) = &spec.pageserver_connstring {
|
||||
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
|
||||
}
|
||||
if let Some(stripe_size) = spec.shard_stripe_size {
|
||||
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
||||
}
|
||||
if !spec.safekeeper_connstrings.is_empty() {
|
||||
let mut neon_safekeepers_value = String::new();
|
||||
tracing::info!(
|
||||
@@ -161,6 +164,12 @@ pub fn write_postgres_conf(
|
||||
}
|
||||
}
|
||||
|
||||
writeln!(
|
||||
file,
|
||||
"neon.privileged_role_name={}",
|
||||
escape_conf_value(params.privileged_role_name.as_str())
|
||||
)?;
|
||||
|
||||
// If there are any extra options in the 'settings' field, append those
|
||||
if spec.cluster.settings.is_some() {
|
||||
writeln!(file, "# Managed by compute_ctl: begin")?;
|
||||
|
||||
@@ -613,11 +613,11 @@ components:
|
||||
- skipped
|
||||
properties:
|
||||
status:
|
||||
description: Lfc prewarm status
|
||||
enum: [not_prewarmed, prewarming, completed, failed]
|
||||
description: LFC prewarm status
|
||||
enum: [not_prewarmed, prewarming, completed, failed, skipped]
|
||||
type: string
|
||||
error:
|
||||
description: Lfc prewarm error, if any
|
||||
description: LFC prewarm error, if any
|
||||
type: string
|
||||
total:
|
||||
description: Total pages processed
|
||||
@@ -635,11 +635,11 @@ components:
|
||||
- status
|
||||
properties:
|
||||
status:
|
||||
description: Lfc offload status
|
||||
description: LFC offload status
|
||||
enum: [not_offloaded, offloading, completed, failed]
|
||||
type: string
|
||||
error:
|
||||
description: Lfc offload error, if any
|
||||
description: LFC offload error, if any
|
||||
type: string
|
||||
|
||||
PromoteState:
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use http::StatusCode;
|
||||
use neon_failpoint::{configure_failpoint, configure_failpoint_with_context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use tracing::info;
|
||||
use utils::failpoint_support::apply_failpoint;
|
||||
|
||||
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
|
||||
|
||||
@@ -12,16 +11,10 @@ pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
|
||||
pub struct FailpointConfig {
|
||||
/// Name of the fail point
|
||||
pub name: String,
|
||||
/// List of actions to take, using the format described in neon_failpoint
|
||||
/// List of actions to take, using the format described in `fail::cfg`
|
||||
///
|
||||
/// We support actions: "pause", "sleep(N)", "return", "return(value)", "exit", "off", "panic(message)"
|
||||
/// Plus probability-based actions: "N%return(value)", "N%M*return(value)", "N%action", "N%M*action"
|
||||
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
|
||||
pub actions: String,
|
||||
/// Optional context matching rules for conditional failpoints
|
||||
/// Each key-value pair specifies a context key and a regex pattern to match against
|
||||
/// All context matchers must match for the failpoint to trigger
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub context_matchers: Option<HashMap<String, String>>,
|
||||
}
|
||||
|
||||
use crate::http::JsonResponse;
|
||||
@@ -31,7 +24,7 @@ use crate::http::extract::Json;
|
||||
pub(in crate::http) async fn configure_failpoints(
|
||||
failpoints: Json<ConfigureFailpointsRequest>,
|
||||
) -> Response {
|
||||
if !neon_failpoint::has_failpoints() {
|
||||
if !fail::has_failpoints() {
|
||||
return JsonResponse::error(
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
"Cannot manage failpoints because neon was compiled without failpoints support",
|
||||
@@ -39,21 +32,16 @@ pub(in crate::http) async fn configure_failpoints(
|
||||
}
|
||||
|
||||
for fp in &*failpoints {
|
||||
info!(
|
||||
"cfg failpoint: {} {} (context: {:?})",
|
||||
fp.name, fp.actions, fp.context_matchers
|
||||
);
|
||||
info!("cfg failpoint: {} {}", fp.name, fp.actions);
|
||||
|
||||
let cfg_result = if let Some(context_matchers) = fp.context_matchers.clone() {
|
||||
configure_failpoint_with_context(&fp.name, &fp.actions, context_matchers)
|
||||
} else {
|
||||
configure_failpoint(&fp.name, &fp.actions)
|
||||
};
|
||||
// We recognize one extra "action" that's not natively recognized
|
||||
// by the failpoints crate: exit, to immediately kill the process
|
||||
let cfg_result = apply_failpoint(&fp.name, &fp.actions);
|
||||
|
||||
if let Err(e) = cfg_result {
|
||||
return JsonResponse::error(
|
||||
StatusCode::BAD_REQUEST,
|
||||
format!("failed to configure failpoint '{}': {e}", fp.name),
|
||||
format!("failed to configure failpoints: {e}"),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::collections::HashMap;
|
||||
|
||||
use anyhow::Result;
|
||||
use compute_api::responses::{InstalledExtension, InstalledExtensions};
|
||||
use tokio_postgres::error::Error as PostgresError;
|
||||
use tokio_postgres::{Client, Config, NoTls};
|
||||
|
||||
use crate::metrics::INSTALLED_EXTENSIONS;
|
||||
@@ -10,7 +11,7 @@ use crate::metrics::INSTALLED_EXTENSIONS;
|
||||
/// and to make database listing query here more explicit.
|
||||
///
|
||||
/// Limit the number of databases to 500 to avoid excessive load.
|
||||
async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
|
||||
async fn list_dbs(client: &mut Client) -> Result<Vec<String>, PostgresError> {
|
||||
// `pg_database.datconnlimit = -2` means that the database is in the
|
||||
// invalid state
|
||||
let databases = client
|
||||
@@ -37,7 +38,9 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
|
||||
/// Same extension can be installed in multiple databases with different versions,
|
||||
/// so we report a separate metric (number of databases where it is installed)
|
||||
/// for each extension version.
|
||||
pub async fn get_installed_extensions(mut conf: Config) -> Result<InstalledExtensions> {
|
||||
pub async fn get_installed_extensions(
|
||||
mut conf: Config,
|
||||
) -> Result<InstalledExtensions, PostgresError> {
|
||||
conf.application_name("compute_ctl:get_installed_extensions");
|
||||
let databases: Vec<String> = {
|
||||
let (mut client, connection) = conf.connect(NoTls).await?;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use anyhow::{Context, Result};
|
||||
use neon_failpoint::fail_point;
|
||||
use fail::fail_point;
|
||||
use tokio_postgres::{Client, Transaction};
|
||||
use tracing::{error, info};
|
||||
|
||||
@@ -40,14 +40,13 @@ impl<'m> MigrationRunner<'m> {
|
||||
// middle of applying a series of migrations fails in an expected
|
||||
// manner
|
||||
if cfg!(feature = "testing") {
|
||||
let fail = async {
|
||||
fail_point!("compute-migration", |fail_migration_id: Option<String>| {
|
||||
let fail = (|| {
|
||||
fail_point!("compute-migration", |fail_migration_id| {
|
||||
migration_id == fail_migration_id.unwrap().parse::<i64>().unwrap()
|
||||
});
|
||||
|
||||
false
|
||||
}
|
||||
.await;
|
||||
})();
|
||||
|
||||
if fail {
|
||||
return Err(anyhow::anyhow!(format!(
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
ALTER ROLE {privileged_role_name} BYPASSRLS;
|
||||
@@ -1 +0,0 @@
|
||||
ALTER ROLE neon_superuser BYPASSRLS;
|
||||
@@ -15,7 +15,7 @@ DO $$
|
||||
DECLARE
|
||||
role_name text;
|
||||
BEGIN
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, '{privileged_role_name}', 'member')
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
|
||||
@@ -23,7 +23,7 @@ BEGIN
|
||||
|
||||
FOR role_name IN SELECT rolname FROM pg_roles
|
||||
WHERE
|
||||
NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
|
||||
NOT pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT starts_with(rolname, 'pg_')
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
DO $$
|
||||
BEGIN
|
||||
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
|
||||
EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
|
||||
EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}';
|
||||
END IF;
|
||||
END $$;
|
||||
@@ -1 +0,0 @@
|
||||
GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION;
|
||||
@@ -0,0 +1 @@
|
||||
GRANT pg_monitor TO {privileged_role_name} WITH ADMIN OPTION;
|
||||
@@ -1,4 +1,4 @@
|
||||
-- SKIP: Deemed insufficient for allowing relations created by extensions to be
|
||||
-- interacted with by neon_superuser without permission issues.
|
||||
-- interacted with by {privileged_role_name} without permission issues.
|
||||
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser;
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO {privileged_role_name};
|
||||
@@ -1,4 +1,4 @@
|
||||
-- SKIP: Deemed insufficient for allowing relations created by extensions to be
|
||||
-- interacted with by neon_superuser without permission issues.
|
||||
-- interacted with by {privileged_role_name} without permission issues.
|
||||
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser;
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO {privileged_role_name};
|
||||
@@ -1,3 +1,3 @@
|
||||
-- SKIP: Moved inline to the handle_grants() functions.
|
||||
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO {privileged_role_name} WITH GRANT OPTION;
|
||||
@@ -1,3 +1,3 @@
|
||||
-- SKIP: Moved inline to the handle_grants() functions.
|
||||
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO {privileged_role_name} WITH GRANT OPTION;
|
||||
@@ -1,7 +1,7 @@
|
||||
DO $$
|
||||
BEGIN
|
||||
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
|
||||
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser';
|
||||
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser';
|
||||
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}';
|
||||
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}';
|
||||
END IF;
|
||||
END $$;
|
||||
@@ -1 +0,0 @@
|
||||
GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser;
|
||||
@@ -0,0 +1 @@
|
||||
GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO {privileged_role_name};
|
||||
@@ -1 +0,0 @@
|
||||
GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION;
|
||||
@@ -0,0 +1 @@
|
||||
GRANT pg_signal_backend TO {privileged_role_name} WITH ADMIN OPTION;
|
||||
@@ -9,6 +9,7 @@ use reqwest::StatusCode;
|
||||
use tokio_postgres::Client;
|
||||
use tracing::{error, info, instrument};
|
||||
|
||||
use crate::compute::ComputeNodeParams;
|
||||
use crate::config;
|
||||
use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS};
|
||||
use crate::migration::MigrationRunner;
|
||||
@@ -169,7 +170,7 @@ pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub async fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
pub async fn handle_migrations(params: ComputeNodeParams, client: &mut Client) -> Result<()> {
|
||||
info!("handle migrations");
|
||||
|
||||
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
@@ -178,26 +179,59 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
|
||||
// Add new migrations in numerical order.
|
||||
let migrations = [
|
||||
include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"),
|
||||
include_str!("./migrations/0002-alter_roles.sql"),
|
||||
include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"),
|
||||
include_str!(
|
||||
"./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
|
||||
&format!(
|
||||
include_str!("./migrations/0001-add_bypass_rls_to_privileged_role.sql"),
|
||||
privileged_role_name = params.privileged_role_name
|
||||
),
|
||||
include_str!(
|
||||
"./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
|
||||
&format!(
|
||||
include_str!("./migrations/0002-alter_roles.sql"),
|
||||
privileged_role_name = params.privileged_role_name
|
||||
),
|
||||
&format!(
|
||||
include_str!("./migrations/0003-grant_pg_create_subscription_to_privileged_role.sql"),
|
||||
privileged_role_name = params.privileged_role_name
|
||||
),
|
||||
&format!(
|
||||
include_str!("./migrations/0004-grant_pg_monitor_to_privileged_role.sql"),
|
||||
privileged_role_name = params.privileged_role_name
|
||||
),
|
||||
&format!(
|
||||
include_str!("./migrations/0005-grant_all_on_tables_to_privileged_role.sql"),
|
||||
privileged_role_name = params.privileged_role_name
|
||||
),
|
||||
&format!(
|
||||
include_str!("./migrations/0006-grant_all_on_sequences_to_privileged_role.sql"),
|
||||
privileged_role_name = params.privileged_role_name
|
||||
),
|
||||
&format!(
|
||||
include_str!(
|
||||
"./migrations/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql"
|
||||
),
|
||||
privileged_role_name = params.privileged_role_name
|
||||
),
|
||||
&format!(
|
||||
include_str!(
|
||||
"./migrations/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql"
|
||||
),
|
||||
privileged_role_name = params.privileged_role_name
|
||||
),
|
||||
include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"),
|
||||
include_str!(
|
||||
"./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
|
||||
&format!(
|
||||
include_str!(
|
||||
"./migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql"
|
||||
),
|
||||
privileged_role_name = params.privileged_role_name
|
||||
),
|
||||
include_str!(
|
||||
"./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
|
||||
&format!(
|
||||
include_str!(
|
||||
"./migrations/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql"
|
||||
),
|
||||
privileged_role_name = params.privileged_role_name
|
||||
),
|
||||
&format!(
|
||||
include_str!("./migrations/0012-grant_pg_signal_backend_to_privileged_role.sql"),
|
||||
privileged_role_name = params.privileged_role_name
|
||||
),
|
||||
include_str!("./migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql"),
|
||||
];
|
||||
|
||||
MigrationRunner::new(client, &migrations)
|
||||
|
||||
@@ -13,14 +13,14 @@ use tokio_postgres::Client;
|
||||
use tokio_postgres::error::SqlState;
|
||||
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
|
||||
|
||||
use crate::compute::{ComputeNode, ComputeState};
|
||||
use crate::compute::{ComputeNode, ComputeNodeParams, ComputeState};
|
||||
use crate::pg_helpers::{
|
||||
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
|
||||
get_existing_roles_async,
|
||||
};
|
||||
use crate::spec_apply::ApplySpecPhase::{
|
||||
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser,
|
||||
CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon,
|
||||
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreatePgauditExtension,
|
||||
CreatePgauditlogtofileExtension, CreatePrivilegedRole, CreateSchemaNeon,
|
||||
DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
|
||||
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
|
||||
RunInEachDatabase,
|
||||
@@ -49,6 +49,7 @@ impl ComputeNode {
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
let client = Self::get_maintenance_client(&conf).await?;
|
||||
let spec = spec.clone();
|
||||
let params = Arc::new(self.params.clone());
|
||||
|
||||
let databases = get_existing_dbs_async(&client).await?;
|
||||
let roles = get_existing_roles_async(&client)
|
||||
@@ -157,6 +158,7 @@ impl ComputeNode {
|
||||
|
||||
let conf = Arc::new(conf);
|
||||
let fut = Self::apply_spec_sql_db(
|
||||
params.clone(),
|
||||
spec.clone(),
|
||||
conf,
|
||||
ctx.clone(),
|
||||
@@ -185,7 +187,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
for phase in [
|
||||
CreateNeonSuperuser,
|
||||
CreatePrivilegedRole,
|
||||
DropInvalidDatabases,
|
||||
RenameRoles,
|
||||
CreateAndAlterRoles,
|
||||
@@ -195,6 +197,7 @@ impl ComputeNode {
|
||||
] {
|
||||
info!("Applying phase {:?}", &phase);
|
||||
apply_operations(
|
||||
params.clone(),
|
||||
spec.clone(),
|
||||
ctx.clone(),
|
||||
jwks_roles.clone(),
|
||||
@@ -243,6 +246,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
let fut = Self::apply_spec_sql_db(
|
||||
params.clone(),
|
||||
spec.clone(),
|
||||
conf,
|
||||
ctx.clone(),
|
||||
@@ -293,6 +297,7 @@ impl ComputeNode {
|
||||
for phase in phases {
|
||||
debug!("Applying phase {:?}", &phase);
|
||||
apply_operations(
|
||||
params.clone(),
|
||||
spec.clone(),
|
||||
ctx.clone(),
|
||||
jwks_roles.clone(),
|
||||
@@ -313,7 +318,9 @@ impl ComputeNode {
|
||||
/// May opt to not connect to databases that don't have any scheduled
|
||||
/// operations. The function is concurrency-controlled with the provided
|
||||
/// semaphore. The caller has to make sure the semaphore isn't exhausted.
|
||||
#[allow(clippy::too_many_arguments)] // TODO: needs bigger refactoring
|
||||
async fn apply_spec_sql_db(
|
||||
params: Arc<ComputeNodeParams>,
|
||||
spec: Arc<ComputeSpec>,
|
||||
conf: Arc<tokio_postgres::Config>,
|
||||
ctx: Arc<tokio::sync::RwLock<MutableApplyContext>>,
|
||||
@@ -328,6 +335,7 @@ impl ComputeNode {
|
||||
|
||||
for subphase in subphases {
|
||||
apply_operations(
|
||||
params.clone(),
|
||||
spec.clone(),
|
||||
ctx.clone(),
|
||||
jwks_roles.clone(),
|
||||
@@ -467,7 +475,7 @@ pub enum PerDatabasePhase {
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum ApplySpecPhase {
|
||||
CreateNeonSuperuser,
|
||||
CreatePrivilegedRole,
|
||||
DropInvalidDatabases,
|
||||
RenameRoles,
|
||||
CreateAndAlterRoles,
|
||||
@@ -510,6 +518,7 @@ pub struct MutableApplyContext {
|
||||
/// - No timeouts have (yet) been implemented.
|
||||
/// - The caller is responsible for limiting and/or applying concurrency.
|
||||
pub async fn apply_operations<'a, Fut, F>(
|
||||
params: Arc<ComputeNodeParams>,
|
||||
spec: Arc<ComputeSpec>,
|
||||
ctx: Arc<RwLock<MutableApplyContext>>,
|
||||
jwks_roles: Arc<HashSet<String>>,
|
||||
@@ -527,7 +536,7 @@ where
|
||||
debug!("Processing phase {:?}", &apply_spec_phase);
|
||||
let ctx = ctx;
|
||||
|
||||
let mut ops = get_operations(&spec, &ctx, &jwks_roles, &apply_spec_phase)
|
||||
let mut ops = get_operations(¶ms, &spec, &ctx, &jwks_roles, &apply_spec_phase)
|
||||
.await?
|
||||
.peekable();
|
||||
|
||||
@@ -588,14 +597,18 @@ where
|
||||
/// sort/merge/batch execution, but for now this is a nice way to improve
|
||||
/// batching behavior of the commands.
|
||||
async fn get_operations<'a>(
|
||||
params: &'a ComputeNodeParams,
|
||||
spec: &'a ComputeSpec,
|
||||
ctx: &'a RwLock<MutableApplyContext>,
|
||||
jwks_roles: &'a HashSet<String>,
|
||||
apply_spec_phase: &'a ApplySpecPhase,
|
||||
) -> Result<Box<dyn Iterator<Item = Operation> + 'a + Send>> {
|
||||
match apply_spec_phase {
|
||||
ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation {
|
||||
query: include_str!("sql/create_neon_superuser.sql").to_string(),
|
||||
ApplySpecPhase::CreatePrivilegedRole => Ok(Box::new(once(Operation {
|
||||
query: format!(
|
||||
include_str!("sql/create_privileged_role.sql"),
|
||||
privileged_role_name = params.privileged_role_name
|
||||
),
|
||||
comment: None,
|
||||
}))),
|
||||
ApplySpecPhase::DropInvalidDatabases => {
|
||||
@@ -697,8 +710,9 @@ async fn get_operations<'a>(
|
||||
None => {
|
||||
let query = if !jwks_roles.contains(role.name.as_str()) {
|
||||
format!(
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser {}",
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE {} {}",
|
||||
role.name.pg_quote(),
|
||||
params.privileged_role_name,
|
||||
role.to_pg_options(),
|
||||
)
|
||||
} else {
|
||||
@@ -849,8 +863,9 @@ async fn get_operations<'a>(
|
||||
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database
|
||||
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
|
||||
query: format!(
|
||||
"GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
|
||||
db.name.pg_quote()
|
||||
"GRANT ALL PRIVILEGES ON DATABASE {} TO {}",
|
||||
db.name.pg_quote(),
|
||||
params.privileged_role_name
|
||||
),
|
||||
comment: None,
|
||||
},
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
|
||||
THEN
|
||||
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
8
compute_tools/src/sql/create_privileged_role.sql
Normal file
8
compute_tools/src/sql/create_privileged_role.sql
Normal file
@@ -0,0 +1,8 @@
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}')
|
||||
THEN
|
||||
CREATE ROLE {privileged_role_name} CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
@@ -8,10 +8,10 @@ code changes locally, but not suitable for running production systems.
|
||||
|
||||
## Example: Start with Postgres 16
|
||||
|
||||
To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands.
|
||||
To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 2 of the start-up commands.
|
||||
|
||||
```shell
|
||||
cargo neon init --pg-version 16
|
||||
cargo neon init
|
||||
cargo neon start
|
||||
cargo neon tenant create --set-default --pg-version 16
|
||||
cargo neon endpoint create main --pg-version 16
|
||||
|
||||
@@ -631,6 +631,10 @@ struct EndpointCreateCmdArgs {
|
||||
help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests."
|
||||
)]
|
||||
allow_multiple: bool,
|
||||
|
||||
/// Only allow changing it on creation
|
||||
#[clap(long, help = "Name of the privileged role for the endpoint")]
|
||||
privileged_role_name: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
@@ -1480,6 +1484,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
args.grpc,
|
||||
!args.update_catalog,
|
||||
false,
|
||||
args.privileged_role_name.clone(),
|
||||
)?;
|
||||
}
|
||||
EndpointCmd::Start(args) => {
|
||||
|
||||
@@ -36,7 +36,7 @@ impl StorageBroker {
|
||||
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
let broker = &self.env.broker;
|
||||
|
||||
print!("Starting neon broker at {}", broker.client_url());
|
||||
println!("Starting neon broker at {}", broker.client_url());
|
||||
|
||||
let mut args = Vec::new();
|
||||
|
||||
|
||||
@@ -32,7 +32,8 @@
|
||||
//! config.json - passed to `compute_ctl`
|
||||
//! pgdata/
|
||||
//! postgresql.conf - copy of postgresql.conf created by `compute_ctl`
|
||||
//! zenith.signal
|
||||
//! neon.signal
|
||||
//! zenith.signal - copy of neon.signal, for backward compatibility
|
||||
//! <other PostgreSQL files>
|
||||
//! ```
|
||||
//!
|
||||
@@ -98,6 +99,7 @@ pub struct EndpointConf {
|
||||
features: Vec<ComputeFeature>,
|
||||
cluster: Option<Cluster>,
|
||||
compute_ctl_config: ComputeCtlConfig,
|
||||
privileged_role_name: Option<String>,
|
||||
}
|
||||
|
||||
//
|
||||
@@ -198,6 +200,7 @@ impl ComputeControlPlane {
|
||||
grpc: bool,
|
||||
skip_pg_catalog_updates: bool,
|
||||
drop_subscriptions_before_start: bool,
|
||||
privileged_role_name: Option<String>,
|
||||
) -> Result<Arc<Endpoint>> {
|
||||
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
|
||||
let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1);
|
||||
@@ -235,6 +238,7 @@ impl ComputeControlPlane {
|
||||
features: vec![],
|
||||
cluster: None,
|
||||
compute_ctl_config: compute_ctl_config.clone(),
|
||||
privileged_role_name: privileged_role_name.clone(),
|
||||
});
|
||||
|
||||
ep.create_endpoint_dir()?;
|
||||
@@ -256,6 +260,7 @@ impl ComputeControlPlane {
|
||||
features: vec![],
|
||||
cluster: None,
|
||||
compute_ctl_config,
|
||||
privileged_role_name,
|
||||
})?,
|
||||
)?;
|
||||
std::fs::write(
|
||||
@@ -331,6 +336,9 @@ pub struct Endpoint {
|
||||
|
||||
/// The compute_ctl config for the endpoint's compute.
|
||||
compute_ctl_config: ComputeCtlConfig,
|
||||
|
||||
/// The name of the privileged role for the endpoint.
|
||||
privileged_role_name: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq)]
|
||||
@@ -431,6 +439,7 @@ impl Endpoint {
|
||||
features: conf.features,
|
||||
cluster: conf.cluster,
|
||||
compute_ctl_config: conf.compute_ctl_config,
|
||||
privileged_role_name: conf.privileged_role_name,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -463,7 +472,7 @@ impl Endpoint {
|
||||
conf.append("max_connections", "100");
|
||||
conf.append("wal_level", "logical");
|
||||
// wal_sender_timeout is the maximum time to wait for WAL replication.
|
||||
// It also defines how often the walreciever will send a feedback message to the wal sender.
|
||||
// It also defines how often the walreceiver will send a feedback message to the wal sender.
|
||||
conf.append("wal_sender_timeout", "5s");
|
||||
conf.append("listen_addresses", &self.pg_address.ip().to_string());
|
||||
conf.append("port", &self.pg_address.port().to_string());
|
||||
@@ -869,6 +878,10 @@ impl Endpoint {
|
||||
cmd.arg("--dev");
|
||||
}
|
||||
|
||||
if let Some(privileged_role_name) = self.privileged_role_name.clone() {
|
||||
cmd.args(["--privileged-role-name", &privileged_role_name]);
|
||||
}
|
||||
|
||||
let child = cmd.spawn()?;
|
||||
// set up a scopeguard to kill & wait for the child in case we panic or bail below
|
||||
let child = scopeguard::guard(child, |mut child| {
|
||||
|
||||
@@ -217,6 +217,9 @@ pub struct NeonStorageControllerConf {
|
||||
pub posthog_config: Option<PostHogConfig>,
|
||||
|
||||
pub kick_secondary_downloads: Option<bool>,
|
||||
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub shard_split_request_timeout: Option<Duration>,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerConf {
|
||||
@@ -250,6 +253,7 @@ impl Default for NeonStorageControllerConf {
|
||||
timeline_safekeeper_count: None,
|
||||
posthog_config: None,
|
||||
kick_secondary_downloads: None,
|
||||
shard_split_request_timeout: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -303,7 +303,7 @@ impl PageServerNode {
|
||||
async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
println!(
|
||||
"Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
|
||||
self.conf.id,
|
||||
self.pg_connection_config.raw_address(),
|
||||
|
||||
@@ -127,7 +127,7 @@ impl SafekeeperNode {
|
||||
extra_opts: &[String],
|
||||
retry_timeout: &Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
print!(
|
||||
println!(
|
||||
"Starting safekeeper at '{}' in '{}', retrying for {:?}",
|
||||
self.pg_connection_config.raw_address(),
|
||||
self.datadir_path().display(),
|
||||
|
||||
@@ -648,6 +648,13 @@ impl StorageController {
|
||||
args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
|
||||
}
|
||||
|
||||
if let Some(duration) = self.config.shard_split_request_timeout {
|
||||
args.push(format!(
|
||||
"--shard-split-request-timeout={}",
|
||||
humantime::Duration::from(duration)
|
||||
));
|
||||
}
|
||||
|
||||
let mut envs = vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
@@ -660,7 +667,7 @@ impl StorageController {
|
||||
));
|
||||
}
|
||||
|
||||
println!("Starting storage controller");
|
||||
println!("Starting storage controller at {scheme}://{host}:{listen_port}");
|
||||
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
|
||||
@@ -14,6 +14,7 @@ humantime.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
reqwest.workspace = true
|
||||
safekeeper_api.workspace=true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
storage_controller_client.workspace = true
|
||||
tokio.workspace = true
|
||||
|
||||
@@ -11,7 +11,7 @@ use pageserver_api::controller_api::{
|
||||
PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest,
|
||||
ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
|
||||
SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse, TimelineSafekeeperMigrateRequest,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig,
|
||||
@@ -21,6 +21,7 @@ use pageserver_api::models::{
|
||||
use pageserver_api::shard::{ShardStripeSize, TenantShardId};
|
||||
use pageserver_client::mgmt_api::{self};
|
||||
use reqwest::{Certificate, Method, StatusCode, Url};
|
||||
use safekeeper_api::models::TimelineLocateResponse;
|
||||
use storage_controller_client::control_api::Client;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
@@ -75,6 +76,12 @@ enum Command {
|
||||
NodeStartDelete {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
/// When `force` is true, skip waiting for shards to prewarm during migration.
|
||||
/// This can significantly speed up node deletion since prewarming all shards
|
||||
/// can take considerable time, but may result in slower initial access to
|
||||
/// migrated shards until they warm up naturally.
|
||||
#[arg(long)]
|
||||
force: bool,
|
||||
},
|
||||
/// Cancel deletion of the specified pageserver and wait for `timeout`
|
||||
/// for the operation to be canceled. May be retried.
|
||||
@@ -279,6 +286,23 @@ enum Command {
|
||||
#[arg(long)]
|
||||
concurrency: Option<usize>,
|
||||
},
|
||||
/// Locate safekeepers for a timeline from the storcon DB.
|
||||
TimelineLocate {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
timeline_id: TimelineId,
|
||||
},
|
||||
/// Migrate a timeline to a new set of safekeepers
|
||||
TimelineSafekeeperMigrate {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
timeline_id: TimelineId,
|
||||
/// Example: --new-sk-set 1,2,3
|
||||
#[arg(long, required = true, value_delimiter = ',')]
|
||||
new_sk_set: Vec<NodeId>,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -458,6 +482,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
listen_http_port,
|
||||
listen_https_port,
|
||||
availability_zone_id: AvailabilityZone(availability_zone_id),
|
||||
node_ip_addr: None,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
@@ -933,13 +958,14 @@ async fn main() -> anyhow::Result<()> {
|
||||
.dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
|
||||
.await?;
|
||||
}
|
||||
Command::NodeStartDelete { node_id } => {
|
||||
Command::NodeStartDelete { node_id, force } => {
|
||||
let query = if force {
|
||||
format!("control/v1/node/{node_id}/delete?force=true")
|
||||
} else {
|
||||
format!("control/v1/node/{node_id}/delete")
|
||||
};
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::PUT,
|
||||
format!("control/v1/node/{node_id}/delete"),
|
||||
None,
|
||||
)
|
||||
.dispatch::<(), ()>(Method::PUT, query, None)
|
||||
.await?;
|
||||
println!("Delete started for {node_id}");
|
||||
}
|
||||
@@ -1324,7 +1350,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
concurrency,
|
||||
} => {
|
||||
let mut path = format!(
|
||||
"/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
|
||||
"v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
|
||||
);
|
||||
|
||||
if let Some(c) = concurrency {
|
||||
@@ -1335,6 +1361,41 @@ async fn main() -> anyhow::Result<()> {
|
||||
.dispatch::<(), ()>(Method::POST, path, None)
|
||||
.await?;
|
||||
}
|
||||
Command::TimelineLocate {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} => {
|
||||
let path = format!("debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate");
|
||||
|
||||
let resp = storcon_client
|
||||
.dispatch::<(), TimelineLocateResponse>(Method::GET, path, None)
|
||||
.await?;
|
||||
|
||||
let sk_set = resp.sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
|
||||
let new_sk_set = resp
|
||||
.new_sk_set
|
||||
.as_ref()
|
||||
.map(|ids| ids.iter().map(|id| id.0 as i64).collect::<Vec<_>>());
|
||||
|
||||
println!("generation = {}", resp.generation);
|
||||
println!("sk_set = {sk_set:?}");
|
||||
println!("new_sk_set = {new_sk_set:?}");
|
||||
}
|
||||
Command::TimelineSafekeeperMigrate {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
new_sk_set,
|
||||
} => {
|
||||
let path = format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate");
|
||||
|
||||
storcon_client
|
||||
.dispatch::<_, ()>(
|
||||
Method::POST,
|
||||
path,
|
||||
Some(TimelineSafekeeperMigrateRequest { new_sk_set }),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -129,9 +129,10 @@ segment to bootstrap the WAL writing, but it doesn't contain the checkpoint reco
|
||||
changes in xlog.c, to allow starting the compute node without reading the last checkpoint record
|
||||
from WAL.
|
||||
|
||||
This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start
|
||||
at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last
|
||||
checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo.
|
||||
This includes code to read the `neon.signal` (also `zenith.signal`) file, which tells the startup
|
||||
code the LSN to start at. When the `neon.signal` file is present, the startup uses that LSN
|
||||
instead of the last checkpoint's LSN. The system is known to be consistent at that LSN, without
|
||||
any WAL redo.
|
||||
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
@@ -75,7 +75,7 @@ CLI examples:
|
||||
* AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"`
|
||||
|
||||
For Amazon AWS S3, a key id and secret access key can be found in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, or on the AWS Settings page for the relevant user. Also note that bucket names do not contain any protocol when used on AWS.
|
||||
For local S3 installations, refer to the their documentation for name format and credentials.
|
||||
For local S3 installations, refer to their documentation for name format and credentials.
|
||||
|
||||
Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets.
|
||||
Required sections are:
|
||||
|
||||
18
libs/alloc-metrics/Cargo.toml
Normal file
18
libs/alloc-metrics/Cargo.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
[package]
|
||||
name = "alloc-metrics"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
metrics.workspace = true
|
||||
measured.workspace = true
|
||||
thread_local.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
tikv-jemallocator.workspace = true
|
||||
|
||||
[[bench]]
|
||||
harness = false
|
||||
name = "alloc"
|
||||
110
libs/alloc-metrics/benches/alloc.rs
Normal file
110
libs/alloc-metrics/benches/alloc.rs
Normal file
@@ -0,0 +1,110 @@
|
||||
use std::alloc::{GlobalAlloc, Layout, System, handle_alloc_error};
|
||||
|
||||
use alloc_metrics::TrackedAllocator;
|
||||
use criterion::{
|
||||
AxisScale, BenchmarkGroup, BenchmarkId, Criterion, PlotConfiguration, measurement::Measurement,
|
||||
};
|
||||
use measured::FixedCardinalityLabel;
|
||||
use tikv_jemallocator::Jemalloc;
|
||||
|
||||
fn main() {
|
||||
let mut c = Criterion::default().configure_from_args();
|
||||
bench(&mut c);
|
||||
c.final_summary();
|
||||
}
|
||||
|
||||
#[rustfmt::skip]
|
||||
fn bench(c: &mut Criterion) {
|
||||
bench_alloc(c.benchmark_group("alloc/system"), &System, &ALLOC_SYSTEM);
|
||||
bench_alloc(c.benchmark_group("alloc/jemalloc"), &Jemalloc, &ALLOC_JEMALLOC);
|
||||
|
||||
bench_dealloc(c.benchmark_group("dealloc/system"), &System, &ALLOC_SYSTEM);
|
||||
bench_dealloc(c.benchmark_group("dealloc/jemalloc"), &Jemalloc, &ALLOC_JEMALLOC);
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
|
||||
#[label(singleton = "memory_context")]
|
||||
pub enum MemoryContext {
|
||||
Root,
|
||||
Test,
|
||||
}
|
||||
|
||||
static ALLOC_SYSTEM: TrackedAllocator<System, MemoryContext> =
|
||||
unsafe { TrackedAllocator::new(System, MemoryContext::Root) };
|
||||
static ALLOC_JEMALLOC: TrackedAllocator<Jemalloc, MemoryContext> =
|
||||
unsafe { TrackedAllocator::new(Jemalloc, MemoryContext::Root) };
|
||||
|
||||
const KB: u64 = 1024;
|
||||
const SIZES: [u64; 6] = [64, 256, KB, 4 * KB, 16 * KB, KB * KB];
|
||||
|
||||
fn bench_alloc<A: GlobalAlloc>(
|
||||
mut g: BenchmarkGroup<'_, impl Measurement>,
|
||||
alloc1: &'static A,
|
||||
alloc2: &'static TrackedAllocator<A, MemoryContext>,
|
||||
) {
|
||||
g.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
|
||||
for size in SIZES {
|
||||
let layout = Layout::from_size_align(size as usize, 8).unwrap();
|
||||
|
||||
g.throughput(criterion::Throughput::Bytes(size));
|
||||
g.bench_with_input(BenchmarkId::new("default", size), &layout, |b, &layout| {
|
||||
let bs = criterion::BatchSize::NumBatches(10 + size.ilog2() as u64);
|
||||
b.iter_batched(|| {}, |()| Alloc::new(alloc1, layout), bs);
|
||||
});
|
||||
g.bench_with_input(BenchmarkId::new("tracked", size), &layout, |b, &layout| {
|
||||
let _scope = alloc2.scope(MemoryContext::Test);
|
||||
|
||||
let bs = criterion::BatchSize::NumBatches(10 + size.ilog2() as u64);
|
||||
b.iter_batched(|| {}, |()| Alloc::new(alloc2, layout), bs);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
fn bench_dealloc<A: GlobalAlloc>(
|
||||
mut g: BenchmarkGroup<'_, impl Measurement>,
|
||||
alloc1: &'static A,
|
||||
alloc2: &'static TrackedAllocator<A, MemoryContext>,
|
||||
) {
|
||||
g.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
|
||||
for size in SIZES {
|
||||
let layout = Layout::from_size_align(size as usize, 8).unwrap();
|
||||
|
||||
g.throughput(criterion::Throughput::Bytes(size));
|
||||
g.bench_with_input(BenchmarkId::new("default", size), &layout, |b, &layout| {
|
||||
let bs = criterion::BatchSize::NumBatches(10 + size.ilog2() as u64);
|
||||
b.iter_batched(|| Alloc::new(alloc1, layout), drop, bs);
|
||||
});
|
||||
g.bench_with_input(BenchmarkId::new("tracked", size), &layout, |b, &layout| {
|
||||
let _scope = alloc2.scope(MemoryContext::Test);
|
||||
|
||||
let bs = criterion::BatchSize::NumBatches(10 + size.ilog2() as u64);
|
||||
b.iter_batched(|| Alloc::new(alloc2, layout), drop, bs);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Owning handle for one raw allocation obtained from a [`GlobalAlloc`].
///
/// The memory is returned to the same allocator, with the same layout, when
/// the handle is dropped.
struct Alloc<'a, A: GlobalAlloc> {
    /// Allocator the memory came from and will be returned to.
    alloc: &'a A,
    /// Start of the allocation; never null once constructed.
    ptr: *mut u8,
    /// Layout the allocation was requested with.
    layout: Layout,
}

impl<'a, A: GlobalAlloc> Alloc<'a, A> {
    /// Allocates `layout` from `alloc` and writes to the first byte.
    ///
    /// Allocation failure is routed to [`handle_alloc_error`], so this never
    /// returns a handle with a null pointer.
    fn new(alloc: &'a A, layout: Layout) -> Self {
        let ptr = unsafe { alloc.alloc(layout) };
        if ptr.is_null() {
            handle_alloc_error(layout);
        }

        // Touch the memory so that the page is actually made resident.
        unsafe { ptr.write(1) };

        Self { alloc, ptr, layout }
    }
}

impl<'a, A: GlobalAlloc> Drop for Alloc<'a, A> {
    /// Frees the allocation using the allocator and layout it was created with.
    fn drop(&mut self) {
        let Self { alloc, ptr, layout } = self;
        unsafe { alloc.dealloc(*ptr, *layout) };
    }
}
|
||||
48
libs/alloc-metrics/src/counters.rs
Normal file
48
libs/alloc-metrics/src/counters.rs
Normal file
@@ -0,0 +1,48 @@
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use measured::{
|
||||
FixedCardinalityLabel, LabelGroup, label::StaticLabelSet, metric::MetricFamilyEncoding,
|
||||
};
|
||||
use metrics::{CounterPairAssoc, Dec, Inc, MeasuredCounterPairState};
|
||||
|
||||
use crate::metric_vec::DenseMetricVec;
|
||||
|
||||
pub struct DenseCounterPairVec<
|
||||
A: CounterPairAssoc<LabelGroupSet = StaticLabelSet<L>>,
|
||||
L: FixedCardinalityLabel + LabelGroup,
|
||||
> {
|
||||
pub vec: DenseMetricVec<MeasuredCounterPairState, L>,
|
||||
pub _marker: PhantomData<A>,
|
||||
}
|
||||
|
||||
impl<A: CounterPairAssoc<LabelGroupSet = StaticLabelSet<L>>, L: FixedCardinalityLabel + LabelGroup>
|
||||
DenseCounterPairVec<A, L>
|
||||
{
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
vec: DenseMetricVec::new(),
|
||||
_marker: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T, A, L> ::measured::metric::group::MetricGroup<T> for DenseCounterPairVec<A, L>
|
||||
where
|
||||
T: ::measured::metric::group::Encoding,
|
||||
::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding<T>,
|
||||
A: CounterPairAssoc<LabelGroupSet = StaticLabelSet<L>>,
|
||||
L: FixedCardinalityLabel + LabelGroup,
|
||||
{
|
||||
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
|
||||
// write decrement first to avoid a race condition where inc - dec < 0
|
||||
T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?;
|
||||
self.vec
|
||||
.collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?;
|
||||
|
||||
T::write_help(enc, A::INC_NAME, A::INC_HELP)?;
|
||||
self.vec
|
||||
.collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
441
libs/alloc-metrics/src/lib.rs
Normal file
441
libs/alloc-metrics/src/lib.rs
Normal file
@@ -0,0 +1,441 @@
|
||||
//! Tagged allocator measurements.
|
||||
|
||||
mod counters;
|
||||
mod metric_vec;
|
||||
|
||||
use std::{
|
||||
alloc::{GlobalAlloc, Layout},
|
||||
cell::Cell,
|
||||
marker::PhantomData,
|
||||
sync::{
|
||||
OnceLock,
|
||||
atomic::{AtomicU64, Ordering::Relaxed},
|
||||
},
|
||||
};
|
||||
|
||||
use measured::{
|
||||
FixedCardinalityLabel, LabelGroup, MetricGroup,
|
||||
label::StaticLabelSet,
|
||||
metric::{MetricEncoding, counter::CounterState, group::Encoding, name::MetricName},
|
||||
};
|
||||
use metrics::{CounterPairAssoc, MeasuredCounterPairState};
|
||||
use thread_local::ThreadLocal;
|
||||
|
||||
type AllocCounter<T> = counters::DenseCounterPairVec<AllocPair<T>, T>;
|
||||
|
||||
pub struct TrackedAllocator<A, T: 'static + Send + Sync + FixedCardinalityLabel + LabelGroup> {
|
||||
inner: A,
|
||||
|
||||
/// potentially high-content fallback if the thread was not registered.
|
||||
default_counters: MeasuredCounterPairState,
|
||||
/// Default tag to use if this thread is not registered.
|
||||
default_tag: T,
|
||||
|
||||
thread: OnceLock<RegisteredThread<T>>,
|
||||
|
||||
/// where thread alloc data is eventually saved to, even if threads are shutdown.
|
||||
global: OnceLock<AllocCounter<T>>,
|
||||
}
|
||||
|
||||
impl<A, T> TrackedAllocator<A, T>
|
||||
where
|
||||
T: 'static + Send + Sync + FixedCardinalityLabel + LabelGroup,
|
||||
{
|
||||
/// # Safety
|
||||
///
|
||||
/// [`FixedCardinalityLabel`] must be implemented correctly, fully dense, and must not panic.
|
||||
pub const unsafe fn new(alloc: A, default: T) -> Self {
|
||||
TrackedAllocator {
|
||||
inner: alloc,
|
||||
default_tag: default,
|
||||
default_counters: MeasuredCounterPairState {
|
||||
inc: CounterState {
|
||||
count: AtomicU64::new(0),
|
||||
},
|
||||
dec: CounterState {
|
||||
count: AtomicU64::new(0),
|
||||
},
|
||||
},
|
||||
thread: OnceLock::new(),
|
||||
global: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Allocations
|
||||
pub fn register_thread(&'static self) {
|
||||
self.register_thread_inner();
|
||||
}
|
||||
|
||||
pub fn scope(&'static self, tag: T) -> AllocScope<'static, T> {
|
||||
let cell = self.register_thread_inner();
|
||||
let last = cell.replace(tag);
|
||||
AllocScope { cell, last }
|
||||
}
|
||||
|
||||
fn register_thread_inner(&'static self) -> &'static Cell<T> {
|
||||
let thread = self.thread.get_or_init(|| RegisteredThread {
|
||||
scope: ThreadLocal::new(),
|
||||
state: ThreadLocal::new(),
|
||||
});
|
||||
|
||||
thread.state.get_or(|| ThreadState {
|
||||
counters: AllocCounter::new(),
|
||||
global: self.global.get_or_init(AllocCounter::new),
|
||||
});
|
||||
|
||||
thread.scope.get_or(|| Cell::new(self.default_tag))
|
||||
}
|
||||
}
|
||||
|
||||
/// Generates a `GlobalAlloc` allocation method (`alloc` or `alloc_zeroed`).
///
/// The generated method extends the requested layout with room for a `T` tag,
/// forwards to the same-named method of the inner allocator, writes the
/// current tag after the caller's data and adds `layout.size()` to the
/// matching `inc` counter.
macro_rules! alloc {
    ($alloc_fn:ident) => {
        unsafe fn $alloc_fn(&self, layout: Layout) -> *mut u8 {
            // Room for the tag is appended to the layout; on overflow the
            // allocation is failed by returning null.
            let (tagged_layout, tag_offset) = match layout.extend(Layout::new::<T>()) {
                Ok((extended, offset)) => (extended.pad_to_align(), offset),
                Err(_) => return std::ptr::null_mut(),
            };

            // Safety: The layout is not zero-sized.
            let ptr = unsafe { self.inner.$alloc_fn(tagged_layout) };

            // allocation failed.
            if ptr.is_null() {
                return ptr;
            }

            // We are being very careful here to not allocate or panic.
            let registry = self.thread.get();
            let scope = registry.and_then(|r| r.scope.get());
            let state = registry.and_then(|r| r.state.get());
            let tag = match scope {
                Some(cell) => cell.get(),
                None => self.default_tag,
            };

            // Allocation successful. Write our tag
            // Safety: tag_offset is inbounds of the ptr
            unsafe { ptr.add(tag_offset).cast::<T>().write(tag) };

            let metric = match state {
                Some(state) => state.counters.vec.get_metric(tag),
                // if tag is not default, then the thread state would have been
                // registered, therefore tag must be default.
                None => &self.default_counters,
            };

            metric.inc.count.fetch_add(layout.size() as u64, Relaxed);

            ptr
        }
    };
}
|
||||
|
||||
// We will tag our allocation by adding `T` to the end of the layout.
|
||||
// This is ok only as long as it does not overflow. If it does, we will
|
||||
// just fail the allocation by returning null.
|
||||
//
|
||||
// Safety: we will not unwind during alloc, and we will ensure layouts are handled correctly.
|
||||
unsafe impl<A, T> GlobalAlloc for TrackedAllocator<A, T>
|
||||
where
|
||||
A: GlobalAlloc,
|
||||
T: 'static + Send + Sync + FixedCardinalityLabel + LabelGroup,
|
||||
{
|
||||
alloc!(alloc);
|
||||
alloc!(alloc_zeroed);
|
||||
|
||||
unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
|
||||
// SAFETY: the caller must ensure that the `new_size` does not overflow.
|
||||
// `layout.align()` comes from a `Layout` and is thus guaranteed to be valid.
|
||||
let new_layout = unsafe { Layout::from_size_align_unchecked(new_size, layout.align()) };
|
||||
|
||||
let Ok((new_tagged_layout, new_tag_offset)) = new_layout.extend(Layout::new::<T>()) else {
|
||||
return std::ptr::null_mut();
|
||||
};
|
||||
let new_tagged_layout = new_tagged_layout.pad_to_align();
|
||||
|
||||
let Ok((tagged_layout, tag_offset)) = layout.extend(Layout::new::<T>()) else {
|
||||
// Safety: This layout clearly did not match what was originally allocated,
|
||||
// otherwise alloc() would have caught this error and returned null.
|
||||
unsafe { std::hint::unreachable_unchecked() }
|
||||
};
|
||||
let tagged_layout = tagged_layout.pad_to_align();
|
||||
|
||||
// get the tag set during alloc
|
||||
// Safety: tag_offset is inbounds of the ptr
|
||||
let tag = unsafe { ptr.add(tag_offset).cast::<T>().read() };
|
||||
|
||||
// Safety: layout sizes are correct
|
||||
let new_ptr = unsafe {
|
||||
self.inner
|
||||
.realloc(ptr, tagged_layout, new_tagged_layout.size())
|
||||
};
|
||||
|
||||
// allocation failed.
|
||||
if new_ptr.is_null() {
|
||||
return new_ptr;
|
||||
}
|
||||
|
||||
// We are being very careful here to not allocate or panic.
|
||||
let thread = self.thread.get().map(|s| (s.scope.get(), s.state.get()));
|
||||
let new_tag = thread.and_then(|t| t.0).map_or(self.default_tag, Cell::get);
|
||||
|
||||
// Allocation successful. Write our tag
|
||||
// Safety: new_tag_offset is inbounds of the ptr
|
||||
unsafe { new_ptr.add(new_tag_offset).cast::<T>().write(new_tag) }
|
||||
|
||||
let counters = thread.and_then(|t| t.1).map(|s| &s.counters);
|
||||
let counters = counters.or_else(|| self.global.get());
|
||||
let (new_metric, old_metric) = if let Some(counters) = counters {
|
||||
let new_metric = counters.vec.get_metric(new_tag);
|
||||
let old_metric = counters.vec.get_metric(tag);
|
||||
|
||||
(new_metric, old_metric)
|
||||
} else {
|
||||
// no tag was registered at all, therefore both tags must be default.
|
||||
(&self.default_counters, &self.default_counters)
|
||||
};
|
||||
|
||||
let (inc, dec) = if tag.encode() != new_tag.encode() {
|
||||
(new_layout.size() as u64, layout.size() as u64)
|
||||
} else if new_layout.size() > layout.size() {
|
||||
((new_layout.size() - layout.size()) as u64, 0)
|
||||
} else {
|
||||
(0, (layout.size() - new_layout.size()) as u64)
|
||||
};
|
||||
|
||||
new_metric.inc.count.fetch_add(inc, Relaxed);
|
||||
old_metric.dec.count.fetch_add(dec, Relaxed);
|
||||
|
||||
new_ptr
|
||||
}
|
||||
|
||||
unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
|
||||
let Ok((tagged_layout, tag_offset)) = layout.extend(Layout::new::<T>()) else {
|
||||
// Safety: This layout clearly did not match what was originally allocated,
|
||||
// otherwise alloc() would have caught this error and returned null.
|
||||
unsafe { std::hint::unreachable_unchecked() }
|
||||
};
|
||||
let tagged_layout = tagged_layout.pad_to_align();
|
||||
|
||||
// get the tag set during alloc
|
||||
// Safety: tag_offset is inbounds of the ptr
|
||||
let tag = unsafe { ptr.add(tag_offset).cast::<T>().read() };
|
||||
|
||||
// Safety: caller upholds contract for us
|
||||
unsafe { self.inner.dealloc(ptr, tagged_layout) }
|
||||
|
||||
// We are being very careful here to not allocate or panic.
|
||||
let thread = self.thread.get().map(|s| (s.scope.get(), s.state.get()));
|
||||
let counters = thread.and_then(|t| t.1).map(|s| &s.counters);
|
||||
let counters = counters.or_else(|| self.global.get());
|
||||
|
||||
let metric = if let Some(counters) = counters {
|
||||
counters.vec.get_metric(tag)
|
||||
} else {
|
||||
// if tag is not default, then global would have been registered, therefore tag must be default.
|
||||
&self.default_counters
|
||||
};
|
||||
|
||||
metric.dec.count.fetch_add(layout.size() as u64, Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AllocScope<'a, T: FixedCardinalityLabel> {
|
||||
cell: &'a Cell<T>,
|
||||
last: T,
|
||||
}
|
||||
|
||||
impl<'a, T: FixedCardinalityLabel> Drop for AllocScope<'a, T> {
    /// Restores the tag that was active before this scope was entered.
    fn drop(&mut self) {
        let Self { cell, last } = self;
        cell.set(*last);
    }
}
|
||||
|
||||
struct AllocPair<T>(PhantomData<T>);
|
||||
|
||||
impl<T: FixedCardinalityLabel + LabelGroup> CounterPairAssoc for AllocPair<T> {
|
||||
const INC_NAME: &'static MetricName = MetricName::from_str("allocated_bytes");
|
||||
const DEC_NAME: &'static MetricName = MetricName::from_str("deallocated_bytes");
|
||||
|
||||
const INC_HELP: &'static str = "total number of bytes allocated";
|
||||
const DEC_HELP: &'static str = "total number of bytes deallocated";
|
||||
|
||||
type LabelGroupSet = StaticLabelSet<T>;
|
||||
}
|
||||
|
||||
struct RegisteredThread<T: 'static + Send + Sync + FixedCardinalityLabel + LabelGroup> {
|
||||
/// Current memory context for this thread.
|
||||
scope: ThreadLocal<Cell<T>>,
|
||||
/// per thread state containing low contention counters for faster allocations.
|
||||
state: ThreadLocal<ThreadState<T>>,
|
||||
}
|
||||
|
||||
struct ThreadState<T: 'static + FixedCardinalityLabel + LabelGroup> {
|
||||
counters: AllocCounter<T>,
|
||||
global: &'static AllocCounter<T>,
|
||||
}
|
||||
|
||||
// Ensure the counters are measured on thread destruction.
|
||||
impl<T: 'static + FixedCardinalityLabel + LabelGroup> Drop for ThreadState<T> {
|
||||
fn drop(&mut self) {
|
||||
// iterate over all labels
|
||||
for tag in (0..T::cardinality()).map(T::decode) {
|
||||
// load and reset the counts in the thread-local counters.
|
||||
let m = self.counters.vec.get_metric_mut(tag);
|
||||
let inc = *m.inc.count.get_mut();
|
||||
let dec = *m.dec.count.get_mut();
|
||||
|
||||
// add the counts into the global counters.
|
||||
let m = self.global.vec.get_metric(tag);
|
||||
m.inc.count.fetch_add(inc, Relaxed);
|
||||
m.dec.count.fetch_add(dec, Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<A, T, Enc> MetricGroup<Enc> for TrackedAllocator<A, T>
|
||||
where
|
||||
T: 'static + Send + Sync + FixedCardinalityLabel + LabelGroup,
|
||||
Enc: Encoding,
|
||||
CounterState: MetricEncoding<Enc>,
|
||||
{
|
||||
fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> {
|
||||
let global = self.global.get_or_init(AllocCounter::new);
|
||||
|
||||
// iterate over all counter threads
|
||||
for s in self.thread.get().into_iter().flat_map(|s| s.state.iter()) {
|
||||
// iterate over all labels
|
||||
for tag in (0..T::cardinality()).map(T::decode) {
|
||||
sample(global, s.counters.vec.get_metric(tag), tag);
|
||||
}
|
||||
}
|
||||
|
||||
sample(global, &self.default_counters, self.default_tag);
|
||||
|
||||
global.collect_group_into(enc)
|
||||
}
|
||||
}
|
||||
|
||||
fn sample<T: FixedCardinalityLabel + LabelGroup>(
|
||||
global: &AllocCounter<T>,
|
||||
local: &MeasuredCounterPairState,
|
||||
tag: T,
|
||||
) {
|
||||
// load and reset the counts in the thread-local counters.
|
||||
let inc = local.inc.count.swap(0, Relaxed);
|
||||
let dec = local.dec.count.swap(0, Relaxed);
|
||||
|
||||
// add the counts into the global counters.
|
||||
let m = global.vec.get_metric(tag);
|
||||
m.inc.count.fetch_add(inc, Relaxed);
|
||||
m.dec.count.fetch_add(dec, Relaxed);
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::alloc::{GlobalAlloc, Layout, System};

    use measured::{FixedCardinalityLabel, MetricGroup, text::BufferedTextEncoder};

    use crate::TrackedAllocator;

    /// Label used to tag allocations in these tests.
    #[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
    #[label(singleton = "memory_context")]
    pub enum MemoryContext {
        Root,
        Test,
    }

    /// Collects the allocator's metrics and returns them in text format.
    fn render(allocator: &TrackedAllocator<System, MemoryContext>) -> String {
        let mut encoder = BufferedTextEncoder::new();
        allocator.collect_group_into(&mut encoder).unwrap();
        String::from_utf8(encoder.finish().into()).unwrap()
    }

    /// Allocations follow the active scope, and a realloc under a different
    /// tag moves the bytes from the old tag to the new one.
    #[test]
    fn alloc() {
        // Safety: `MemoryContext` upholds the safety requirements.
        static GLOBAL: TrackedAllocator<System, MemoryContext> =
            unsafe { TrackedAllocator::new(System, MemoryContext::Root) };

        GLOBAL.register_thread();

        let one = Layout::for_value(&[0_i32]);
        let two = Layout::for_value(&[0_i32, 1_i32]);

        let test_scope = GLOBAL.scope(MemoryContext::Test);

        let ptr = unsafe { GLOBAL.alloc(one) };
        let ptr = unsafe { GLOBAL.realloc(ptr, one, 8) };

        drop(test_scope);

        let ptr = unsafe { GLOBAL.realloc(ptr, two, 4) };
        unsafe { GLOBAL.dealloc(ptr, one) };

        assert_eq!(
            render(&GLOBAL),
            r#"# HELP deallocated_bytes total number of bytes deallocated
# TYPE deallocated_bytes counter
deallocated_bytes{memory_context="root"} 4
deallocated_bytes{memory_context="test"} 8

# HELP allocated_bytes total number of bytes allocated
# TYPE allocated_bytes counter
allocated_bytes{memory_context="root"} 4
allocated_bytes{memory_context="test"} 8
"#
        );
    }

    /// A thread that never registered is still counted, under the default tag.
    #[test]
    fn unregistered_thread() {
        // Safety: `MemoryContext` upholds the safety requirements.
        static GLOBAL: TrackedAllocator<System, MemoryContext> =
            unsafe { TrackedAllocator::new(System, MemoryContext::Root) };

        GLOBAL.register_thread();

        // unregistered thread
        let worker = std::thread::spawn(|| {
            let layout = Layout::for_value(&[0_i32]);
            let ptr = unsafe { GLOBAL.alloc(layout) };
            unsafe { GLOBAL.dealloc(ptr, layout) };
        });
        worker.join().unwrap();

        assert_eq!(
            render(&GLOBAL),
            r#"# HELP deallocated_bytes total number of bytes deallocated
# TYPE deallocated_bytes counter
deallocated_bytes{memory_context="root"} 4
deallocated_bytes{memory_context="test"} 0

# HELP allocated_bytes total number of bytes allocated
# TYPE allocated_bytes counter
allocated_bytes{memory_context="root"} 4
allocated_bytes{memory_context="test"} 0
"#
        );
    }

    /// With no thread registered at all, counts go to the default tag.
    #[test]
    fn fully_unregistered() {
        // Safety: `MemoryContext` upholds the safety requirements.
        static GLOBAL: TrackedAllocator<System, MemoryContext> =
            unsafe { TrackedAllocator::new(System, MemoryContext::Root) };

        let layout = Layout::for_value(&[0_i32]);
        let ptr = unsafe { GLOBAL.alloc(layout) };
        unsafe { GLOBAL.dealloc(ptr, layout) };

        assert_eq!(
            render(&GLOBAL),
            r#"# HELP deallocated_bytes total number of bytes deallocated
# TYPE deallocated_bytes counter
deallocated_bytes{memory_context="root"} 4
deallocated_bytes{memory_context="test"} 0

# HELP allocated_bytes total number of bytes allocated
# TYPE allocated_bytes counter
allocated_bytes{memory_context="root"} 4
allocated_bytes{memory_context="test"} 0
"#
        );
    }
}
|
||||
72
libs/alloc-metrics/src/metric_vec.rs
Normal file
72
libs/alloc-metrics/src/metric_vec.rs
Normal file
@@ -0,0 +1,72 @@
|
||||
//! Dense metric vec
|
||||
|
||||
use measured::{
|
||||
FixedCardinalityLabel, LabelGroup,
|
||||
label::StaticLabelSet,
|
||||
metric::{
|
||||
MetricEncoding, MetricFamilyEncoding, MetricType, group::Encoding, name::MetricNameEncoder,
|
||||
},
|
||||
};
|
||||
|
||||
pub struct DenseMetricVec<M: MetricType, L: FixedCardinalityLabel + LabelGroup> {
|
||||
metrics: Box<[M]>,
|
||||
metadata: M::Metadata,
|
||||
_label_set: StaticLabelSet<L>,
|
||||
}
|
||||
|
||||
fn new_dense<M: MetricType>(c: usize) -> Box<[M]> {
|
||||
let mut vec = Vec::with_capacity(c);
|
||||
vec.resize_with(c, M::default);
|
||||
vec.into_boxed_slice()
|
||||
}
|
||||
|
||||
impl<M: MetricType, L: FixedCardinalityLabel + LabelGroup> DenseMetricVec<M, L>
|
||||
where
|
||||
M::Metadata: Default,
|
||||
{
|
||||
/// Create a new metric vec with the given label set and metric metadata
|
||||
pub fn new() -> Self {
|
||||
Self::with_metadata(<M::Metadata>::default())
|
||||
}
|
||||
}
|
||||
|
||||
impl<M: MetricType, L: FixedCardinalityLabel + LabelGroup> DenseMetricVec<M, L> {
|
||||
/// Create a new metric vec with the given label set and metric metadata
|
||||
pub fn with_metadata(metadata: M::Metadata) -> Self {
|
||||
Self {
|
||||
metrics: new_dense(L::cardinality()),
|
||||
metadata,
|
||||
_label_set: StaticLabelSet::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the individual metric at the given identifier.
|
||||
///
|
||||
/// # Panics
|
||||
/// Can panic or cause strange behaviour if the label ID comes from a different metric family.
|
||||
pub fn get_metric(&self, label: L) -> &M {
|
||||
// safety: The caller has guarantees that the label encoding is valid.
|
||||
unsafe { self.metrics.get_unchecked(label.encode()) }
|
||||
}
|
||||
|
||||
/// Get the individual metric at the given identifier.
|
||||
///
|
||||
/// # Panics
|
||||
/// Can panic or cause strange behaviour if the label ID comes from a different metric family.
|
||||
pub fn get_metric_mut(&mut self, label: L) -> &mut M {
|
||||
// safety: The caller has guarantees that the label encoding is valid.
|
||||
unsafe { self.metrics.get_unchecked_mut(label.encode()) }
|
||||
}
|
||||
}
|
||||
|
||||
impl<M: MetricEncoding<T>, L: FixedCardinalityLabel + LabelGroup, T: Encoding>
|
||||
MetricFamilyEncoding<T> for DenseMetricVec<M, L>
|
||||
{
|
||||
fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> {
|
||||
M::write_type(&name, enc)?;
|
||||
for (index, value) in self.metrics.iter().enumerate() {
|
||||
value.collect_into(&self.metadata, L::decode(index), &name, enc)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -46,16 +46,33 @@ pub struct ExtensionInstallResponse {
|
||||
pub version: ExtVersion,
|
||||
}
|
||||
|
||||
/// Status of the LFC prewarm process. The same state machine is reused for
|
||||
/// both autoprewarm (prewarm after compute/Postgres start using the previously
|
||||
/// stored LFC state) and explicit prewarming via API.
|
||||
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
pub enum LfcPrewarmState {
|
||||
/// Default value when compute boots up.
|
||||
#[default]
|
||||
NotPrewarmed,
|
||||
/// Prewarming thread is active and loading pages into LFC.
|
||||
Prewarming,
|
||||
/// We found requested LFC state in the endpoint storage and
|
||||
/// completed prewarming successfully.
|
||||
Completed,
|
||||
Failed {
|
||||
error: String,
|
||||
},
|
||||
/// Unexpected error happened during prewarming. Note, `Not Found 404`
|
||||
/// response from the endpoint storage is explicitly excluded here
|
||||
/// because it can normally happen on the first compute start,
|
||||
/// since LFC state is not available yet.
|
||||
Failed { error: String },
|
||||
/// We tried to fetch the corresponding LFC state from the endpoint storage,
|
||||
/// but received `Not Found 404`. This should normally happen only during the
|
||||
/// first endpoint start after creation with `autoprewarm: true`.
|
||||
///
|
||||
/// During the orchestrated prewarm via API, when a caller explicitly
|
||||
/// provides the LFC state key to prewarm from, it's the caller responsibility
|
||||
/// to handle this status as an error state in this case.
|
||||
Skipped,
|
||||
}
|
||||
|
||||
impl Display for LfcPrewarmState {
|
||||
@@ -64,6 +81,7 @@ impl Display for LfcPrewarmState {
|
||||
LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
|
||||
LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
|
||||
LfcPrewarmState::Completed => f.write_str("Completed"),
|
||||
LfcPrewarmState::Skipped => f.write_str("Skipped"),
|
||||
LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ anyhow.workspace = true
|
||||
arc-swap.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
neon_failpoint.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
hyper0.workspace = true
|
||||
itertools.workspace = true
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use neon_failpoint::{configure_failpoint, configure_failpoint_with_context, has_failpoints};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::failpoint_support::apply_failpoint;
|
||||
|
||||
use crate::error::ApiError;
|
||||
use crate::json::{json_request, json_response};
|
||||
@@ -14,16 +13,10 @@ pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
|
||||
pub struct FailpointConfig {
|
||||
/// Name of the fail point
|
||||
pub name: String,
|
||||
/// List of actions to take, using the format described in neon_failpoint
|
||||
/// List of actions to take, using the format described in `fail::cfg`
|
||||
///
|
||||
/// We support actions: "pause", "sleep(N)", "return", "return(value)", "exit", "off", "panic(message)"
|
||||
/// Plus probability-based actions: "N%return(value)", "N%M*return(value)", "N%action", "N%M*action"
|
||||
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
|
||||
pub actions: String,
|
||||
/// Optional context matching rules for conditional failpoints
|
||||
/// Each key-value pair specifies a context key and a regex pattern to match against
|
||||
/// All context matchers must match for the failpoint to trigger
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub context_matchers: Option<HashMap<String, String>>,
|
||||
}
|
||||
|
||||
/// Configure failpoints through http.
|
||||
@@ -31,7 +24,7 @@ pub async fn failpoints_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
if !has_failpoints() {
|
||||
if !fail::has_failpoints() {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Cannot manage failpoints because neon was compiled without failpoints support"
|
||||
)));
|
||||
@@ -39,24 +32,15 @@ pub async fn failpoints_handler(
|
||||
|
||||
let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
|
||||
for fp in failpoints {
|
||||
tracing::info!(
|
||||
"cfg failpoint: {} {} (context: {:?})",
|
||||
fp.name,
|
||||
fp.actions,
|
||||
fp.context_matchers
|
||||
);
|
||||
tracing::info!("cfg failpoint: {} {}", fp.name, fp.actions);
|
||||
|
||||
let cfg_result = if let Some(context_matchers) = fp.context_matchers {
|
||||
configure_failpoint_with_context(&fp.name, &fp.actions, context_matchers)
|
||||
} else {
|
||||
configure_failpoint(&fp.name, &fp.actions)
|
||||
};
|
||||
// We recognize one extra "action" that's not natively recognized
|
||||
// by the failpoints crate: exit, to immediately kill the process
|
||||
let cfg_result = apply_failpoint(&fp.name, &fp.actions);
|
||||
|
||||
if let Err(err) = cfg_result {
|
||||
if let Err(err_msg) = cfg_result {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Failed to configure failpoint '{}': {}",
|
||||
fp.name,
|
||||
err
|
||||
"Failed to configure failpoints: {err_msg}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,12 +4,14 @@
|
||||
//! a default registry.
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
use std::sync::RwLock;
|
||||
|
||||
use measured::label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels};
|
||||
use measured::metric::counter::CounterState;
|
||||
use measured::metric::gauge::GaugeState;
|
||||
use measured::metric::group::Encoding;
|
||||
use measured::metric::name::{MetricName, MetricNameEncoder};
|
||||
use measured::metric::{MetricEncoding, MetricFamilyEncoding};
|
||||
use measured::metric::{MetricEncoding, MetricFamilyEncoding, MetricType};
|
||||
use measured::{FixedCardinalityLabel, LabelGroup, MetricGroup};
|
||||
use once_cell::sync::Lazy;
|
||||
use prometheus::Registry;
|
||||
@@ -116,12 +118,52 @@ pub fn pow2_buckets(start: usize, end: usize) -> Vec<f64> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub struct InfoMetric<L: LabelGroup, M: MetricType = GaugeState> {
|
||||
label: RwLock<L>,
|
||||
metric: M,
|
||||
}
|
||||
|
||||
impl<L: LabelGroup> InfoMetric<L> {
|
||||
pub fn new(label: L) -> Self {
|
||||
Self::with_metric(label, GaugeState::new(1))
|
||||
}
|
||||
}
|
||||
|
||||
impl<L: LabelGroup, M: MetricType<Metadata = ()>> InfoMetric<L, M> {
|
||||
pub fn with_metric(label: L, metric: M) -> Self {
|
||||
Self {
|
||||
label: RwLock::new(label),
|
||||
metric,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_label(&self, label: L) {
|
||||
*self.label.write().unwrap() = label;
|
||||
}
|
||||
}
|
||||
|
||||
impl<L, M, E> MetricFamilyEncoding<E> for InfoMetric<L, M>
|
||||
where
|
||||
L: LabelGroup,
|
||||
M: MetricEncoding<E, Metadata = ()>,
|
||||
E: Encoding,
|
||||
{
|
||||
fn collect_family_into(
|
||||
&self,
|
||||
name: impl measured::metric::name::MetricNameEncoder,
|
||||
enc: &mut E,
|
||||
) -> Result<(), E::Err> {
|
||||
M::write_type(&name, enc)?;
|
||||
self.metric
|
||||
.collect_into(&(), &*self.label.read().unwrap(), name, enc)
|
||||
}
|
||||
}
|
||||
|
||||
/// Build/version information that is attached to a metric as labels.
pub struct BuildInfo {
    /// Value of the `revision` label.
    pub revision: &'static str,
    /// Build tag of this binary.
    pub build_tag: &'static str,
}
|
||||
|
||||
// todo: allow label group without the set
|
||||
impl LabelGroup for BuildInfo {
|
||||
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
|
||||
const REVISION: &LabelName = LabelName::from_str("revision");
|
||||
@@ -131,24 +173,6 @@ impl LabelGroup for BuildInfo {
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
|
||||
where
|
||||
GaugeState: MetricEncoding<T>,
|
||||
{
|
||||
fn collect_family_into(
|
||||
&self,
|
||||
name: impl measured::metric::name::MetricNameEncoder,
|
||||
enc: &mut T,
|
||||
) -> Result<(), T::Err> {
|
||||
enc.write_help(&name, "Build/version information")?;
|
||||
GaugeState::write_type(&name, enc)?;
|
||||
GaugeState {
|
||||
count: std::sync::atomic::AtomicI64::new(1),
|
||||
}
|
||||
.collect_into(&(), self, name, enc)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
#[metric(new(build_info: BuildInfo))]
|
||||
pub struct NeonMetrics {
|
||||
@@ -165,8 +189,8 @@ pub struct NeonMetrics {
|
||||
#[derive(MetricGroup)]
|
||||
#[metric(new(build_info: BuildInfo))]
|
||||
pub struct LibMetrics {
|
||||
#[metric(init = build_info)]
|
||||
build_info: BuildInfo,
|
||||
#[metric(init = InfoMetric::new(build_info))]
|
||||
build_info: InfoMetric<BuildInfo>,
|
||||
|
||||
#[metric(flatten)]
|
||||
rusage: Rusage,
|
||||
@@ -454,7 +478,7 @@ pub trait CounterPairAssoc {
|
||||
}
|
||||
|
||||
pub struct CounterPairVec<A: CounterPairAssoc> {
|
||||
vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
|
||||
pub vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
|
||||
}
|
||||
|
||||
impl<A: CounterPairAssoc> Default for CounterPairVec<A>
|
||||
@@ -468,6 +492,17 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<A: CounterPairAssoc> CounterPairVec<A>
|
||||
where
|
||||
A::LabelGroupSet: Default,
|
||||
{
|
||||
pub fn dense() -> Self {
|
||||
Self {
|
||||
vec: measured::metric::MetricVec::dense(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<A: CounterPairAssoc> CounterPairVec<A> {
|
||||
pub fn guard(
|
||||
&self,
|
||||
@@ -477,14 +512,31 @@ impl<A: CounterPairAssoc> CounterPairVec<A> {
|
||||
self.vec.get_metric(id).inc.inc();
|
||||
MeasuredCounterPairGuard { vec: &self.vec, id }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn inc(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
|
||||
let id = self.vec.with_labels(labels);
|
||||
self.vec.get_metric(id).inc.inc();
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn dec(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
|
||||
let id = self.vec.with_labels(labels);
|
||||
self.vec.get_metric(id).dec.inc();
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn inc_by(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>, x: u64) {
|
||||
let id = self.vec.with_labels(labels);
|
||||
self.vec.get_metric(id).inc.inc_by(x);
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn dec_by(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>, x: u64) {
|
||||
let id = self.vec.with_labels(labels);
|
||||
self.vec.get_metric(id).dec.inc_by(x);
|
||||
}
|
||||
|
||||
pub fn remove_metric(
|
||||
&self,
|
||||
labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
|
||||
@@ -529,6 +581,28 @@ pub struct MeasuredCounterPairState {
|
||||
pub dec: CounterState,
|
||||
}
|
||||
|
||||
impl MeasuredCounterPairState {
|
||||
#[inline]
|
||||
pub fn inc(&self) {
|
||||
self.inc.inc();
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn dec(&self) {
|
||||
self.dec.inc();
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn inc_by(&self, x: u64) {
|
||||
self.inc.inc_by(x);
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn dec_by(&self, x: u64) {
|
||||
self.dec.inc_by(x);
|
||||
}
|
||||
}
|
||||
|
||||
/// A counter pair needs no per-family metadata.
impl measured::metric::MetricType for MeasuredCounterPairState {
    type Metadata = ();
}
|
||||
@@ -545,9 +619,9 @@ impl<A: CounterPairAssoc> Drop for MeasuredCounterPairGuard<'_, A> {
|
||||
}
|
||||
|
||||
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder.
pub struct Inc<T>(pub T);
|
||||
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder.
pub struct Dec<T>(pub T);
|
||||
|
||||
impl<T: Encoding> Encoding for Inc<T> {
|
||||
type Err = T::Err;
|
||||
|
||||
@@ -8,6 +8,13 @@ license.workspace = true
|
||||
thiserror.workspace = true
|
||||
nix.workspace=true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
libc.workspace = true
|
||||
lock_api.workspace = true
|
||||
rustc-hash.workspace = true
|
||||
|
||||
[target.'cfg(target_os = "macos")'.dependencies]
|
||||
tempfile = "3.14.0"
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.9"
|
||||
rand_distr = "0.5.1"
|
||||
|
||||
583
libs/neon-shmem/src/hash.rs
Normal file
583
libs/neon-shmem/src/hash.rs
Normal file
@@ -0,0 +1,583 @@
|
||||
//! Resizable hash table implementation on top of byte-level storage (either a [`ShmemHandle`] or a fixed byte array).
|
||||
//!
|
||||
//! This hash table has two major components: the bucket array and the dictionary. Each bucket within the
|
||||
//! bucket array contains an `Option<(K, V)>` and an index of another bucket. In this way there is both an
|
||||
//! implicit freelist within the bucket array (`None` buckets point to other `None` entries) and various hash
|
||||
//! chains within the bucket array (a Some bucket will point to other Some buckets that had the same hash).
|
||||
//!
|
||||
//! Buckets are never moved unless they are within a region that is being shrunk, and so the actual hash-
|
||||
//! dependent component is done with the dictionary. When a new key is inserted into the map, a position
|
||||
//! within the dictionary is decided based on its hash, the data is inserted into an empty bucket based
|
||||
//! off of the freelist, and then the index of said bucket is placed in the dictionary.
|
||||
//!
|
||||
//! This map is resizable (if initialized on top of a [`ShmemHandle`]). Both growing and shrinking happen
|
||||
//! in-place and are at a high level achieved by expanding/reducing the bucket array and rebuilding the
|
||||
//! dictionary by rehashing all keys.
|
||||
//!
|
||||
//! Concurrency is managed very simply: the entire map is guarded by one shared-memory RwLock.
|
||||
|
||||
use std::hash::{BuildHasher, Hash};
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
use crate::shmem::ShmemHandle;
|
||||
use crate::{shmem, sync::*};
|
||||
|
||||
mod core;
|
||||
pub mod entry;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use core::{Bucket, CoreHashMap, INVALID_POS};
|
||||
use entry::{Entry, OccupiedEntry, PrevPos, VacantEntry};
|
||||
|
||||
use thiserror::Error;
|
||||
|
||||
/// Error type for a hashmap shrink operation.
|
||||
#[derive(Error, Debug)]
|
||||
pub enum HashMapShrinkError {
|
||||
/// There was an error encountered while resizing the memory area.
|
||||
#[error("shmem resize failed: {0}")]
|
||||
ResizeError(shmem::Error),
|
||||
/// Occupied entries in to-be-shrunk space were encountered beginning at the given index.
|
||||
#[error("occupied entry in deallocated space found at {0}")]
|
||||
RemainingEntries(usize),
|
||||
}
|
||||
|
||||
/// This represents a hash table that (possibly) lives in shared memory.
|
||||
/// If a new process is launched with fork(), the child process inherits
|
||||
/// this struct.
|
||||
#[must_use]
|
||||
pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
|
||||
shmem_handle: Option<ShmemHandle>,
|
||||
shared_ptr: *mut HashMapShared<'a, K, V>,
|
||||
shared_size: usize,
|
||||
hasher: S,
|
||||
num_buckets: u32,
|
||||
}
|
||||
|
||||
/// This is a per-process handle to a hash table that (possibly) lives in shared memory.
|
||||
/// If a child process is launched with fork(), the child process should
|
||||
/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
|
||||
///
|
||||
/// XXX: We're not making use of it at the moment, but this struct could
|
||||
/// hold process-local information in the future.
|
||||
pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
|
||||
shmem_handle: Option<ShmemHandle>,
|
||||
shared_ptr: *mut HashMapShared<'a, K, V>,
|
||||
hasher: S,
|
||||
}
|
||||
|
||||
unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
|
||||
unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
|
||||
|
||||
impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
|
||||
/// Change the 'hasher' used by the hash table.
|
||||
///
|
||||
/// NOTE: This must be called right after creating the hash table,
|
||||
/// before inserting any entries and before calling attach_writer/reader.
|
||||
/// Otherwise different accessors could be using different hash function,
|
||||
/// with confusing results.
|
||||
pub fn with_hasher<T: BuildHasher>(self, hasher: T) -> HashMapInit<'a, K, V, T> {
|
||||
HashMapInit {
|
||||
hasher,
|
||||
shmem_handle: self.shmem_handle,
|
||||
shared_ptr: self.shared_ptr,
|
||||
shared_size: self.shared_size,
|
||||
num_buckets: self.num_buckets,
|
||||
}
|
||||
}
|
||||
|
||||
/// Loosely (over)estimate the size needed to store a hash table with `num_buckets` buckets.
|
||||
pub fn estimate_size(num_buckets: u32) -> usize {
|
||||
// add some margin to cover alignment etc.
|
||||
CoreHashMap::<K, V>::estimate_size(num_buckets) + size_of::<HashMapShared<K, V>>() + 1000
|
||||
}
|
||||
|
||||
fn new(
|
||||
num_buckets: u32,
|
||||
shmem_handle: Option<ShmemHandle>,
|
||||
area_ptr: *mut u8,
|
||||
area_size: usize,
|
||||
hasher: S,
|
||||
) -> Self {
|
||||
let mut ptr: *mut u8 = area_ptr;
|
||||
let end_ptr: *mut u8 = unsafe { ptr.add(area_size) };
|
||||
|
||||
// carve out area for the One Big Lock (TM) and the HashMapShared.
|
||||
ptr = unsafe { ptr.add(ptr.align_offset(align_of::<libc::pthread_rwlock_t>())) };
|
||||
let raw_lock_ptr = ptr;
|
||||
ptr = unsafe { ptr.add(size_of::<libc::pthread_rwlock_t>()) };
|
||||
ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
|
||||
let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
|
||||
ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };
|
||||
|
||||
// carve out the buckets
|
||||
ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<core::Bucket<K, V>>())) };
|
||||
let buckets_ptr = ptr;
|
||||
ptr = unsafe { ptr.add(size_of::<core::Bucket<K, V>>() * num_buckets as usize) };
|
||||
|
||||
// use remaining space for the dictionary
|
||||
ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<u32>())) };
|
||||
assert!(ptr.addr() < end_ptr.addr());
|
||||
let dictionary_ptr = ptr;
|
||||
let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::<u32>() as isize };
|
||||
assert!(dictionary_size > 0);
|
||||
|
||||
let buckets =
|
||||
unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), num_buckets as usize) };
|
||||
let dictionary = unsafe {
|
||||
std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize)
|
||||
};
|
||||
|
||||
let hashmap = CoreHashMap::new(buckets, dictionary);
|
||||
unsafe {
|
||||
let lock = RwLock::from_raw(PthreadRwLock::new(raw_lock_ptr.cast()), hashmap);
|
||||
std::ptr::write(shared_ptr, lock);
|
||||
}
|
||||
|
||||
Self {
|
||||
num_buckets,
|
||||
shmem_handle,
|
||||
shared_ptr,
|
||||
shared_size: area_size,
|
||||
hasher,
|
||||
}
|
||||
}
|
||||
|
||||
/// Attach to a hash table for writing.
|
||||
pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> {
|
||||
HashMapAccess {
|
||||
shmem_handle: self.shmem_handle,
|
||||
shared_ptr: self.shared_ptr,
|
||||
hasher: self.hasher,
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize a table for reading. Currently identical to [`HashMapInit::attach_writer`].
|
||||
///
|
||||
/// This is a holdover from a previous implementation and is being kept around for
|
||||
/// backwards compatibility reasons.
|
||||
pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> {
|
||||
self.attach_writer()
|
||||
}
|
||||
}
|
||||
|
||||
/// Hash table data that is actually stored in the shared memory area.
|
||||
///
|
||||
/// NOTE: We carve out the parts from a contiguous chunk. Growing and shrinking the hash table
|
||||
/// relies on the memory layout! The data structures are laid out in the contiguous shared memory
|
||||
/// area as follows:
|
||||
///
|
||||
/// [`libc::pthread_rwlock_t`]
|
||||
/// [`HashMapShared`]
|
||||
/// buckets
|
||||
/// dictionary
|
||||
///
|
||||
/// In between the above parts, there can be padding bytes to align the parts correctly.
|
||||
type HashMapShared<'a, K, V> = RwLock<CoreHashMap<'a, K, V>>;
|
||||
|
||||
impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher>
|
||||
where
|
||||
K: Clone + Hash + Eq,
|
||||
{
|
||||
/// Place the hash table within a user-supplied fixed memory area.
|
||||
pub fn with_fixed(num_buckets: u32, area: &'a mut [MaybeUninit<u8>]) -> Self {
|
||||
Self::new(
|
||||
num_buckets,
|
||||
None,
|
||||
area.as_mut_ptr().cast(),
|
||||
area.len(),
|
||||
rustc_hash::FxBuildHasher,
|
||||
)
|
||||
}
|
||||
|
||||
/// Place a new hash map in the given shared memory area
|
||||
///
|
||||
/// # Panics
|
||||
/// Will panic on failure to resize area to expected map size.
|
||||
pub fn with_shmem(num_buckets: u32, shmem: ShmemHandle) -> Self {
|
||||
let size = Self::estimate_size(num_buckets);
|
||||
shmem
|
||||
.set_size(size)
|
||||
.expect("could not resize shared memory area");
|
||||
let ptr = shmem.data_ptr.as_ptr().cast();
|
||||
Self::new(
|
||||
num_buckets,
|
||||
Some(shmem),
|
||||
ptr,
|
||||
size,
|
||||
rustc_hash::FxBuildHasher,
|
||||
)
|
||||
}
|
||||
|
||||
/// Make a resizable hash map within a new shared memory area with the given name.
|
||||
pub fn new_resizeable_named(num_buckets: u32, max_buckets: u32, name: &str) -> Self {
|
||||
let size = Self::estimate_size(num_buckets);
|
||||
let max_size = Self::estimate_size(max_buckets);
|
||||
let shmem =
|
||||
ShmemHandle::new(name, size, max_size).expect("failed to make shared memory area");
|
||||
let ptr = shmem.data_ptr.as_ptr().cast();
|
||||
|
||||
Self::new(
|
||||
num_buckets,
|
||||
Some(shmem),
|
||||
ptr,
|
||||
size,
|
||||
rustc_hash::FxBuildHasher,
|
||||
)
|
||||
}
|
||||
|
||||
/// Make a resizable hash map within a new anonymous shared memory area.
|
||||
pub fn new_resizeable(num_buckets: u32, max_buckets: u32) -> Self {
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
static COUNTER: AtomicUsize = AtomicUsize::new(0);
|
||||
let val = COUNTER.fetch_add(1, Ordering::Relaxed);
|
||||
let name = format!("neon_shmem_hmap{val}");
|
||||
Self::new_resizeable_named(num_buckets, max_buckets, &name)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S>
|
||||
where
|
||||
K: Clone + Hash + Eq,
|
||||
{
|
||||
/// Hash a key using the map's hasher.
|
||||
#[inline]
|
||||
fn get_hash_value(&self, key: &K) -> u64 {
|
||||
self.hasher.hash_one(key)
|
||||
}
|
||||
|
||||
fn entry_with_hash(&self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
|
||||
let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
|
||||
let dict_pos = hash as usize % map.dictionary.len();
|
||||
let first = map.dictionary[dict_pos];
|
||||
if first == INVALID_POS {
|
||||
// no existing entry
|
||||
return Entry::Vacant(VacantEntry {
|
||||
map,
|
||||
key,
|
||||
dict_pos: dict_pos as u32,
|
||||
});
|
||||
}
|
||||
|
||||
let mut prev_pos = PrevPos::First(dict_pos as u32);
|
||||
let mut next = first;
|
||||
loop {
|
||||
let bucket = &mut map.buckets[next as usize];
|
||||
let (bucket_key, _bucket_value) = bucket.inner.as_mut().expect("entry is in use");
|
||||
if *bucket_key == key {
|
||||
// found existing entry
|
||||
return Entry::Occupied(OccupiedEntry {
|
||||
map,
|
||||
_key: key,
|
||||
prev_pos,
|
||||
bucket_pos: next,
|
||||
});
|
||||
}
|
||||
|
||||
if bucket.next == INVALID_POS {
|
||||
// No existing entry
|
||||
return Entry::Vacant(VacantEntry {
|
||||
map,
|
||||
key,
|
||||
dict_pos: dict_pos as u32,
|
||||
});
|
||||
}
|
||||
prev_pos = PrevPos::Chained(next);
|
||||
next = bucket.next;
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a reference to the corresponding value for a key.
|
||||
pub fn get<'e>(&'e self, key: &K) -> Option<ValueReadGuard<'e, V>> {
|
||||
let hash = self.get_hash_value(key);
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
|
||||
RwLockReadGuard::try_map(map, |m| m.get_with_hash(key, hash)).ok()
|
||||
}
|
||||
|
||||
/// Get a reference to the entry containing a key.
|
||||
///
|
||||
/// NB: THis takes a write lock as there's no way to distinguish whether the intention
|
||||
/// is to use the entry for reading or for writing in advance.
|
||||
pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> {
|
||||
let hash = self.get_hash_value(&key);
|
||||
self.entry_with_hash(key, hash)
|
||||
}
|
||||
|
||||
/// Remove a key given its hash. Returns the associated value if it existed.
|
||||
pub fn remove(&self, key: &K) -> Option<V> {
|
||||
let hash = self.get_hash_value(key);
|
||||
match self.entry_with_hash(key.clone(), hash) {
|
||||
Entry::Occupied(e) => Some(e.remove()),
|
||||
Entry::Vacant(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert/update a key. Returns the previous associated value if it existed.
|
||||
///
|
||||
/// # Errors
|
||||
/// Will return [`core::FullError`] if there is no more space left in the map.
|
||||
pub fn insert(&self, key: K, value: V) -> Result<Option<V>, core::FullError> {
|
||||
let hash = self.get_hash_value(&key);
|
||||
match self.entry_with_hash(key.clone(), hash) {
|
||||
Entry::Occupied(mut e) => Ok(Some(e.insert(value))),
|
||||
Entry::Vacant(e) => {
|
||||
_ = e.insert(value)?;
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Optionally return the entry for a bucket at a given index if it exists.
|
||||
///
|
||||
/// Has more overhead than one would intuitively expect: performs both a clone of the key
|
||||
/// due to the [`OccupiedEntry`] type owning the key and also a hash of the key in order
|
||||
/// to enable repairing the hash chain if the entry is removed.
|
||||
pub fn entry_at_bucket(&self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
|
||||
let map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
|
||||
if pos >= map.buckets.len() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let entry = map.buckets[pos].inner.as_ref();
|
||||
match entry {
|
||||
Some((key, _)) => Some(OccupiedEntry {
|
||||
_key: key.clone(),
|
||||
bucket_pos: pos as u32,
|
||||
prev_pos: entry::PrevPos::Unknown(self.get_hash_value(key)),
|
||||
map,
|
||||
}),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of buckets in the table.
|
||||
pub fn get_num_buckets(&self) -> usize {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
|
||||
map.get_num_buckets()
|
||||
}
|
||||
|
||||
/// Return the key and value stored in bucket with given index. This can be used to
|
||||
/// iterate through the hash map.
|
||||
// TODO: An Iterator might be nicer. The communicator's clock algorithm needs to
|
||||
// _slowly_ iterate through all buckets with its clock hand, without holding a lock.
|
||||
// If we switch to an Iterator, it must not hold the lock.
|
||||
pub fn get_at_bucket(&self, pos: usize) -> Option<ValueReadGuard<(K, V)>> {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
|
||||
if pos >= map.buckets.len() {
|
||||
return None;
|
||||
}
|
||||
RwLockReadGuard::try_map(map, |m| m.buckets[pos].inner.as_ref()).ok()
|
||||
}
|
||||
|
||||
/// Returns the index of the bucket a given value corresponds to.
|
||||
pub fn get_bucket_for_value(&self, val_ptr: *const V) -> usize {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
|
||||
|
||||
let origin = map.buckets.as_ptr();
|
||||
let idx = (val_ptr as usize - origin as usize) / size_of::<Bucket<K, V>>();
|
||||
assert!(idx < map.buckets.len());
|
||||
|
||||
idx
|
||||
}
|
||||
|
||||
/// Returns the number of occupied buckets in the table.
|
||||
pub fn get_num_buckets_in_use(&self) -> usize {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
|
||||
map.buckets_in_use as usize
|
||||
}
|
||||
|
||||
/// Clears all entries in a table. Does not reset any shrinking operations.
|
||||
pub fn clear(&self) {
|
||||
let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
|
||||
map.clear();
|
||||
}
|
||||
|
||||
/// Perform an in-place rehash of some region (0..`rehash_buckets`) of the table and reset
|
||||
/// the `buckets` and `dictionary` slices to be as long as `num_buckets`. Resets the freelist
|
||||
/// in the process.
|
||||
fn rehash_dict(
|
||||
&self,
|
||||
inner: &mut CoreHashMap<'a, K, V>,
|
||||
buckets_ptr: *mut core::Bucket<K, V>,
|
||||
end_ptr: *mut u8,
|
||||
num_buckets: u32,
|
||||
rehash_buckets: u32,
|
||||
) {
|
||||
inner.free_head = INVALID_POS;
|
||||
|
||||
let buckets;
|
||||
let dictionary;
|
||||
unsafe {
|
||||
let buckets_end_ptr = buckets_ptr.add(num_buckets as usize);
|
||||
let dictionary_ptr: *mut u32 = buckets_end_ptr
|
||||
.byte_add(buckets_end_ptr.align_offset(align_of::<u32>()))
|
||||
.cast();
|
||||
let dictionary_size: usize =
|
||||
end_ptr.byte_offset_from(buckets_end_ptr) as usize / size_of::<u32>();
|
||||
|
||||
buckets = std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize);
|
||||
dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size);
|
||||
}
|
||||
for e in dictionary.iter_mut() {
|
||||
*e = INVALID_POS;
|
||||
}
|
||||
|
||||
for (i, bucket) in buckets.iter_mut().enumerate().take(rehash_buckets as usize) {
|
||||
if bucket.inner.is_none() {
|
||||
bucket.next = inner.free_head;
|
||||
inner.free_head = i as u32;
|
||||
continue;
|
||||
}
|
||||
|
||||
let hash = self.hasher.hash_one(&bucket.inner.as_ref().unwrap().0);
|
||||
let pos: usize = (hash % dictionary.len() as u64) as usize;
|
||||
bucket.next = dictionary[pos];
|
||||
dictionary[pos] = i as u32;
|
||||
}
|
||||
|
||||
inner.dictionary = dictionary;
|
||||
inner.buckets = buckets;
|
||||
}
|
||||
|
||||
/// Rehash the map without growing or shrinking.
|
||||
pub fn shuffle(&self) {
|
||||
let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
|
||||
let num_buckets = map.get_num_buckets() as u32;
|
||||
let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
|
||||
let end_ptr: *mut u8 = unsafe { self.shared_ptr.byte_add(size_bytes).cast() };
|
||||
let buckets_ptr = map.buckets.as_mut_ptr();
|
||||
self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets);
|
||||
}
|
||||
|
||||
/// Grow the number of buckets within the table.
|
||||
///
|
||||
/// 1. Grows the underlying shared memory area
|
||||
/// 2. Initializes new buckets and overwrites the current dictionary
|
||||
/// 3. Rehashes the dictionary
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if called on a map initialized with [`HashMapInit::with_fixed`].
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an [`shmem::Error`] if any errors occur resizing the memory region.
|
||||
pub fn grow(&self, num_buckets: u32) -> Result<(), shmem::Error> {
|
||||
let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
|
||||
let old_num_buckets = map.buckets.len() as u32;
|
||||
|
||||
assert!(
|
||||
num_buckets >= old_num_buckets,
|
||||
"grow called with a smaller number of buckets"
|
||||
);
|
||||
if num_buckets == old_num_buckets {
|
||||
return Ok(());
|
||||
}
|
||||
let shmem_handle = self
|
||||
.shmem_handle
|
||||
.as_ref()
|
||||
.expect("grow called on a fixed-size hash table");
|
||||
|
||||
let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
|
||||
shmem_handle.set_size(size_bytes)?;
|
||||
let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
|
||||
|
||||
// Initialize new buckets. The new buckets are linked to the free list.
|
||||
// NB: This overwrites the dictionary!
|
||||
let buckets_ptr = map.buckets.as_mut_ptr();
|
||||
unsafe {
|
||||
for i in old_num_buckets..num_buckets {
|
||||
let bucket = buckets_ptr.add(i as usize);
|
||||
bucket.write(core::Bucket {
|
||||
next: if i < num_buckets - 1 {
|
||||
i + 1
|
||||
} else {
|
||||
map.free_head
|
||||
},
|
||||
inner: None,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, old_num_buckets);
|
||||
map.free_head = old_num_buckets;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Begin a shrink, limiting all new allocations to be in buckets with index below `num_buckets`.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if called on a map initialized with [`HashMapInit::with_fixed`] or if `num_buckets` is
|
||||
/// greater than the number of buckets in the map.
|
||||
pub fn begin_shrink(&mut self, num_buckets: u32) {
|
||||
let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
|
||||
assert!(
|
||||
num_buckets <= map.get_num_buckets() as u32,
|
||||
"shrink called with a larger number of buckets"
|
||||
);
|
||||
_ = self
|
||||
.shmem_handle
|
||||
.as_ref()
|
||||
.expect("shrink called on a fixed-size hash table");
|
||||
map.alloc_limit = num_buckets;
|
||||
}
|
||||
|
||||
/// If a shrink operation is underway, returns the target size of the map. Otherwise, returns None.
|
||||
pub fn shrink_goal(&self) -> Option<usize> {
|
||||
let map = unsafe { self.shared_ptr.as_mut() }.unwrap().read();
|
||||
let goal = map.alloc_limit;
|
||||
if goal == INVALID_POS {
|
||||
None
|
||||
} else {
|
||||
Some(goal as usize)
|
||||
}
|
||||
}
|
||||
|
||||
/// Complete a shrink after caller has evicted entries, removing the unused buckets and rehashing.
|
||||
///
|
||||
/// # Panics
|
||||
/// The following cases result in a panic:
|
||||
/// - Calling this function on a map initialized with [`HashMapInit::with_fixed`].
|
||||
/// - Calling this function on a map when no shrink operation is in progress.
|
||||
pub fn finish_shrink(&self) -> Result<(), HashMapShrinkError> {
|
||||
let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
|
||||
assert!(
|
||||
map.alloc_limit != INVALID_POS,
|
||||
"called finish_shrink when no shrink is in progress"
|
||||
);
|
||||
|
||||
let num_buckets = map.alloc_limit;
|
||||
|
||||
if map.get_num_buckets() == num_buckets as usize {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
assert!(
|
||||
map.buckets_in_use <= num_buckets,
|
||||
"called finish_shrink before enough entries were removed"
|
||||
);
|
||||
|
||||
for i in (num_buckets as usize)..map.buckets.len() {
|
||||
if map.buckets[i].inner.is_some() {
|
||||
return Err(HashMapShrinkError::RemainingEntries(i));
|
||||
}
|
||||
}
|
||||
|
||||
let shmem_handle = self
|
||||
.shmem_handle
|
||||
.as_ref()
|
||||
.expect("shrink called on a fixed-size hash table");
|
||||
|
||||
let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
|
||||
if let Err(e) = shmem_handle.set_size(size_bytes) {
|
||||
return Err(HashMapShrinkError::ResizeError(e));
|
||||
}
|
||||
let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
|
||||
let buckets_ptr = map.buckets.as_mut_ptr();
|
||||
self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets);
|
||||
map.alloc_limit = INVALID_POS;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
174
libs/neon-shmem/src/hash/core.rs
Normal file
174
libs/neon-shmem/src/hash/core.rs
Normal file
@@ -0,0 +1,174 @@
|
||||
//! Simple hash table with chaining.
|
||||
|
||||
use std::hash::Hash;
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
use crate::hash::entry::*;
|
||||
|
||||
/// Invalid position within the map (either within the dictionary or bucket array).
|
||||
pub(crate) const INVALID_POS: u32 = u32::MAX;
|
||||
|
||||
/// A single slot of the hash table: either empty or holding one key-value pair.
///
/// Every bucket is linked into exactly one chain through `next`: the freelist while it is
/// empty, or a hash chain while it is occupied.
pub(crate) struct Bucket<K, V> {
    /// Index of the following bucket in the same chain.
    pub(crate) next: u32,
    /// The stored key-value pair; `None` while the bucket is unused.
    pub(crate) inner: Option<(K, V)>,
}
|
||||
|
||||
/// Core hash table implementation.
|
||||
pub(crate) struct CoreHashMap<'a, K, V> {
|
||||
/// Dictionary used to map hashes to bucket indices.
|
||||
pub(crate) dictionary: &'a mut [u32],
|
||||
/// Buckets containing key-value pairs.
|
||||
pub(crate) buckets: &'a mut [Bucket<K, V>],
|
||||
/// Head of the freelist.
|
||||
pub(crate) free_head: u32,
|
||||
/// Maximum index of a bucket allowed to be allocated. [`INVALID_POS`] if no limit.
|
||||
pub(crate) alloc_limit: u32,
|
||||
/// The number of currently occupied buckets.
|
||||
pub(crate) buckets_in_use: u32,
|
||||
}
|
||||
|
||||
/// Returned when a bucket is needed but no free bucket can be allocated.
#[derive(Debug, PartialEq)]
pub struct FullError;
|
||||
|
||||
impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> {
|
||||
const FILL_FACTOR: f32 = 0.60;
|
||||
|
||||
/// Estimate the size of data contained within the the hash map.
|
||||
pub fn estimate_size(num_buckets: u32) -> usize {
|
||||
let mut size = 0;
|
||||
|
||||
// buckets
|
||||
size += size_of::<Bucket<K, V>>() * num_buckets as usize;
|
||||
|
||||
// dictionary
|
||||
size += (f32::ceil((size_of::<u32>() * num_buckets as usize) as f32 / Self::FILL_FACTOR))
|
||||
as usize;
|
||||
|
||||
size
|
||||
}
|
||||
|
||||
pub fn new(
|
||||
buckets: &'a mut [MaybeUninit<Bucket<K, V>>],
|
||||
dictionary: &'a mut [MaybeUninit<u32>],
|
||||
) -> Self {
|
||||
// Initialize the buckets
|
||||
for i in 0..buckets.len() {
|
||||
buckets[i].write(Bucket {
|
||||
next: if i < buckets.len() - 1 {
|
||||
i as u32 + 1
|
||||
} else {
|
||||
INVALID_POS
|
||||
},
|
||||
inner: None,
|
||||
});
|
||||
}
|
||||
|
||||
// Initialize the dictionary
|
||||
for e in dictionary.iter_mut() {
|
||||
e.write(INVALID_POS);
|
||||
}
|
||||
|
||||
// TODO: use std::slice::assume_init_mut() once it stabilizes
|
||||
let buckets =
|
||||
unsafe { std::slice::from_raw_parts_mut(buckets.as_mut_ptr().cast(), buckets.len()) };
|
||||
let dictionary = unsafe {
|
||||
std::slice::from_raw_parts_mut(dictionary.as_mut_ptr().cast(), dictionary.len())
|
||||
};
|
||||
|
||||
Self {
|
||||
dictionary,
|
||||
buckets,
|
||||
free_head: 0,
|
||||
buckets_in_use: 0,
|
||||
alloc_limit: INVALID_POS,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the value associated with a key (if it exists) given its hash.
|
||||
pub fn get_with_hash(&self, key: &K, hash: u64) -> Option<&V> {
|
||||
let mut next = self.dictionary[hash as usize % self.dictionary.len()];
|
||||
loop {
|
||||
if next == INVALID_POS {
|
||||
return None;
|
||||
}
|
||||
|
||||
let bucket = &self.buckets[next as usize];
|
||||
let (bucket_key, bucket_value) = bucket.inner.as_ref().expect("entry is in use");
|
||||
if bucket_key == key {
|
||||
return Some(bucket_value);
|
||||
}
|
||||
next = bucket.next;
|
||||
}
|
||||
}
|
||||
|
||||
/// Get number of buckets in map.
|
||||
pub fn get_num_buckets(&self) -> usize {
|
||||
self.buckets.len()
|
||||
}
|
||||
|
||||
/// Clears all entries from the hashmap.
|
||||
///
|
||||
/// Does not reset any allocation limits, but does clear any entries beyond them.
|
||||
pub fn clear(&mut self) {
|
||||
for i in 0..self.buckets.len() {
|
||||
self.buckets[i] = Bucket {
|
||||
next: if i < self.buckets.len() - 1 {
|
||||
i as u32 + 1
|
||||
} else {
|
||||
INVALID_POS
|
||||
},
|
||||
inner: None,
|
||||
}
|
||||
}
|
||||
for i in 0..self.dictionary.len() {
|
||||
self.dictionary[i] = INVALID_POS;
|
||||
}
|
||||
|
||||
self.free_head = 0;
|
||||
self.buckets_in_use = 0;
|
||||
}
|
||||
|
||||
/// Find the position of an unused bucket via the freelist and initialize it.
|
||||
pub(crate) fn alloc_bucket(&mut self, key: K, value: V) -> Result<u32, FullError> {
|
||||
let mut pos = self.free_head;
|
||||
|
||||
// Find the first bucket we're *allowed* to use.
|
||||
let mut prev = PrevPos::First(self.free_head);
|
||||
while pos != INVALID_POS && pos >= self.alloc_limit {
|
||||
let bucket = &mut self.buckets[pos as usize];
|
||||
prev = PrevPos::Chained(pos);
|
||||
pos = bucket.next;
|
||||
}
|
||||
if pos == INVALID_POS {
|
||||
return Err(FullError);
|
||||
}
|
||||
|
||||
// Repair the freelist.
|
||||
match prev {
|
||||
PrevPos::First(_) => {
|
||||
let next_pos = self.buckets[pos as usize].next;
|
||||
self.free_head = next_pos;
|
||||
}
|
||||
PrevPos::Chained(p) => {
|
||||
if p != INVALID_POS {
|
||||
let next_pos = self.buckets[pos as usize].next;
|
||||
self.buckets[p as usize].next = next_pos;
|
||||
}
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
|
||||
// Initialize the bucket.
|
||||
let bucket = &mut self.buckets[pos as usize];
|
||||
self.buckets_in_use += 1;
|
||||
bucket.next = INVALID_POS;
|
||||
bucket.inner = Some((key, value));
|
||||
|
||||
Ok(pos)
|
||||
}
|
||||
}
|
||||
130
libs/neon-shmem/src/hash/entry.rs
Normal file
130
libs/neon-shmem/src/hash/entry.rs
Normal file
@@ -0,0 +1,130 @@
|
||||
//! Equivalent of [`std::collections::hash_map::Entry`] for this hashmap.
|
||||
|
||||
use crate::hash::core::{CoreHashMap, FullError, INVALID_POS};
|
||||
use crate::sync::{RwLockWriteGuard, ValueWriteGuard};
|
||||
|
||||
use std::hash::Hash;
|
||||
use std::mem;
|
||||
|
||||
pub enum Entry<'a, 'b, K, V> {
|
||||
Occupied(OccupiedEntry<'a, 'b, K, V>),
|
||||
Vacant(VacantEntry<'a, 'b, K, V>),
|
||||
}
|
||||
|
||||
/// Describes what precedes a bucket within its chain.
#[derive(Clone, Copy)]
pub(crate) enum PrevPos {
    /// The bucket is first in its chain; holds the starting index within the dictionary.
    First(u32),
    /// The bucket follows another bucket; holds that bucket's index.
    Chained(u32),
    /// The predecessor is unknown - e.g. the associated entry was retrieved by index
    /// instead of by following a chain. Holds the hash used to locate the chain later.
    Unknown(u64),
}
|
||||
|
||||
pub struct OccupiedEntry<'a, 'b, K, V> {
|
||||
/// Mutable reference to the map containing this entry.
|
||||
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
|
||||
/// The key of the occupied entry
|
||||
pub(crate) _key: K,
|
||||
/// The index of the previous entry in the chain.
|
||||
pub(crate) prev_pos: PrevPos,
|
||||
/// The position of the bucket in the [`CoreHashMap`] bucket array.
|
||||
pub(crate) bucket_pos: u32,
|
||||
}
|
||||
|
||||
impl<K, V> OccupiedEntry<'_, '_, K, V> {
|
||||
/// Borrow the value stored in this entry.
pub fn get(&self) -> &V {
    let bucket = &self.map.buckets[self.bucket_pos as usize];
    // An occupied entry always refers to a filled bucket.
    let (_, value) = bucket.inner.as_ref().unwrap();
    value
}
|
||||
|
||||
/// Mutably borrow the value stored in this entry.
pub fn get_mut(&mut self) -> &mut V {
    let bucket = &mut self.map.buckets[self.bucket_pos as usize];
    // An occupied entry always refers to a filled bucket.
    let (_, value) = bucket.inner.as_mut().unwrap();
    value
}
|
||||
|
||||
/// Inserts a value into the entry, replacing (and returning) the existing value.
|
||||
pub fn insert(&mut self, value: V) -> V {
|
||||
let bucket = &mut self.map.buckets[self.bucket_pos as usize];
|
||||
// This assumes inner is Some, which it must be for an OccupiedEntry
|
||||
mem::replace(&mut bucket.inner.as_mut().unwrap().1, value)
|
||||
}
|
||||
|
||||
/// Removes the entry from the hash map, returning the value originally stored within it.
|
||||
///
|
||||
/// This may result in multiple bucket accesses if the entry was obtained by index as the
|
||||
/// previous chain entry needs to be discovered in this case.
|
||||
pub fn remove(mut self) -> V {
|
||||
// If this bucket was queried by index, go ahead and follow its chain from the start.
|
||||
let prev = if let PrevPos::Unknown(hash) = self.prev_pos {
|
||||
let dict_idx = hash as usize % self.map.dictionary.len();
|
||||
let mut prev = PrevPos::First(dict_idx as u32);
|
||||
let mut curr = self.map.dictionary[dict_idx];
|
||||
while curr != self.bucket_pos {
|
||||
assert!(curr != INVALID_POS);
|
||||
prev = PrevPos::Chained(curr);
|
||||
curr = self.map.buckets[curr as usize].next;
|
||||
}
|
||||
prev
|
||||
} else {
|
||||
self.prev_pos
|
||||
};
|
||||
|
||||
// CoreHashMap::remove returns Option<(K, V)>. We know it's Some for an OccupiedEntry.
|
||||
let bucket = &mut self.map.buckets[self.bucket_pos as usize];
|
||||
|
||||
// unlink it from the chain
|
||||
match prev {
|
||||
PrevPos::First(dict_pos) => {
|
||||
self.map.dictionary[dict_pos as usize] = bucket.next;
|
||||
}
|
||||
PrevPos::Chained(bucket_pos) => {
|
||||
self.map.buckets[bucket_pos as usize].next = bucket.next;
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
|
||||
// and add it to the freelist
|
||||
let free = self.map.free_head;
|
||||
let bucket = &mut self.map.buckets[self.bucket_pos as usize];
|
||||
let old_value = bucket.inner.take();
|
||||
bucket.next = free;
|
||||
self.map.free_head = self.bucket_pos;
|
||||
self.map.buckets_in_use -= 1;
|
||||
|
||||
old_value.unwrap().1
|
||||
}
|
||||
}
|
||||
|
||||
/// An abstract view into a vacant entry within the map.
|
||||
pub struct VacantEntry<'a, 'b, K, V> {
|
||||
/// Mutable reference to the map containing this entry.
|
||||
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
|
||||
/// The key to be inserted into this entry.
|
||||
pub(crate) key: K,
|
||||
/// The position within the dictionary corresponding to the key's hash.
|
||||
pub(crate) dict_pos: u32,
|
||||
}
|
||||
|
||||
impl<'b, K: Clone + Hash + Eq, V> VacantEntry<'_, 'b, K, V> {
|
||||
/// Insert a value into the vacant entry, finding and populating an empty bucket in the process.
|
||||
///
|
||||
/// # Errors
|
||||
/// Will return [`FullError`] if there are no unoccupied buckets in the map.
|
||||
pub fn insert(mut self, value: V) -> Result<ValueWriteGuard<'b, V>, FullError> {
|
||||
let pos = self.map.alloc_bucket(self.key, value)?;
|
||||
self.map.buckets[pos as usize].next = self.map.dictionary[self.dict_pos as usize];
|
||||
self.map.dictionary[self.dict_pos as usize] = pos;
|
||||
|
||||
Ok(RwLockWriteGuard::map(self.map, |m| {
|
||||
&mut m.buckets[pos as usize].inner.as_mut().unwrap().1
|
||||
}))
|
||||
}
|
||||
}
|
||||
428
libs/neon-shmem/src/hash/tests.rs
Normal file
428
libs/neon-shmem/src/hash/tests.rs
Normal file
@@ -0,0 +1,428 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::Debug;
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
use crate::hash::Entry;
|
||||
use crate::hash::HashMapAccess;
|
||||
use crate::hash::HashMapInit;
|
||||
use crate::hash::core::FullError;
|
||||
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::{Rng, RngCore};
|
||||
use rand_distr::Zipf;
|
||||
|
||||
/// Width in bytes of the keys used by the tests.
const TEST_KEY_LEN: usize = 16;

/// Fixed-width test key; converts to and from `u128` using big-endian byte order.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
struct TestKey([u8; TEST_KEY_LEN]);

impl From<&TestKey> for u128 {
    fn from(key: &TestKey) -> Self {
        Self::from_be_bytes(key.0)
    }
}

impl From<u128> for TestKey {
    fn from(raw: u128) -> Self {
        Self(raw.to_be_bytes())
    }
}

impl From<&[u8]> for TestKey {
    /// Panics unless `bytes` is exactly `TEST_KEY_LEN` bytes long.
    fn from(bytes: &[u8]) -> Self {
        let array: [u8; TEST_KEY_LEN] = bytes.try_into().unwrap();
        Self(array)
    }
}
|
||||
|
||||
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
|
||||
let w = HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_inserts")
|
||||
.attach_writer();
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let res = w.entry((*k).into());
|
||||
match res {
|
||||
Entry::Occupied(mut e) => {
|
||||
e.insert(idx);
|
||||
}
|
||||
Entry::Vacant(e) => {
|
||||
let res = e.insert(idx);
|
||||
assert!(res.is_ok());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let x = w.get(&(*k).into());
|
||||
let value = x.as_deref().copied();
|
||||
assert_eq!(value, Some(idx));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn dense() {
    // A handful of small keys plus one outlier.
    let few: &[u128] = &[0, 1, 2, 3, 256];
    test_inserts(few);

    // Dense keys
    let mut keys: Vec<u128> = (0..10000).collect();
    test_inserts(&keys);

    // The same keys again, in nine different random orders.
    let mut rng = rand::rng();
    for _ in 0..9 {
        keys.shuffle(&mut rng);
        test_inserts(&keys);
    }
}
|
||||
|
||||
#[test]
fn sparse() {
    // 10000 distinct keys drawn uniformly from the whole u128 range.
    let mut keys: Vec<TestKey> = Vec::new();
    let mut used_keys = HashSet::new();
    while keys.len() < 10000 {
        let key = rand::random::<u128>();
        // `insert` returns false for a key we already drew; just draw again.
        if used_keys.insert(key) {
            keys.push(key.into());
        }
    }
    test_inserts(&keys);
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct TestOp(TestKey, Option<usize>);
|
||||
|
||||
fn apply_op(
|
||||
op: &TestOp,
|
||||
map: &mut HashMapAccess<TestKey, usize>,
|
||||
shadow: &mut BTreeMap<TestKey, usize>,
|
||||
) {
|
||||
// apply the change to the shadow tree first
|
||||
let shadow_existing = if let Some(v) = op.1 {
|
||||
shadow.insert(op.0, v)
|
||||
} else {
|
||||
shadow.remove(&op.0)
|
||||
};
|
||||
|
||||
let entry = map.entry(op.0);
|
||||
let hash_existing = match op.1 {
|
||||
Some(new) => match entry {
|
||||
Entry::Occupied(mut e) => Some(e.insert(new)),
|
||||
Entry::Vacant(e) => {
|
||||
_ = e.insert(new).unwrap();
|
||||
None
|
||||
}
|
||||
},
|
||||
None => match entry {
|
||||
Entry::Occupied(e) => Some(e.remove()),
|
||||
Entry::Vacant(_) => None,
|
||||
},
|
||||
};
|
||||
|
||||
assert_eq!(shadow_existing, hash_existing);
|
||||
}
|
||||
|
||||
fn do_random_ops(
|
||||
num_ops: usize,
|
||||
size: u32,
|
||||
del_prob: f64,
|
||||
writer: &mut HashMapAccess<TestKey, usize>,
|
||||
shadow: &mut BTreeMap<TestKey, usize>,
|
||||
rng: &mut rand::rngs::ThreadRng,
|
||||
) {
|
||||
for i in 0..num_ops {
|
||||
let key: TestKey = ((rng.next_u32() % size) as u128).into();
|
||||
let op = TestOp(
|
||||
key,
|
||||
if rng.random_bool(del_prob) {
|
||||
Some(i)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
);
|
||||
apply_op(&op, writer, shadow);
|
||||
}
|
||||
}
|
||||
|
||||
fn do_deletes(
|
||||
num_ops: usize,
|
||||
writer: &mut HashMapAccess<TestKey, usize>,
|
||||
shadow: &mut BTreeMap<TestKey, usize>,
|
||||
) {
|
||||
for _ in 0..num_ops {
|
||||
let (k, _) = shadow.pop_first().unwrap();
|
||||
writer.remove(&k);
|
||||
}
|
||||
}
|
||||
|
||||
fn do_shrink(
|
||||
writer: &mut HashMapAccess<TestKey, usize>,
|
||||
shadow: &mut BTreeMap<TestKey, usize>,
|
||||
from: u32,
|
||||
to: u32,
|
||||
) {
|
||||
assert!(writer.shrink_goal().is_none());
|
||||
writer.begin_shrink(to);
|
||||
assert_eq!(writer.shrink_goal(), Some(to as usize));
|
||||
for i in to..from {
|
||||
if let Some(entry) = writer.entry_at_bucket(i as usize) {
|
||||
shadow.remove(&entry._key);
|
||||
entry.remove();
|
||||
}
|
||||
}
|
||||
let old_usage = writer.get_num_buckets_in_use();
|
||||
writer.finish_shrink().unwrap();
|
||||
assert!(writer.shrink_goal().is_none());
|
||||
assert_eq!(writer.get_num_buckets_in_use(), old_usage);
|
||||
}
|
||||
|
||||
#[test]
fn random_ops() {
    let mut writer =
        HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_random")
            .attach_writer();
    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();

    // Zipf-distributed keys; 75% of the operations store a value, the rest remove the key.
    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
    let mut rng = rand::rng();
    for i in 0..100000 {
        let raw_key = rng.sample(distribution) as u128;
        let key: TestKey = raw_key.into();
        let new_value = if rng.random_bool(0.75) { Some(i) } else { None };

        apply_op(&TestOp(key, new_value), &mut writer, &mut shadow);
    }
}
|
||||
|
||||
#[test]
fn test_shuffle() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_shuf")
        .attach_writer();
    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    // The map has to keep agreeing with the shadow map across an in-place rehash.
    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
    writer.shuffle();
    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
}
|
||||
|
||||
#[test]
fn test_grow() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 2000, "test_grow")
        .attach_writer();
    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);

    // Growing changes the bucket count but not the number of entries.
    let usage_before = writer.get_num_buckets_in_use();
    writer.grow(1500).unwrap();
    assert_eq!(writer.get_num_buckets_in_use(), usage_before);
    assert_eq!(writer.get_num_buckets(), 1500);

    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
}
|
||||
|
||||
#[test]
fn test_clear() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_clear")
        .attach_writer();
    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    // Clearing empties the map without changing its bucket count.
    do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
    writer.clear();
    assert_eq!(writer.get_num_buckets_in_use(), 0);
    assert_eq!(writer.get_num_buckets(), 1500);
    while let Some((key, _)) = shadow.pop_first() {
        assert!(writer.get(&key).is_none());
    }

    // Fill the map to the brim with keys outside the random range; one more must fail.
    do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
    for i in 0..(1500 - writer.get_num_buckets_in_use()) {
        writer.insert((1500 + i as u128).into(), 0).unwrap();
    }
    assert_eq!(writer.insert(5000.into(), 0), Err(FullError {}));

    // After another clear there is room again.
    writer.clear();
    assert!(writer.insert(5000.into(), 0).is_ok());
}
|
||||
|
||||
#[test]
fn test_idx_remove() {
    // Give the shared memory area this test's own name rather than the copy-pasted
    // "test_clear", so it can be told apart when debugging.
    let mut writer =
        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_idx_remove")
            .attach_writer();
    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();
    do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);

    // Remove whatever lives in 100 randomly chosen buckets, by bucket index.
    for _ in 0..100 {
        let idx = (rng.next_u32() % 1500) as usize;
        if let Some(e) = writer.entry_at_bucket(idx) {
            shadow.remove(&e._key);
            e.remove();
        }
    }

    // Everything that was not removed must still be readable with the right value.
    while let Some((key, val)) = shadow.pop_first() {
        assert_eq!(*writer.get(&key).unwrap(), val);
    }
}
|
||||
|
||||
#[test]
fn test_idx_get() {
    // Give the shared memory area this test's own name rather than the copy-pasted
    // "test_clear", so it can be told apart when debugging.
    let mut writer =
        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_idx_get")
            .attach_writer();
    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();
    do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);

    // For 100 random buckets: a pointer to the stored value must map back to the bucket
    // index it was fetched from. (The original repeated this identical check twice.)
    for _ in 0..100 {
        let idx = (rng.next_u32() % 1500) as usize;
        if let Some(pair) = writer.get_at_bucket(idx) {
            let v: *const usize = &pair.1;
            assert_eq!(writer.get_bucket_for_value(v), idx);
        }
    }
}
|
||||
|
||||
#[test]
fn test_shrink() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink")
        .attach_writer();
    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);

    do_shrink(&mut writer, &mut shadow, 1500, 1000);
    assert_eq!(writer.get_num_buckets(), 1000);

    // Make room, then keep operating on a key range that fits the smaller map.
    do_deletes(500, &mut writer, &mut shadow);
    do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng);
    assert!(writer.get_num_buckets_in_use() <= 1000);
}
|
||||
|
||||
#[test]
fn test_shrink_grow_seq() {
    let mut writer =
        HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 20000, "test_grow_seq")
            .attach_writer();
    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    // Alternate shrinks and grows, with random operations in between each step.
    do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng);

    eprintln!("Shrinking to 750");
    do_shrink(&mut writer, &mut shadow, 1000, 750);
    do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng);

    eprintln!("Growing to 1500");
    writer.grow(1500).unwrap();
    do_random_ops(600, 1500, 0.1, &mut writer, &mut shadow, &mut rng);

    eprintln!("Shrinking to 200");
    // Trim the contents down to 100 entries before shrinking.
    while shadow.len() > 100 {
        do_deletes(1, &mut writer, &mut shadow);
    }
    do_shrink(&mut writer, &mut shadow, 1500, 200);
    do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng);

    eprintln!("Growing to 10k");
    writer.grow(10000).unwrap();
    do_random_ops(10000, 5000, 0.25, &mut writer, &mut shadow, &mut rng);
}
|
||||
|
||||
#[test]
fn test_bucket_ops() {
    let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_bucket_ops")
        .attach_writer();

    // Store 2 under key 1, whether or not the key already exists.
    let entry = writer.entry(1.into());
    match entry {
        Entry::Occupied(mut occupied) => {
            occupied.insert(2);
        }
        Entry::Vacant(vacant) => {
            _ = vacant.insert(2).unwrap();
        }
    }
    assert_eq!(writer.get_num_buckets_in_use(), 1);
    assert_eq!(writer.get_num_buckets(), 1000);
    assert_eq!(*writer.get(&1.into()).unwrap(), 2);

    // Look the key up again to learn which bucket it landed in.
    let pos = match writer.entry(1.into()) {
        Entry::Vacant(_) => {
            panic!("Insert didn't affect entry");
        }
        Entry::Occupied(occupied) => {
            assert_eq!(occupied._key, 1.into());
            occupied.bucket_pos as usize
        }
    };

    // The by-index accessors must agree with the by-key ones.
    assert_eq!(writer.entry_at_bucket(pos).unwrap()._key, 1.into());
    assert_eq!(*writer.get_at_bucket(pos).unwrap(), (1.into(), 2));
    {
        let ptr: *const usize = &*writer.get(&1.into()).unwrap();
        assert_eq!(writer.get_bucket_for_value(ptr), pos);
    }

    writer.remove(&1.into());
    assert!(writer.get(&1.into()).is_none());
}
|
||||
|
||||
#[test]
fn test_shrink_zero() {
    let mut writer =
        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink_zero")
            .attach_writer();

    // Shrink all the way down to zero buckets.
    writer.begin_shrink(0);
    for i in 0..1500 {
        if let Some(entry) = writer.entry_at_bucket(i) {
            entry.remove();
        }
    }
    writer.finish_shrink().unwrap();
    assert_eq!(writer.get_num_buckets_in_use(), 0);

    // With no buckets left, inserting has to fail.
    let entry = writer.entry(1.into());
    match entry {
        Entry::Vacant(v) => assert!(v.insert(2).is_err()),
        Entry::Occupied(_) => panic!("Somehow got non-vacant entry in empty map."),
    }

    // After growing again, the same insert succeeds.
    writer.grow(50).unwrap();
    let entry = writer.entry(1.into());
    match entry {
        Entry::Vacant(v) => assert!(v.insert(2).is_ok()),
        Entry::Occupied(_) => panic!("Somehow got non-vacant entry in empty map."),
    }
    assert_eq!(writer.get_num_buckets_in_use(), 1);
}
|
||||
|
||||
#[test]
#[should_panic]
fn test_grow_oom() {
    // 20000 buckets is far beyond what this map was created to hold (max 2000), so the
    // grow is expected to end in a panic.
    let init = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_grow_oom");
    let writer = init.attach_writer();
    writer.grow(20000).unwrap();
}
|
||||
|
||||
#[test]
#[should_panic]
fn test_shrink_bigger() {
    // A shrink target above the current bucket count (1500) must panic.
    let init =
        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_bigger");
    let mut writer = init.attach_writer();
    writer.begin_shrink(2000);
}
|
||||
|
||||
#[test]
#[should_panic]
fn test_shrink_early_finish() {
    // Finishing a shrink that was never begun must panic.
    let init = HashMapInit::<TestKey, usize>::new_resizeable_named(
        1500,
        2500,
        "test_shrink_early_finish",
    );
    let writer = init.attach_writer();
    writer.finish_shrink().unwrap();
}
|
||||
|
||||
#[test]
#[should_panic]
fn test_shrink_fixed_size() {
    // Maps built over a fixed caller-provided area cannot be shrunk.
    let mut area = [MaybeUninit::uninit(); 10000];
    let mut writer = HashMapInit::<TestKey, usize>::with_fixed(3, &mut area).attach_writer();
    writer.begin_shrink(1);
}
|
||||
@@ -1,418 +1,3 @@
|
||||
//! Shared memory utilities for neon communicator
|
||||
|
||||
use std::num::NonZeroUsize;
|
||||
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::mman::MapFlags;
|
||||
use nix::sys::mman::ProtFlags;
|
||||
use nix::sys::mman::mmap as nix_mmap;
|
||||
use nix::sys::mman::munmap as nix_munmap;
|
||||
use nix::unistd::ftruncate as nix_ftruncate;
|
||||
|
||||
/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
|
||||
/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
|
||||
/// specified at creation.
|
||||
///
|
||||
/// The area is backed by an anonymous file created with memfd_create(). The full address space for
|
||||
/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
|
||||
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
|
||||
/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
|
||||
/// future.
|
||||
pub struct ShmemHandle {
|
||||
/// memfd file descriptor
|
||||
fd: OwnedFd,
|
||||
|
||||
max_size: usize,
|
||||
|
||||
// Pointer to the beginning of the shared memory area. The header is stored there.
|
||||
shared_ptr: NonNull<SharedStruct>,
|
||||
|
||||
// Pointer to the beginning of the user data
|
||||
pub data_ptr: NonNull<u8>,
|
||||
}
|
||||
|
||||
/// Header that lives at the very start of the shared memory area.
struct SharedStruct {
    /// Size of the mapping, header included.
    max_size: usize,

    /// Current size of the backing file. The high-order bit is used for the
    /// RESIZE_IN_PROGRESS flag
    current_size: AtomicUsize,
}

/// Flag bit stored in the high-order bit of `SharedStruct::current_size`.
const RESIZE_IN_PROGRESS: usize = 1 << 63;

/// Number of bytes the header occupies in front of the user data.
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
|
||||
|
||||
/// Error type returned by the ShmemHandle functions.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("{msg}: {errno}")]
|
||||
pub struct Error {
|
||||
pub msg: String,
|
||||
pub errno: Errno,
|
||||
}
|
||||
|
||||
impl Error {
|
||||
fn new(msg: &str, errno: Errno) -> Error {
|
||||
Error {
|
||||
msg: msg.to_string(),
|
||||
errno,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShmemHandle {
|
||||
/// Create a new shared memory area. To communicate between processes, the processes need to be
|
||||
/// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
|
||||
///
|
||||
/// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
|
||||
/// processes can continue using it, however.
|
||||
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
|
||||
// create the backing anonymous file.
|
||||
let fd = create_backing_file(name)?;
|
||||
|
||||
Self::new_with_fd(fd, initial_size, max_size)
|
||||
}
|
||||
|
||||
fn new_with_fd(
|
||||
fd: OwnedFd,
|
||||
initial_size: usize,
|
||||
max_size: usize,
|
||||
) -> Result<ShmemHandle, Error> {
|
||||
// We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
|
||||
// is a little larger than this because of the SharedStruct header. Make the upper limit
|
||||
// somewhat smaller than that, because with anything close to that, you'll run out of
|
||||
// memory anyway.
|
||||
if max_size >= 1 << 48 {
|
||||
panic!("max size {max_size} too large");
|
||||
}
|
||||
if initial_size > max_size {
|
||||
panic!("initial size {initial_size} larger than max size {max_size}");
|
||||
}
|
||||
|
||||
// The actual initial / max size is the one given by the caller, plus the size of
|
||||
// 'SharedStruct'.
|
||||
let initial_size = HEADER_SIZE + initial_size;
|
||||
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
|
||||
|
||||
// Reserve address space for it with mmap
|
||||
//
|
||||
// TODO: Use MAP_HUGETLB if possible
|
||||
let start_ptr = unsafe {
|
||||
nix_mmap(
|
||||
None,
|
||||
max_size,
|
||||
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
|
||||
MapFlags::MAP_SHARED,
|
||||
&fd,
|
||||
0,
|
||||
)
|
||||
}
|
||||
.map_err(|e| Error::new("mmap failed: {e}", e))?;
|
||||
|
||||
// Reserve space for the initial size
|
||||
enlarge_file(fd.as_fd(), initial_size as u64)?;
|
||||
|
||||
// Initialize the header
|
||||
let shared: NonNull<SharedStruct> = start_ptr.cast();
|
||||
unsafe {
|
||||
shared.write(SharedStruct {
|
||||
max_size: max_size.into(),
|
||||
current_size: AtomicUsize::new(initial_size),
|
||||
})
|
||||
};
|
||||
|
||||
// The user data begins after the header
|
||||
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
|
||||
|
||||
Ok(ShmemHandle {
|
||||
fd,
|
||||
max_size: max_size.into(),
|
||||
shared_ptr: shared,
|
||||
data_ptr,
|
||||
})
|
||||
}
|
||||
|
||||
// return reference to the header
|
||||
fn shared(&self) -> &SharedStruct {
|
||||
unsafe { self.shared_ptr.as_ref() }
|
||||
}
|
||||
|
||||
/// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
|
||||
/// when creating the area.
|
||||
///
|
||||
/// This may only be called from one process/thread concurrently. We detect that case
|
||||
/// and return an Error.
|
||||
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
|
||||
let new_size = new_size + HEADER_SIZE;
|
||||
let shared = self.shared();
|
||||
|
||||
if new_size > self.max_size {
|
||||
panic!(
|
||||
"new size ({} is greater than max size ({})",
|
||||
new_size, self.max_size
|
||||
);
|
||||
}
|
||||
assert_eq!(self.max_size, shared.max_size);
|
||||
|
||||
// Lock the area by setting the bit in 'current_size'
|
||||
//
|
||||
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
|
||||
// and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
|
||||
// since this is not performance-critical, better safe than sorry .
|
||||
let mut old_size = shared.current_size.load(Ordering::Acquire);
|
||||
loop {
|
||||
if (old_size & RESIZE_IN_PROGRESS) != 0 {
|
||||
return Err(Error::new(
|
||||
"concurrent resize detected",
|
||||
Errno::UnknownErrno,
|
||||
));
|
||||
}
|
||||
match shared.current_size.compare_exchange(
|
||||
old_size,
|
||||
new_size,
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => break,
|
||||
Err(x) => old_size = x,
|
||||
}
|
||||
}
|
||||
|
||||
// Ok, we got the lock.
|
||||
//
|
||||
// NB: If anything goes wrong, we *must* clear the bit!
|
||||
let result = {
|
||||
use std::cmp::Ordering::{Equal, Greater, Less};
|
||||
match new_size.cmp(&old_size) {
|
||||
Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
|
||||
Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
|
||||
}),
|
||||
Equal => Ok(()),
|
||||
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
|
||||
}
|
||||
};
|
||||
|
||||
// Unlock
|
||||
shared.current_size.store(
|
||||
if result.is_ok() { new_size } else { old_size },
|
||||
Ordering::Release,
|
||||
);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Returns the current user-visible size of the shared memory segment.
|
||||
///
|
||||
/// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
|
||||
/// responsibility not to access the area beyond the current size.
|
||||
pub fn current_size(&self) -> usize {
|
||||
let total_current_size =
|
||||
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
|
||||
total_current_size - HEADER_SIZE
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ShmemHandle {
|
||||
fn drop(&mut self) {
|
||||
// SAFETY: The pointer was obtained from mmap() with the given size.
|
||||
// We unmap the entire region.
|
||||
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
|
||||
// The fd is dropped automatically by OwnedFd.
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
|
||||
/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
|
||||
/// development and testing, but in production we want the file to stay in memory.
|
||||
///
|
||||
/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
|
||||
#[allow(unused_variables)]
|
||||
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
|
||||
.map_err(|e| Error::new("memfd_create failed: {e}", e))
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
let file = tempfile::tempfile().map_err(|e| {
|
||||
Error::new(
|
||||
"could not create temporary file to back shmem area: {e}",
|
||||
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
|
||||
)
|
||||
})?;
|
||||
Ok(OwnedFd::from(file))
|
||||
}
|
||||
}
|
||||
|
||||
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
|
||||
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
|
||||
// we don't get a segfault later when trying to actually use it.
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
|
||||
Error::new(
|
||||
"could not grow shmem segment, posix_fallocate failed: {e}",
|
||||
e,
|
||||
)
|
||||
})
|
||||
}
|
||||
// As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
nix::unistd::ftruncate(fd, size as i64)
|
||||
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    use nix::unistd::ForkResult;
    use std::ops::Range;

    /// check that all bytes in given range have the expected value.
    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
        for i in range {
            // SAFETY: the tests only pass ranges inside the area they have sized beforehand.
            let b = unsafe { *(ptr.add(i)) };
            assert_eq!(expected, b, "unexpected byte at offset {i}");
        }
    }

    /// Write 'b' to all bytes in the given range
    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
        // SAFETY: the tests only pass ranges inside the area they have sized beforehand.
        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
    }

    // simple single-process test of growing and shrinking
    #[test]
    fn test_shmem_resize() -> Result<(), Error> {
        let max_size = 1024 * 1024;
        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;

        assert_eq!(init_struct.current_size(), 0);

        // Initial grow
        let size1 = 10000;
        init_struct.set_size(size1).unwrap();
        assert_eq!(init_struct.current_size(), size1);

        // Write some data
        let data_ptr = init_struct.data_ptr.as_ptr();
        write_range(data_ptr, 0xAA, 0..size1);
        assert_range(data_ptr, 0xAA, 0..size1);

        // Shrink
        let size2 = 5000;
        init_struct.set_size(size2).unwrap();
        assert_eq!(init_struct.current_size(), size2);

        // Grow again
        let size3 = 20000;
        init_struct.set_size(size3).unwrap();
        assert_eq!(init_struct.current_size(), size3);

        // Try to read it. The area that was shrunk and grown again should read as all zeros now
        assert_range(data_ptr, 0xAA, 0..5000);
        assert_range(data_ptr, 0, 5000..size1);

        // Growing beyond max_size is not exercised here: set_size() panics in that case
        // instead of returning an error.

        // Dropping init_struct should unmap the memory
        drop(init_struct);

        Ok(())
    }

    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
    /// but is stored in the shared memory area and works across processes. It's implemented by
    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
    struct SimpleBarrier {
        num_procs: usize,
        count: AtomicUsize,
    }

    impl SimpleBarrier {
        /// Initializes a barrier for `num_procs` participants at `ptr`.
        ///
        /// # Safety
        ///
        /// `ptr` must be aligned for `SimpleBarrier` and valid for writes.
        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
            // `write` initializes the memory without treating its previous contents as a
            // live value.
            unsafe {
                ptr.write(SimpleBarrier {
                    num_procs,
                    count: AtomicUsize::new(0),
                });
            }
        }

        /// Blocks until `num_procs` participants have called `wait` for this generation.
        pub fn wait(&self) {
            // The barrier is used to order plain memory writes made by one process before
            // reads made by the other. Arriving with Release and observing the arrivals
            // with Acquire provides that ordering; Relaxed would not.
            let old = self.count.fetch_add(1, Ordering::AcqRel);

            let generation = old / self.num_procs;

            let mut current = old + 1;
            while current < (generation + 1) * self.num_procs {
                std::thread::sleep(std::time::Duration::from_millis(10));
                current = self.count.load(Ordering::Acquire);
            }
        }
    }

    #[test]
    fn test_multi_process() {
        // Initialize
        let max_size = 1_000_000_000_000;
        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
        let ptr = init_struct.data_ptr.as_ptr();

        // Store the SimpleBarrier in the first 1k of the area.
        init_struct.set_size(10000).unwrap();
        let barrier_ptr: *mut SimpleBarrier = unsafe {
            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
                .cast()
        };
        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };

        // Fork another test process. The code after this runs in both processes concurrently.
        let fork_result = unsafe { nix::unistd::fork().unwrap() };

        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
        if fork_result.is_parent() {
            write_range(ptr, 0xAA, 1000..2000);
        } else {
            write_range(ptr, 0xBB, 2000..3000);
        }
        barrier.wait();
        // Verify the contents. (in both processes)
        assert_range(ptr, 0xAA, 1000..2000);
        assert_range(ptr, 0xBB, 2000..3000);

        // Grow, from the child this time
        let size = 10_000_000;
        if !fork_result.is_parent() {
            init_struct.set_size(size).unwrap();
        }
        barrier.wait();

        // make some writes at the end
        if fork_result.is_parent() {
            write_range(ptr, 0xAA, (size - 10)..size);
        } else {
            write_range(ptr, 0xBB, (size - 20)..(size - 10));
        }
        barrier.wait();

        // Verify the contents. (This runs in both processes)
        assert_range(ptr, 0, (size - 1000)..(size - 20));
        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
        assert_range(ptr, 0xAA, (size - 10)..size);

        if let ForkResult::Parent { child } = fork_result {
            nix::sys::wait::waitpid(child, None).unwrap();
        }
    }
}
|
||||
pub mod hash;
|
||||
pub mod shmem;
|
||||
pub mod sync;
|
||||
|
||||
409
libs/neon-shmem/src/shmem.rs
Normal file
409
libs/neon-shmem/src/shmem.rs
Normal file
@@ -0,0 +1,409 @@
|
||||
//! Dynamically resizable contiguous chunk of shared memory
|
||||
|
||||
use std::num::NonZeroUsize;
|
||||
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::mman::MapFlags;
|
||||
use nix::sys::mman::ProtFlags;
|
||||
use nix::sys::mman::mmap as nix_mmap;
|
||||
use nix::sys::mman::munmap as nix_munmap;
|
||||
use nix::unistd::ftruncate as nix_ftruncate;
|
||||
|
||||
/// `ShmemHandle` represents a shared memory area that can be shared by processes over `fork()`.
|
||||
/// Unlike shared memory allocated by Postgres, this area is resizable, up to `max_size` that's
|
||||
/// specified at creation.
|
||||
///
|
||||
/// The area is backed by an anonymous file created with `memfd_create()`. The full address space for
|
||||
/// `max_size` is reserved up-front with `mmap()`, but whenever you call [`ShmemHandle::set_size`],
|
||||
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
|
||||
/// will cause the file to be expanded, but we might use `mprotect()` etc. to enforce that in the
|
||||
/// future.
|
||||
pub struct ShmemHandle {
|
||||
/// memfd file descriptor
|
||||
fd: OwnedFd,
|
||||
|
||||
max_size: usize,
|
||||
|
||||
// Pointer to the beginning of the shared memory area. The header is stored there.
|
||||
shared_ptr: NonNull<SharedStruct>,
|
||||
|
||||
// Pointer to the beginning of the user data
|
||||
pub data_ptr: NonNull<u8>,
|
||||
}
|
||||
|
||||
/// Header kept at the very beginning of the shared memory area, in front of the user data.
struct SharedStruct {
    /// Size of the whole mapping in bytes, header included.
    max_size: usize,

    /// Current size of the backing file in bytes, header included. The high-order bit is
    /// used for the [`RESIZE_IN_PROGRESS`] flag.
    current_size: AtomicUsize,
}
|
||||
|
||||
const RESIZE_IN_PROGRESS: usize = 1 << 63;
|
||||
|
||||
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
|
||||
|
||||
/// Error type returned by the [`ShmemHandle`] functions.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("{msg}: {errno}")]
|
||||
pub struct Error {
|
||||
pub msg: String,
|
||||
pub errno: Errno,
|
||||
}
|
||||
|
||||
impl Error {
|
||||
fn new(msg: &str, errno: Errno) -> Self {
|
||||
Self {
|
||||
msg: msg.to_string(),
|
||||
errno,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShmemHandle {
|
||||
/// Create a new shared memory area. To communicate between processes, the processes need to be
|
||||
/// `fork()`'d after calling this, so that the `ShmemHandle` is inherited by all processes.
|
||||
///
|
||||
/// If the `ShmemHandle` is dropped, the memory is unmapped from the current process. Other
|
||||
/// processes can continue using it, however.
|
||||
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<Self, Error> {
|
||||
// create the backing anonymous file.
|
||||
let fd = create_backing_file(name)?;
|
||||
|
||||
Self::new_with_fd(fd, initial_size, max_size)
|
||||
}
|
||||
|
||||
fn new_with_fd(fd: OwnedFd, initial_size: usize, max_size: usize) -> Result<Self, Error> {
|
||||
// We reserve the high-order bit for the `RESIZE_IN_PROGRESS` flag, and the actual size
|
||||
// is a little larger than this because of the SharedStruct header. Make the upper limit
|
||||
// somewhat smaller than that, because with anything close to that, you'll run out of
|
||||
// memory anyway.
|
||||
assert!(max_size < 1 << 48, "max size {max_size} too large");
|
||||
|
||||
assert!(
|
||||
initial_size <= max_size,
|
||||
"initial size {initial_size} larger than max size {max_size}"
|
||||
);
|
||||
|
||||
// The actual initial / max size is the one given by the caller, plus the size of
|
||||
// 'SharedStruct'.
|
||||
let initial_size = HEADER_SIZE + initial_size;
|
||||
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
|
||||
|
||||
// Reserve address space for it with mmap
|
||||
//
|
||||
// TODO: Use MAP_HUGETLB if possible
|
||||
let start_ptr = unsafe {
|
||||
nix_mmap(
|
||||
None,
|
||||
max_size,
|
||||
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
|
||||
MapFlags::MAP_SHARED,
|
||||
&fd,
|
||||
0,
|
||||
)
|
||||
}
|
||||
.map_err(|e| Error::new("mmap failed", e))?;
|
||||
|
||||
// Reserve space for the initial size
|
||||
enlarge_file(fd.as_fd(), initial_size as u64)?;
|
||||
|
||||
// Initialize the header
|
||||
let shared: NonNull<SharedStruct> = start_ptr.cast();
|
||||
unsafe {
|
||||
shared.write(SharedStruct {
|
||||
max_size: max_size.into(),
|
||||
current_size: AtomicUsize::new(initial_size),
|
||||
});
|
||||
}
|
||||
|
||||
// The user data begins after the header
|
||||
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
|
||||
|
||||
Ok(Self {
|
||||
fd,
|
||||
max_size: max_size.into(),
|
||||
shared_ptr: shared,
|
||||
data_ptr,
|
||||
})
|
||||
}
|
||||
|
||||
// return reference to the header
|
||||
fn shared(&self) -> &SharedStruct {
|
||||
unsafe { self.shared_ptr.as_ref() }
|
||||
}
|
||||
|
||||
/// Resize the shared memory area. `new_size` must not be larger than the `max_size` specified
|
||||
/// when creating the area.
|
||||
///
|
||||
/// This may only be called from one process/thread concurrently. We detect that case
|
||||
/// and return an [`shmem::Error`](Error).
|
||||
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
|
||||
let new_size = new_size + HEADER_SIZE;
|
||||
let shared = self.shared();
|
||||
|
||||
assert!(
|
||||
new_size <= self.max_size,
|
||||
"new size ({new_size}) is greater than max size ({})",
|
||||
self.max_size
|
||||
);
|
||||
|
||||
assert_eq!(self.max_size, shared.max_size);
|
||||
|
||||
// Lock the area by setting the bit in `current_size`
|
||||
//
|
||||
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
|
||||
// and the `posix_fallocate`/`ftruncate` call is surely a synchronization point anyway. But
|
||||
// since this is not performance-critical, better safe than sorry.
|
||||
let mut old_size = shared.current_size.load(Ordering::Acquire);
|
||||
loop {
|
||||
if (old_size & RESIZE_IN_PROGRESS) != 0 {
|
||||
return Err(Error::new(
|
||||
"concurrent resize detected",
|
||||
Errno::UnknownErrno,
|
||||
));
|
||||
}
|
||||
match shared.current_size.compare_exchange(
|
||||
old_size,
|
||||
new_size,
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => break,
|
||||
Err(x) => old_size = x,
|
||||
}
|
||||
}
|
||||
|
||||
// Ok, we got the lock.
|
||||
//
|
||||
// NB: If anything goes wrong, we *must* clear the bit!
|
||||
let result = {
|
||||
use std::cmp::Ordering::{Equal, Greater, Less};
|
||||
match new_size.cmp(&old_size) {
|
||||
Less => nix_ftruncate(&self.fd, new_size as i64)
|
||||
.map_err(|e| Error::new("could not shrink shmem segment, ftruncate failed", e)),
|
||||
Equal => Ok(()),
|
||||
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
|
||||
}
|
||||
};
|
||||
|
||||
// Unlock
|
||||
shared.current_size.store(
|
||||
if result.is_ok() { new_size } else { old_size },
|
||||
Ordering::Release,
|
||||
);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Returns the current user-visible size of the shared memory segment.
|
||||
///
|
||||
/// NOTE: a concurrent [`ShmemHandle::set_size()`] call can change the size at any time.
|
||||
/// It is the caller's responsibility not to access the area beyond the current size.
|
||||
pub fn current_size(&self) -> usize {
|
||||
let total_current_size =
|
||||
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
|
||||
total_current_size - HEADER_SIZE
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ShmemHandle {
|
||||
fn drop(&mut self) {
|
||||
// SAFETY: The pointer was obtained from mmap() with the given size.
|
||||
// We unmap the entire region.
|
||||
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
|
||||
// The fd is dropped automatically by OwnedFd.
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a "backing file" for the shared memory area. On Linux, use `memfd_create()`, to create an
|
||||
/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
|
||||
/// development and testing, but in production we want the file to stay in memory.
|
||||
///
|
||||
/// Disable unused variables warnings because `name` is unused in the macos path.
|
||||
#[allow(unused_variables)]
|
||||
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
|
||||
.map_err(|e| Error::new("memfd_create failed", e))
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
let file = tempfile::tempfile().map_err(|e| {
|
||||
Error::new(
|
||||
"could not create temporary file to back shmem area",
|
||||
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
|
||||
)
|
||||
})?;
|
||||
Ok(OwnedFd::from(file))
|
||||
}
|
||||
}
|
||||
|
||||
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
|
||||
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
|
||||
// we don't get a segfault later when trying to actually use it.
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::fcntl::posix_fallocate(fd, 0, size as i64)
|
||||
.map_err(|e| Error::new("could not grow shmem segment, posix_fallocate failed", e))
|
||||
}
|
||||
// As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
nix::unistd::ftruncate(fd, size as i64)
|
||||
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    use nix::unistd::ForkResult;
    use std::ops::Range;

    /// check that all bytes in given range have the expected value.
    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
        for i in range {
            // SAFETY: the tests only pass ranges inside the area they have sized beforehand.
            let b = unsafe { *(ptr.add(i)) };
            assert_eq!(expected, b, "unexpected byte at offset {i}");
        }
    }

    /// Write 'b' to all bytes in the given range
    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
        // SAFETY: the tests only pass ranges inside the area they have sized beforehand.
        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
    }

    // simple single-process test of growing and shrinking
    #[test]
    fn test_shmem_resize() -> Result<(), Error> {
        let max_size = 1024 * 1024;
        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;

        assert_eq!(init_struct.current_size(), 0);

        // Initial grow
        let size1 = 10000;
        init_struct.set_size(size1).unwrap();
        assert_eq!(init_struct.current_size(), size1);

        // Write some data
        let data_ptr = init_struct.data_ptr.as_ptr();
        write_range(data_ptr, 0xAA, 0..size1);
        assert_range(data_ptr, 0xAA, 0..size1);

        // Shrink
        let size2 = 5000;
        init_struct.set_size(size2).unwrap();
        assert_eq!(init_struct.current_size(), size2);

        // Grow again
        let size3 = 20000;
        init_struct.set_size(size3).unwrap();
        assert_eq!(init_struct.current_size(), size3);

        // Try to read it. The area that was shrunk and grown again should read as all zeros now
        assert_range(data_ptr, 0xAA, 0..5000);
        assert_range(data_ptr, 0, 5000..size1);

        // Growing beyond max_size is not exercised here: set_size() panics in that case
        // instead of returning an error.

        // Dropping init_struct should unmap the memory
        drop(init_struct);

        Ok(())
    }

    /// This is used in tests to coordinate between test processes. It's like `std::sync::Barrier`,
    /// but is stored in the shared memory area and works across processes. It's implemented by
    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
    struct SimpleBarrier {
        num_procs: usize,
        count: AtomicUsize,
    }

    impl SimpleBarrier {
        /// Initializes a barrier for `num_procs` participants at `ptr`.
        ///
        /// # Safety
        ///
        /// `ptr` must be aligned for `SimpleBarrier` and valid for writes.
        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
            // `write` initializes the memory without treating its previous contents as a
            // live value.
            unsafe {
                ptr.write(SimpleBarrier {
                    num_procs,
                    count: AtomicUsize::new(0),
                });
            }
        }

        /// Blocks until `num_procs` participants have called `wait` for this generation.
        pub fn wait(&self) {
            // The barrier is used to order plain memory writes made by one process before
            // reads made by the other. Arriving with Release and observing the arrivals
            // with Acquire provides that ordering; Relaxed would not.
            let old = self.count.fetch_add(1, Ordering::AcqRel);

            let generation = old / self.num_procs;

            let mut current = old + 1;
            while current < (generation + 1) * self.num_procs {
                std::thread::sleep(std::time::Duration::from_millis(10));
                current = self.count.load(Ordering::Acquire);
            }
        }
    }

    #[test]
    fn test_multi_process() {
        // Initialize
        let max_size = 1_000_000_000_000;
        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
        let ptr = init_struct.data_ptr.as_ptr();

        // Store the SimpleBarrier in the first 1k of the area.
        init_struct.set_size(10000).unwrap();
        let barrier_ptr: *mut SimpleBarrier = unsafe {
            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
                .cast()
        };
        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };

        // Fork another test process. The code after this runs in both processes concurrently.
        let fork_result = unsafe { nix::unistd::fork().unwrap() };

        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
        if fork_result.is_parent() {
            write_range(ptr, 0xAA, 1000..2000);
        } else {
            write_range(ptr, 0xBB, 2000..3000);
        }
        barrier.wait();
        // Verify the contents. (in both processes)
        assert_range(ptr, 0xAA, 1000..2000);
        assert_range(ptr, 0xBB, 2000..3000);

        // Grow, from the child this time
        let size = 10_000_000;
        if !fork_result.is_parent() {
            init_struct.set_size(size).unwrap();
        }
        barrier.wait();

        // make some writes at the end
        if fork_result.is_parent() {
            write_range(ptr, 0xAA, (size - 10)..size);
        } else {
            write_range(ptr, 0xBB, (size - 20)..(size - 10));
        }
        barrier.wait();

        // Verify the contents. (This runs in both processes)
        assert_range(ptr, 0, (size - 1000)..(size - 20));
        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
        assert_range(ptr, 0xAA, (size - 10)..size);

        if let ForkResult::Parent { child } = fork_result {
            nix::sys::wait::waitpid(child, None).unwrap();
        }
    }
}
|
||||
111
libs/neon-shmem/src/sync.rs
Normal file
111
libs/neon-shmem/src/sync.rs
Normal file
@@ -0,0 +1,111 @@
|
||||
//! Simple utilities akin to what's in [`std::sync`] but designed to work with shared memory.
|
||||
|
||||
use std::mem::MaybeUninit;
|
||||
use std::ptr::NonNull;
|
||||
|
||||
use nix::errno::Errno;
|
||||
|
||||
pub type RwLock<T> = lock_api::RwLock<PthreadRwLock, T>;
|
||||
pub type RwLockReadGuard<'a, T> = lock_api::RwLockReadGuard<'a, PthreadRwLock, T>;
|
||||
pub type RwLockWriteGuard<'a, T> = lock_api::RwLockWriteGuard<'a, PthreadRwLock, T>;
|
||||
pub type ValueReadGuard<'a, T> = lock_api::MappedRwLockReadGuard<'a, PthreadRwLock, T>;
|
||||
pub type ValueWriteGuard<'a, T> = lock_api::MappedRwLockWriteGuard<'a, PthreadRwLock, T>;
|
||||
|
||||
/// Shared memory read-write lock.
|
||||
pub struct PthreadRwLock(Option<NonNull<libc::pthread_rwlock_t>>);
|
||||
|
||||
/// Calls the named function from the `libc` namespace with the given arguments and panics,
/// naming the function and the errno, unless it returns zero.
macro_rules! libc_checked {
    ($fn_name:ident ( $($arg:expr),* )) => {{
        match libc::$fn_name($($arg),*) {
            0 => {}
            res => panic!("{} failed with {}", stringify!($fn_name), Errno::from_raw(res)),
        }
    }};
}
|
||||
|
||||
impl PthreadRwLock {
|
||||
/// Creates a new `PthreadRwLock` on top of a pointer to a pthread rwlock.
|
||||
///
|
||||
/// # Safety
|
||||
/// `lock` must be non-null. Every unsafe operation will panic in the event of an error.
|
||||
pub unsafe fn new(lock: *mut libc::pthread_rwlock_t) -> Self {
|
||||
unsafe {
|
||||
let mut attrs = MaybeUninit::uninit();
|
||||
libc_checked!(pthread_rwlockattr_init(attrs.as_mut_ptr()));
|
||||
libc_checked!(pthread_rwlockattr_setpshared(
|
||||
attrs.as_mut_ptr(),
|
||||
libc::PTHREAD_PROCESS_SHARED
|
||||
));
|
||||
libc_checked!(pthread_rwlock_init(lock, attrs.as_mut_ptr()));
|
||||
// Safety: POSIX specifies that "any function affecting the attributes
|
||||
// object (including destruction) shall not affect any previously
|
||||
// initialized read-write locks".
|
||||
libc_checked!(pthread_rwlockattr_destroy(attrs.as_mut_ptr()));
|
||||
Self(Some(NonNull::new_unchecked(lock)))
|
||||
}
|
||||
}
|
||||
|
||||
fn inner(&self) -> NonNull<libc::pthread_rwlock_t> {
|
||||
match self.0 {
|
||||
None => {
|
||||
panic!("PthreadRwLock constructed badly - something likely used RawRwLock::INIT")
|
||||
}
|
||||
Some(x) => x,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl lock_api::RawRwLock for PthreadRwLock {
|
||||
type GuardMarker = lock_api::GuardSend;
|
||||
const INIT: Self = Self(None);
|
||||
|
||||
fn try_lock_shared(&self) -> bool {
|
||||
unsafe {
|
||||
let res = libc::pthread_rwlock_tryrdlock(self.inner().as_ptr());
|
||||
match res {
|
||||
0 => true,
|
||||
libc::EAGAIN => false,
|
||||
_ => panic!(
|
||||
"pthread_rwlock_tryrdlock failed with {}",
|
||||
Errno::from_raw(res)
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn try_lock_exclusive(&self) -> bool {
|
||||
unsafe {
|
||||
let res = libc::pthread_rwlock_trywrlock(self.inner().as_ptr());
|
||||
match res {
|
||||
0 => true,
|
||||
libc::EAGAIN => false,
|
||||
_ => panic!("try_wrlock failed with {}", Errno::from_raw(res)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn lock_shared(&self) {
|
||||
unsafe {
|
||||
libc_checked!(pthread_rwlock_rdlock(self.inner().as_ptr()));
|
||||
}
|
||||
}
|
||||
|
||||
fn lock_exclusive(&self) {
|
||||
unsafe {
|
||||
libc_checked!(pthread_rwlock_wrlock(self.inner().as_ptr()));
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn unlock_exclusive(&self) {
|
||||
unsafe {
|
||||
libc_checked!(pthread_rwlock_unlock(self.inner().as_ptr()));
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn unlock_shared(&self) {
|
||||
unsafe {
|
||||
libc_checked!(pthread_rwlock_unlock(self.inner().as_ptr()));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,27 +0,0 @@
|
||||
[package]
|
||||
name = "neon_failpoint"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
tokio = { workspace = true, features = ["time", "sync", "rt-multi-thread"] }
|
||||
tokio-util = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
serde = { workspace = true, features = ["derive"] }
|
||||
anyhow = { workspace = true }
|
||||
regex = { workspace = true }
|
||||
once_cell = { workspace = true }
|
||||
parking_lot = { workspace = true }
|
||||
rand = { workspace = true }
|
||||
either = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
tracing-subscriber = { workspace = true, features = ["fmt"] }
|
||||
|
||||
[features]
|
||||
default = []
|
||||
testing = []
|
||||
|
||||
[[example]]
|
||||
name = "context_demo"
|
||||
required-features = ["testing"]
|
||||
@@ -1,460 +0,0 @@
|
||||
# Neon Failpoint Library
|
||||
|
||||
A modern, async-first failpoint library for Neon, replacing the `fail` crate with enhanced functionality.
|
||||
|
||||
## Features
|
||||
|
||||
- **Async-first**: All failpoint operations are async and don't require `spawn_blocking`
|
||||
- **Context matching**: Failpoints can be configured to trigger only when specific context conditions are met
|
||||
- **Regex support**: Context values can be matched using regular expressions
|
||||
- **Cancellation support**: All operations support cancellation tokens
|
||||
- **Dynamic reconfiguration**: Paused and sleeping tasks automatically resume when failpoint configurations change
|
||||
- **Backward compatibility**: Drop-in replacement for existing `fail` crate usage
|
||||
|
||||
## Supported Actions
|
||||
|
||||
- `off` - Disable the failpoint
|
||||
- `pause` - Pause indefinitely until disabled, reconfigured, or cancelled
|
||||
- `sleep(N)` - Sleep for N milliseconds (can be interrupted by reconfiguration)
|
||||
- `return` - Return early (empty value)
|
||||
- `return(value)` - Return early with a specific value
|
||||
- `exit` - Exit the process immediately
|
||||
- `panic(message)` - Panic the process with a custom message
|
||||
- `N%return(value)` - Return with a specific value N% of the time (probability-based)
|
||||
- `N%M*return(value)` - Return with a specific value N% of the time, maximum M times
|
||||
- `N%action` - Execute any action N% of the time (probability-based)
|
||||
- `N%M*action` - Execute any action N% of the time, maximum M times
|
||||
|
||||
## Probability-Based Actions
|
||||
|
||||
The library supports probability-based failpoints that trigger only a percentage of the time:
|
||||
|
||||
```rust
|
||||
// 50% chance to return a value
|
||||
configure_failpoint("random_failure", "50%return(error)").unwrap();
|
||||
|
||||
// 10% chance to sleep, maximum 3 times
|
||||
configure_failpoint("occasional_delay", "10%3*sleep(1000)").unwrap();
|
||||
|
||||
// 25% chance to panic
|
||||
configure_failpoint("rare_panic", "25%panic(critical error)").unwrap();
|
||||
```
|
||||
|
||||
The probability system uses a counter to track how many times a probability-based action has been triggered, allowing for precise control over test scenarios.
|
||||
|
||||
## Dynamic Behavior
|
||||
|
||||
When a failpoint is reconfigured while tasks are waiting on it:
|
||||
|
||||
- **Paused tasks** will immediately resume and continue normal execution
|
||||
- **Sleeping tasks** will wake up early and continue normal execution
|
||||
- **Removed failpoints** will cause all waiting tasks to resume normally
|
||||
|
||||
The new configuration only applies to future hits of the failpoint, not to tasks that are already waiting. This allows for flexible testing scenarios where you can pause execution, inspect state, and then resume execution dynamically.
|
||||
|
||||
## Example: Dynamic Reconfiguration
|
||||
|
||||
```rust
|
||||
use neon_failpoint::{configure_failpoint, failpoint, FailpointResult};
|
||||
use tokio::time::Duration;
|
||||
|
||||
// Start a task that will hit a failpoint
|
||||
let task = tokio::spawn(async {
|
||||
println!("About to hit failpoint");
|
||||
match failpoint("test_pause", None).await {
|
||||
FailpointResult::Return(value) => println!("Returned: {}", value),
|
||||
FailpointResult::Continue => println!("Continued normally"),
|
||||
FailpointResult::Cancelled => println!("Cancelled"),
|
||||
}
|
||||
});
|
||||
|
||||
// Configure the failpoint to pause
|
||||
configure_failpoint("test_pause", "pause").unwrap();
|
||||
|
||||
// Let the task hit the failpoint and pause
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
|
||||
// Change the failpoint configuration - this will wake up the paused task
|
||||
// The task will resume and continue normally (not apply the new config)
|
||||
configure_failpoint("test_pause", "return(not_applied)").unwrap();
|
||||
|
||||
// The task will complete with Continue, not Return
|
||||
let result = task.await.unwrap();
|
||||
```
|
||||
|
||||
## Basic Usage
|
||||
|
||||
```rust
|
||||
use neon_failpoint::{configure_failpoint, failpoint, FailpointResult};
|
||||
|
||||
// Configure a failpoint
|
||||
configure_failpoint("my_failpoint", "return(42)").unwrap();
|
||||
|
||||
// Use the failpoint
|
||||
match failpoint("my_failpoint", None).await {
|
||||
FailpointResult::Return(value) => {
|
||||
println!("Failpoint returned: {}", value);
|
||||
return value.parse().unwrap_or_default();
|
||||
}
|
||||
FailpointResult::Continue => {
|
||||
// Continue normal execution
|
||||
}
|
||||
FailpointResult::Cancelled => {
|
||||
// Handle cancellation
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Context-Based Failpoint Configuration
|
||||
|
||||
Context allows you to create **conditional failpoints** that only trigger when specific runtime conditions are met. This is particularly useful for testing scenarios where you want to inject failures only for specific tenants, operations, or other contextual conditions.
|
||||
|
||||
### Configuring Context-Based Failpoints
|
||||
|
||||
Use `configure_failpoint_with_context()` to set up failpoints with context matching:
|
||||
|
||||
```rust
|
||||
use neon_failpoint::configure_failpoint_with_context;
|
||||
use std::collections::HashMap;
|
||||
|
||||
let mut context_matchers = HashMap::new();
|
||||
context_matchers.insert("tenant_id".to_string(), "test_.*".to_string());
|
||||
context_matchers.insert("operation".to_string(), "backup".to_string());
|
||||
|
||||
configure_failpoint_with_context(
|
||||
"backup_operation", // failpoint name
|
||||
"return(simulated_failure)", // action to take
|
||||
context_matchers // context matching rules
|
||||
).unwrap();
|
||||
```
|
||||
|
||||
### Context Matching Rules
|
||||
|
||||
The context matching system works as follows:
|
||||
|
||||
1. **Key-Value Matching**: Each entry in `context_matchers` specifies a key that must exist in the runtime context
|
||||
2. **Regex Support**: Values in `context_matchers` are treated as regular expressions first
|
||||
3. **Fallback to Exact Match**: If the regex compilation fails, it falls back to exact string matching
|
||||
4. **ALL Must Match**: All context matchers must match for the failpoint to trigger
|
||||
|
||||
### Runtime Context Usage
|
||||
|
||||
When code hits a failpoint, it provides context using a `HashMap<String, String>`:
|
||||
|
||||
```rust
|
||||
use neon_failpoint::{failpoint, FailpointResult};
|
||||
use std::collections::HashMap;
|
||||
|
||||
let mut context = HashMap::new();
|
||||
context.insert("tenant_id".to_string(), "test_123".to_string());
|
||||
context.insert("operation".to_string(), "backup".to_string());
|
||||
context.insert("user_id".to_string(), "user_456".to_string());
|
||||
|
||||
match failpoint("backup_operation", Some(&context)) {
|
||||
either::Either::Left(result) => {
|
||||
match result {
|
||||
FailpointResult::Return(value) => {
|
||||
// This will only trigger if ALL context matchers match
|
||||
println!("Backup failed: {}", value);
|
||||
}
|
||||
FailpointResult::Continue => {
|
||||
// Continue with normal backup operation
|
||||
}
|
||||
FailpointResult::Cancelled => {}
|
||||
}
|
||||
}
|
||||
either::Either::Right(future) => {
|
||||
match future.await {
|
||||
FailpointResult::Return(value) => {
|
||||
// This will only trigger if ALL context matchers match
|
||||
println!("Backup failed: {}", value);
|
||||
}
|
||||
FailpointResult::Continue => {
|
||||
// Continue with normal backup operation
|
||||
}
|
||||
FailpointResult::Cancelled => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Context Matching Examples
|
||||
|
||||
#### Regex Matching
|
||||
```rust
|
||||
// Configure to match test tenants only
|
||||
let mut matchers = HashMap::new();
|
||||
matchers.insert("tenant_id".to_string(), "test_.*".to_string());
|
||||
|
||||
configure_failpoint_with_context("test_failpoint", "pause", matchers).unwrap();
|
||||
|
||||
// This will match
|
||||
let mut context = HashMap::new();
|
||||
context.insert("tenant_id".to_string(), "test_123".to_string());
|
||||
// This will NOT match
|
||||
let mut context = HashMap::new();
|
||||
context.insert("tenant_id".to_string(), "prod_123".to_string());
|
||||
```
|
||||
|
||||
#### Multiple Conditions
|
||||
```rust
|
||||
// Must match BOTH tenant pattern AND operation
|
||||
let mut matchers = HashMap::new();
|
||||
matchers.insert("tenant_id".to_string(), "test_.*".to_string());
|
||||
matchers.insert("operation".to_string(), "backup".to_string());
|
||||
|
||||
configure_failpoint_with_context("backup_test", "return(failed)", matchers).unwrap();
|
||||
|
||||
// This will match (both conditions met)
|
||||
let mut context = HashMap::new();
|
||||
context.insert("tenant_id".to_string(), "test_123".to_string());
|
||||
context.insert("operation".to_string(), "backup".to_string());
|
||||
|
||||
// This will NOT match (operation is "restore", not "backup")
|
||||
let mut context = HashMap::new();
|
||||
context.insert("tenant_id".to_string(), "test_123".to_string());
|
||||
context.insert("operation".to_string(), "restore".to_string());
|
||||
```
|
||||
|
||||
#### Exact String Matching
|
||||
```rust
|
||||
// If regex compilation fails, falls back to exact match
|
||||
let mut matchers = HashMap::new();
|
||||
matchers.insert("env".to_string(), "staging".to_string());
|
||||
|
||||
configure_failpoint_with_context("env_specific", "sleep(1000)", matchers).unwrap();
|
||||
|
||||
// This will match
|
||||
let mut context = HashMap::new();
|
||||
context.insert("env".to_string(), "staging".to_string());
|
||||
// This will NOT match
|
||||
let mut context = HashMap::new();
|
||||
context.insert("env".to_string(), "production".to_string());
|
||||
```
|
||||
|
||||
### Benefits of Context-Based Failpoints
|
||||
|
||||
1. **Selective Testing**: Only inject failures for specific tenants, environments, or operations
|
||||
2. **Production Safety**: Avoid accidentally triggering failpoints in production by using context filters
|
||||
3. **Complex Scenarios**: Test interactions between different components with targeted failures
|
||||
4. **Debugging**: Isolate issues to specific contexts without affecting the entire system
|
||||
|
||||
### Context vs. Non-Context Failpoints
|
||||
|
||||
- **Without context**: `configure_failpoint("name", "action")` - triggers for ALL hits
|
||||
- **With context**: `configure_failpoint_with_context("name", "action", matchers)` - triggers only when context matches
|
||||
|
||||
## Context-Specific Failpoints
|
||||
|
||||
```rust
|
||||
use neon_failpoint::{configure_failpoint_with_context, failpoint};
|
||||
use std::collections::HashMap;
|
||||
|
||||
// Configure a failpoint that only triggers for specific tenants
|
||||
let mut context_matchers = HashMap::new();
|
||||
context_matchers.insert("tenant_id".to_string(), "test_.*".to_string());
|
||||
context_matchers.insert("operation".to_string(), "backup".to_string());
|
||||
|
||||
configure_failpoint_with_context(
|
||||
"backup_operation",
|
||||
"return(simulated_failure)",
|
||||
context_matchers
|
||||
).unwrap();
|
||||
|
||||
// Use with context
|
||||
let mut context = HashMap::new();
|
||||
context.insert("tenant_id".to_string(), "test_123".to_string());
|
||||
context.insert("operation".to_string(), "backup".to_string());
|
||||
|
||||
match failpoint("backup_operation", Some(&context)) {
|
||||
either::Either::Left(result) => {
|
||||
match result {
|
||||
FailpointResult::Return(value) => {
|
||||
// This will trigger for tenant_id matching "test_.*"
|
||||
println!("Backup failed: {}", value);
|
||||
}
|
||||
FailpointResult::Continue => {
|
||||
// Continue with backup
|
||||
}
|
||||
FailpointResult::Cancelled => {}
|
||||
}
|
||||
}
|
||||
either::Either::Right(future) => {
|
||||
match future.await {
|
||||
FailpointResult::Return(value) => {
|
||||
// This will trigger for tenant_id matching "test_.*"
|
||||
println!("Backup failed: {}", value);
|
||||
}
|
||||
FailpointResult::Continue => {
|
||||
// Continue with backup
|
||||
}
|
||||
FailpointResult::Cancelled => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Macros
|
||||
|
||||
The library provides convenient macros for common patterns:
|
||||
|
||||
### `fail_point!` - Basic Failpoint Macro
|
||||
|
||||
The `fail_point!` macro has three variants:
|
||||
|
||||
1. **Simple failpoint** - `fail_point!(name)`
|
||||
- Just checks the failpoint and then continues; it cannot return early because no closure is provided
|
||||
- Panics if the failpoint is configured with `return(value)` since no closure is provided
|
||||
|
||||
2. **Failpoint with return handler** - `fail_point!(name, closure)`
|
||||
- Provides a closure to handle return values from the failpoint
|
||||
- The closure receives `Option<String>` and should return the appropriate value
|
||||
|
||||
3. **Conditional failpoint** - `fail_point!(name, condition, closure)`
|
||||
- Only checks the failpoint if the condition is true
|
||||
- Provides a closure to handle return values (receives `Option<String>`, exactly as in variant 2)
|
||||
|
||||
```rust
|
||||
use neon_failpoint::fail_point;
|
||||
|
||||
// Simple failpoint - just continue or return early
|
||||
fail_point!("my_failpoint");
|
||||
|
||||
// Failpoint with return value handling
|
||||
fail_point!("my_failpoint", |value: Option<String>| {
|
||||
match value {
|
||||
Some(v) => {
|
||||
println!("Got value: {}", v);
|
||||
return Ok(v.parse().unwrap_or_default());
|
||||
}
|
||||
None => return Ok(42), // Default return value
|
||||
}
|
||||
});
|
||||
|
||||
// Conditional failpoint - only check if condition is met
|
||||
let should_fail = some_condition();
|
||||
fail_point!("conditional_failpoint", should_fail, |value: Option<String>| {
|
||||
println!("Conditional failpoint triggered with: {:?}", value);
|
||||
return Err(anyhow::anyhow!("Simulated failure"));
|
||||
});
|
||||
```
|
||||
|
||||
### `fail_point_with_context!` - Context-Aware Failpoint Macro
|
||||
|
||||
The `fail_point_with_context!` macro has three variants that mirror `fail_point!` but include context:
|
||||
|
||||
1. **Simple with context** - `fail_point_with_context!(name, context)`
|
||||
2. **With context and return handler** - `fail_point_with_context!(name, context, closure)`
|
||||
3. **Conditional with context** - `fail_point_with_context!(name, context, condition, closure)`
|
||||
|
||||
```rust
|
||||
use neon_failpoint::{fail_point_with_context};
|
||||
use std::collections::HashMap;
|
||||
|
||||
let mut context = HashMap::new();
|
||||
context.insert("tenant_id".to_string(), "test_123".to_string());
|
||||
context.insert("operation".to_string(), "backup".to_string());
|
||||
|
||||
// Simple context failpoint
|
||||
fail_point_with_context!("backup_failpoint", &context);
|
||||
|
||||
// Context failpoint with return handler
|
||||
fail_point_with_context!("backup_failpoint", &context, |value: Option<String>| {
|
||||
match value {
|
||||
Some(v) => return Err(anyhow::anyhow!("Backup failed: {}", v)),
|
||||
None => return Err(anyhow::anyhow!("Backup failed")),
|
||||
}
|
||||
});
|
||||
|
||||
// Conditional context failpoint
|
||||
let is_test_tenant = tenant_id.starts_with("test_");
|
||||
fail_point_with_context!("backup_failpoint", &context, is_test_tenant, |value: Option<String>| {
|
||||
// Only triggers for test tenants
|
||||
return Err(anyhow::anyhow!("Test tenant backup failure"));
|
||||
});
|
||||
```
|
||||
|
||||
### Other Utility Macros
|
||||
|
||||
```rust
|
||||
use neon_failpoint::{pausable_failpoint, sleep_millis_async};
|
||||
|
||||
// Pausable failpoint with cancellation
|
||||
let cancel_token = CancellationToken::new();
|
||||
if let Err(()) = pausable_failpoint!("pause_here", &cancel_token) {
|
||||
println!("Failpoint was cancelled");
|
||||
}
|
||||
|
||||
// Sleep failpoint
|
||||
sleep_millis_async!("sleep_here", &cancel_token);
|
||||
|
||||
// Context creation helper
|
||||
let mut context = HashMap::new();
|
||||
context.insert("key1".to_string(), "value1".to_string());
|
||||
context.insert("key2".to_string(), "value2".to_string());
|
||||
```
|
||||
|
||||
### Argument Reference
|
||||
|
||||
- **`name`**: String literal - the name of the failpoint
|
||||
- **`context`**: Expression that evaluates to `&HashMap<String, String>` - context for matching
|
||||
- **`condition`**: Boolean expression - only check failpoint if true
|
||||
- **`closure`**: Closure that handles return values:
|
||||
- For `fail_point!` with closure: receives `Option<String>`
|
||||
- For conditional variants: receives the same `Option<String>` as the corresponding non-conditional closure form
|
||||
- For `fail_point_with_context!` with closure: receives `Option<String>`
|
||||
- **`cancel`**: `&CancellationToken` - for cancellation support
|
||||
|
||||
## Migration from `fail` crate
|
||||
|
||||
The library provides a compatibility layer in `libs/utils/src/failpoint_support.rs`. Most existing code should work without changes, but you can migrate to the new async APIs for better performance:
|
||||
|
||||
### Before (with `fail` crate):
|
||||
```rust
|
||||
use utils::failpoint_support::pausable_failpoint;
|
||||
|
||||
// This used spawn_blocking internally
|
||||
pausable_failpoint!("my_failpoint", &cancel_token).await?;
|
||||
```
|
||||
|
||||
### After (with `neon_failpoint`):
|
||||
```rust
|
||||
use neon_failpoint::{failpoint_with_cancellation, FailpointResult};
|
||||
|
||||
// This is fully async
|
||||
match failpoint_with_cancellation("my_failpoint", None, &cancel_token).await {
|
||||
FailpointResult::Continue => {},
|
||||
FailpointResult::Cancelled => return Err(()),
|
||||
FailpointResult::Return(_) => {},
|
||||
}
|
||||
```
|
||||
|
||||
## Environment Variable Support
|
||||
|
||||
Failpoints can be configured via the `FAILPOINTS` environment variable:
|
||||
|
||||
```bash
|
||||
FAILPOINTS="failpoint1=return(42);failpoint2=sleep(1000);failpoint3=exit"
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
The library includes comprehensive tests and examples. Run them with:
|
||||
|
||||
```bash
|
||||
cargo test --features testing
|
||||
cargo run --example context_demo --features testing
|
||||
```
|
||||
|
||||
## HTTP Configuration
|
||||
|
||||
The library integrates with the existing HTTP failpoint configuration API. Send POST requests to `/v1/failpoints` with:
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"name": "my_failpoint",
|
||||
"actions": "return(42)"
|
||||
}
|
||||
]
|
||||
```
|
||||
@@ -1,82 +0,0 @@
|
||||
use neon_failpoint::{configure_failpoint_with_context, failpoint, FailpointResult};
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
// Initialize tracing for better output
|
||||
tracing_subscriber::fmt::init();
|
||||
|
||||
// Set up a context-specific failpoint
|
||||
let mut context_matchers = HashMap::new();
|
||||
context_matchers.insert("tenant_id".to_string(), "test_.*".to_string());
|
||||
context_matchers.insert("operation".to_string(), "backup".to_string());
|
||||
|
||||
configure_failpoint_with_context(
|
||||
"backup_operation",
|
||||
"return(simulated_failure)",
|
||||
context_matchers,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// Test with matching context
|
||||
let mut context = HashMap::new();
|
||||
context.insert("tenant_id".to_string(), "test_123".to_string());
|
||||
context.insert("operation".to_string(), "backup".to_string());
|
||||
|
||||
println!("Testing with matching context...");
|
||||
match failpoint("backup_operation", Some(&context)) {
|
||||
either::Either::Left(result) => match result {
|
||||
FailpointResult::Return(value) => {
|
||||
println!("Failpoint triggered with value: {value:?}");
|
||||
}
|
||||
FailpointResult::Continue => {
|
||||
println!("Failpoint not triggered");
|
||||
}
|
||||
FailpointResult::Cancelled => {
|
||||
println!("Failpoint cancelled");
|
||||
}
|
||||
},
|
||||
either::Either::Right(future) => match future.await {
|
||||
FailpointResult::Return(value) => {
|
||||
println!("Failpoint triggered with value: {value:?}");
|
||||
}
|
||||
FailpointResult::Continue => {
|
||||
println!("Failpoint not triggered");
|
||||
}
|
||||
FailpointResult::Cancelled => {
|
||||
println!("Failpoint cancelled");
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
// Test with non-matching context
|
||||
let mut context = HashMap::new();
|
||||
context.insert("tenant_id".to_string(), "prod_456".to_string());
|
||||
context.insert("operation".to_string(), "backup".to_string());
|
||||
|
||||
println!("Testing with non-matching context...");
|
||||
match failpoint("backup_operation", Some(&context)) {
|
||||
either::Either::Left(result) => match result {
|
||||
FailpointResult::Return(value) => {
|
||||
println!("Failpoint triggered with value: {value:?}");
|
||||
}
|
||||
FailpointResult::Continue => {
|
||||
println!("Failpoint not triggered (expected)");
|
||||
}
|
||||
FailpointResult::Cancelled => {
|
||||
println!("Failpoint cancelled");
|
||||
}
|
||||
},
|
||||
either::Either::Right(future) => match future.await {
|
||||
FailpointResult::Return(value) => {
|
||||
println!("Failpoint triggered with value: {value:?}");
|
||||
}
|
||||
FailpointResult::Continue => {
|
||||
println!("Failpoint not triggered (expected)");
|
||||
}
|
||||
FailpointResult::Cancelled => {
|
||||
println!("Failpoint cancelled");
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,356 +0,0 @@
|
||||
//! Macros for convenient failpoint usage
|
||||
|
||||
/// Async failpoint check: any deferred action is `.await`ed in place, so no
/// `spawn_blocking` is needed.
///
/// The whole expansion is guarded by `cfg!(feature = "testing")`.
///
/// Forms:
/// - `fail_point!(name)`: evaluates the failpoint; panics if it yields
///   `FailpointResult::Return`, because there is no closure to map the value.
/// - `fail_point!(name, closure)`: on `Return(value)` executes
///   `return closure(value)` in the calling function.
/// - `fail_point!(name, condition, closure)`: like the previous form, but the
///   failpoint is only evaluated when `condition` is true.
///
/// `Continue` and `Cancelled` outcomes fall through without any effect.
#[macro_export]
macro_rules! fail_point {
    ($name:literal) => {{
        if cfg!(feature = "testing") {
            // Collapse the immediate / deferred cases into one result.
            let outcome = match $crate::failpoint($name, None) {
                $crate::either::Either::Left(immediate) => immediate,
                $crate::either::Either::Right(pending) => pending.await,
            };
            match outcome {
                $crate::FailpointResult::Return(_) => {
                    panic!("failpoint was configured with return(X) but Rust code does not pass a closure to map X to a return value");
                }
                $crate::FailpointResult::Continue | $crate::FailpointResult::Cancelled => {}
            }
        }
    }};
    ($name:literal, $closure:expr) => {{
        if cfg!(feature = "testing") {
            let outcome = match $crate::failpoint($name, None) {
                $crate::either::Either::Left(immediate) => immediate,
                $crate::either::Either::Right(pending) => pending.await,
            };
            if let $crate::FailpointResult::Return(value) = outcome {
                let handler = $closure;
                return handler(value);
            }
        }
    }};
    ($name:literal, $condition:expr, $closure:expr) => {{
        // `&&` short-circuits, so the condition is only evaluated in testing builds.
        if cfg!(feature = "testing") && $condition {
            let outcome = match $crate::failpoint($name, None) {
                $crate::either::Either::Left(immediate) => immediate,
                $crate::either::Either::Right(pending) => pending.await,
            };
            if let $crate::FailpointResult::Return(value) = outcome {
                let handler = $closure;
                return handler(value);
            }
        }
    }};
}
|
||||
|
||||
/// Synchronous failpoint check for non-async call sites.
///
/// Accepts the same three forms as `fail_point!` (`name`, `name, closure`,
/// `name, condition, closure`) and is guarded by `cfg!(feature = "testing")`.
/// Because nothing can be awaited here, the expansion panics if the failpoint
/// yields a deferred (async) action instead of an immediate result.
///
/// - Without a closure, a `Return` outcome panics.
/// - With a closure, `Return(value)` executes `return closure(value)` in the
///   calling function.
/// - `Continue` and `Cancelled` fall through without any effect.
#[macro_export]
macro_rules! fail_point_sync {
    ($name:literal) => {{
        if cfg!(feature = "testing") {
            let outcome = match $crate::failpoint($name, None) {
                $crate::either::Either::Left(immediate) => immediate,
                $crate::either::Either::Right(_) => {
                    panic!("failpoint '{}' triggered an async action (sleep/pause) but fail_point_sync! was used. Use fail_point! instead.", $name);
                }
            };
            match outcome {
                $crate::FailpointResult::Return(_) => {
                    panic!("failpoint was configured with return(X) but Rust code does not pass a closure to map X to a return value");
                }
                $crate::FailpointResult::Continue | $crate::FailpointResult::Cancelled => {}
            }
        }
    }};
    ($name:literal, $closure:expr) => {{
        if cfg!(feature = "testing") {
            let outcome = match $crate::failpoint($name, None) {
                $crate::either::Either::Left(immediate) => immediate,
                $crate::either::Either::Right(_) => {
                    panic!("failpoint '{}' triggered an async action (sleep/pause) but fail_point_sync! was used. Use fail_point! instead.", $name);
                }
            };
            if let $crate::FailpointResult::Return(value) = outcome {
                let handler = $closure;
                return handler(value);
            }
        }
    }};
    ($name:literal, $condition:expr, $closure:expr) => {{
        // `&&` short-circuits, so the condition is only evaluated in testing builds.
        if cfg!(feature = "testing") && $condition {
            let outcome = match $crate::failpoint($name, None) {
                $crate::either::Either::Left(immediate) => immediate,
                $crate::either::Either::Right(_) => {
                    panic!("failpoint '{}' triggered an async action (sleep/pause) but fail_point_sync! was used. Use fail_point! instead.", $name);
                }
            };
            if let $crate::FailpointResult::Return(value) = outcome {
                let handler = $closure;
                return handler(value);
            }
        }
    }};
}
|
||||
|
||||
/// Async failpoint check that passes a context to the failpoint.
///
/// Mirrors `fail_point!`, except that `context` is forwarded as
/// `Some(context)` to `failpoint`. Guarded by `cfg!(feature = "testing")`.
///
/// Forms:
/// - `(name, context)`: panics on a `Return` outcome (no closure to map it).
/// - `(name, context, closure)`: on `Return(value)` executes
///   `return closure(value)` in the calling function.
/// - `(name, context, condition, closure)`: as above, but the failpoint is
///   only evaluated when `condition` is true.
///
/// `Continue` and `Cancelled` outcomes fall through without any effect.
#[macro_export]
macro_rules! fail_point_with_context {
    ($name:literal, $context:expr) => {{
        if cfg!(feature = "testing") {
            // Collapse the immediate / deferred cases into one result.
            let outcome = match $crate::failpoint($name, Some($context)) {
                $crate::either::Either::Left(immediate) => immediate,
                $crate::either::Either::Right(pending) => pending.await,
            };
            match outcome {
                $crate::FailpointResult::Return(_) => {
                    panic!("failpoint was configured with return(X) but Rust code does not pass a closure to map X to a return value");
                }
                $crate::FailpointResult::Continue | $crate::FailpointResult::Cancelled => {}
            }
        }
    }};
    ($name:literal, $context:expr, $closure:expr) => {{
        if cfg!(feature = "testing") {
            let outcome = match $crate::failpoint($name, Some($context)) {
                $crate::either::Either::Left(immediate) => immediate,
                $crate::either::Either::Right(pending) => pending.await,
            };
            if let $crate::FailpointResult::Return(value) = outcome {
                let handler = $closure;
                return handler(value);
            }
        }
    }};
    ($name:literal, $context:expr, $condition:expr, $closure:expr) => {{
        // `&&` short-circuits, so the condition is only evaluated in testing builds.
        if cfg!(feature = "testing") && $condition {
            let outcome = match $crate::failpoint($name, Some($context)) {
                $crate::either::Either::Left(immediate) => immediate,
                $crate::either::Either::Right(pending) => pending.await,
            };
            if let $crate::FailpointResult::Return(value) = outcome {
                let handler = $closure;
                return handler(value);
            }
        }
    }};
}
|
||||
|
||||
/// Synchronous, context-aware failpoint check for non-async call sites.
///
/// Accepts the same forms as `fail_point_with_context!` and forwards
/// `context` as `Some(context)` to `failpoint`. Guarded by
/// `cfg!(feature = "testing")`. Because nothing can be awaited here, the
/// expansion panics if the failpoint yields a deferred (async) action.
///
/// - Without a closure, a `Return` outcome panics.
/// - With a closure, `Return(value)` executes `return closure(value)` in the
///   calling function.
/// - `Continue` and `Cancelled` fall through without any effect.
#[macro_export]
macro_rules! fail_point_with_context_sync {
    ($name:literal, $context:expr) => {{
        if cfg!(feature = "testing") {
            let outcome = match $crate::failpoint($name, Some($context)) {
                $crate::either::Either::Left(immediate) => immediate,
                $crate::either::Either::Right(_) => {
                    panic!("failpoint '{}' triggered an async action (sleep/pause) but fail_point_with_context_sync! was used. Use fail_point_with_context! instead.", $name);
                }
            };
            match outcome {
                $crate::FailpointResult::Return(_) => {
                    panic!("failpoint was configured with return(X) but Rust code does not pass a closure to map X to a return value");
                }
                $crate::FailpointResult::Continue | $crate::FailpointResult::Cancelled => {}
            }
        }
    }};
    ($name:literal, $context:expr, $closure:expr) => {{
        if cfg!(feature = "testing") {
            let outcome = match $crate::failpoint($name, Some($context)) {
                $crate::either::Either::Left(immediate) => immediate,
                $crate::either::Either::Right(_) => {
                    panic!("failpoint '{}' triggered an async action (sleep/pause) but fail_point_with_context_sync! was used. Use fail_point_with_context! instead.", $name);
                }
            };
            if let $crate::FailpointResult::Return(value) = outcome {
                let handler = $closure;
                return handler(value);
            }
        }
    }};
    ($name:literal, $context:expr, $condition:expr, $closure:expr) => {{
        // `&&` short-circuits, so the condition is only evaluated in testing builds.
        if cfg!(feature = "testing") && $condition {
            let outcome = match $crate::failpoint($name, Some($context)) {
                $crate::either::Either::Left(immediate) => immediate,
                $crate::either::Either::Right(_) => {
                    panic!("failpoint '{}' triggered an async action (sleep/pause) but fail_point_with_context_sync! was used. Use fail_point_with_context! instead.", $name);
                }
            };
            if let $crate::FailpointResult::Return(value) = outcome {
                let handler = $closure;
                return handler(value);
            }
        }
    }};
}
|
||||
|
||||
/// Pausable failpoint, the counterpart of the old `pausable_failpoint`.
///
/// Forms:
/// - `pausable_failpoint!(name)`: creates a fresh `CancellationToken`,
///   delegates to the two-argument form and discards its result.
/// - `pausable_failpoint!(name, cancel)`: logs `at failpoint <name>`,
///   evaluates the failpoint with cancellation support (awaiting any deferred
///   action) and evaluates to `Err(())` if the outcome is `Cancelled`,
///   otherwise `Ok(())`.
///
/// When `cfg!(feature = "testing")` is false, the one-argument form does
/// nothing and the two-argument form evaluates to `Ok(())`.
#[macro_export]
macro_rules! pausable_failpoint {
    ($name:literal) => {{
        if cfg!(feature = "testing") {
            let never_cancelled = ::tokio_util::sync::CancellationToken::new();
            let _ = $crate::pausable_failpoint!($name, &never_cancelled);
        }
    }};
    ($name:literal, $cancel:expr) => {{
        if cfg!(feature = "testing") {
            ::tracing::info!("at failpoint {}", $name); // tests rely on this
            let outcome = match $crate::failpoint_with_cancellation($name, None, $cancel) {
                $crate::either::Either::Left(immediate) => immediate,
                $crate::either::Either::Right(pending) => pending.await,
            };
            match outcome {
                $crate::FailpointResult::Cancelled => Err(()),
                $crate::FailpointResult::Continue | $crate::FailpointResult::Return(_) => Ok(()),
            }
        } else {
            Ok(())
        }
    }};
}
|
||||
|
||||
/// Sleep failpoint macro - for async sleep operations.
///
/// Evaluates the failpoint `$name`. If the failpoint hands back a future
/// (`Either::Right`), that future is awaited and its output discarded; an
/// immediately available result (`Either::Left`) is ignored. The expansion always
/// evaluates to `()`.
///
/// * `sleep_millis_async!(name)` goes through `failpoint`.
/// * `sleep_millis_async!(name, cancel)` goes through `failpoint_with_cancellation`,
///   passing `cancel` along.
///
/// Without the `testing` feature the failpoint is never evaluated. The expansion
/// contains an `.await`, so it must be used inside an async context.
#[macro_export]
macro_rules! sleep_millis_async {
    ($name:literal) => {{
        if cfg!(feature = "testing") {
            // Only the asynchronous case needs any work; a ready result is dropped.
            if let $crate::either::Either::Right(pending) = $crate::failpoint($name, None) {
                pending.await;
            }
        }
    }};
    ($name:literal, $cancel:expr) => {{
        if cfg!(feature = "testing") {
            if let $crate::either::Either::Right(pending) =
                $crate::failpoint_with_cancellation($name, None, $cancel)
            {
                pending.await;
            }
        }
    }};
}
|
||||
|
||||
// Re-export for convenience
|
||||
pub use fail_point;
|
||||
pub use fail_point_sync;
|
||||
pub use fail_point_with_context;
|
||||
pub use fail_point_with_context_sync;
|
||||
pub use pausable_failpoint;
|
||||
pub use sleep_millis_async;
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fmt::Display;
|
||||
use std::net::IpAddr;
|
||||
use std::str::FromStr;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
@@ -10,7 +11,7 @@ use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
|
||||
use crate::models::{PageserverUtilization, ShardParameters, TenantConfig, TimelineInfo};
|
||||
use crate::shard::{ShardStripeSize, TenantShardId};
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -60,6 +61,11 @@ pub struct NodeRegisterRequest {
|
||||
pub listen_https_port: Option<u16>,
|
||||
|
||||
pub availability_zone_id: AvailabilityZone,
|
||||
|
||||
// Reachable IP address of the PS/SK registering, if known.
|
||||
// Hadron Cluster Coordinator will update the DNS record of the registering node
|
||||
// with this IP address.
|
||||
pub node_ip_addr: Option<IpAddr>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -126,6 +132,13 @@ pub struct TenantDescribeResponse {
|
||||
pub config: TenantConfig,
|
||||
}
|
||||
|
||||
/// Serializable response describing one timeline of a tenant: a list of
/// `TimelineInfo` entries plus an optional image-consistent LSN.
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantTimelineDescribeResponse {
|
||||
/// `TimelineInfo` entries; presumably one per tenant shard, going by the
/// field name — confirm against the code that builds this response.
pub shards: Vec<TimelineInfo>,
|
||||
/// Omitted from the serialized output when `None`.
/// NOTE(review): presumably derived from the per-shard
/// `TimelineInfo::image_consistent_lsn` values — confirm how it is aggregated.
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_consistent_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct NodeShardResponse {
|
||||
pub node_id: NodeId,
|
||||
@@ -538,6 +551,39 @@ pub struct SafekeeperDescribeResponse {
|
||||
pub scheduling_policy: SkSchedulingPolicy,
|
||||
}
|
||||
|
||||
/// Serializable description of a safekeeper peer: its node id together with an
/// HTTP address and port.
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct TimelineSafekeeperPeer {
|
||||
/// Id of the node this entry describes.
pub node_id: NodeId,
|
||||
/// Presumably the host part of the peer's HTTP endpoint — confirm the expected format.
pub listen_http_addr: String,
|
||||
/// NOTE(review): stored as a signed `i32` rather than `u16`, so values outside
/// 0..=65535 are representable — confirm that producers validate the range.
pub http_port: i32,
|
||||
}
|
||||
|
||||
/// A timeline id together with the node ids of its peers, in the form used by
/// the SC. Unlike `SafekeeperTimeline`, it carries no `tenant_id` field.
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SCSafekeeperTimeline {
|
||||
// SC does not know the tenant id.
|
||||
pub timeline_id: TimelineId,
|
||||
/// Node ids of the peers for this timeline; presumably the safekeepers
/// hosting it — confirm.
pub peers: Vec<NodeId>,
|
||||
}
|
||||
|
||||
/// Serializable response carrying a list of `SCSafekeeperTimeline` entries and
/// a list of `TimelineSafekeeperPeer` descriptions.
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SCSafekeeperTimelinesResponse {
|
||||
/// Timelines, each listing its peers by node id.
pub timelines: Vec<SCSafekeeperTimeline>,
|
||||
/// Peer descriptions; presumably the lookup table for the node ids referenced
/// in `timelines[*].peers` — confirm.
pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
|
||||
}
|
||||
|
||||
/// A timeline, identified by tenant id and timeline id, together with the node
/// ids of its peers.
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SafekeeperTimeline {
|
||||
/// Tenant the timeline belongs to.
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
/// Node ids of the peers for this timeline; presumably the safekeepers
/// hosting it — confirm.
pub peers: Vec<NodeId>,
|
||||
}
|
||||
|
||||
/// Serializable response carrying a list of `SafekeeperTimeline` entries and a
/// list of `TimelineSafekeeperPeer` descriptions.
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SafekeeperTimelinesResponse {
|
||||
/// Timelines, each listing its peers by node id.
pub timelines: Vec<SafekeeperTimeline>,
|
||||
/// Peer descriptions; presumably the lookup table for the node ids referenced
/// in `timelines[*].peers` — confirm.
pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct SafekeeperSchedulingPolicyRequest {
|
||||
pub scheduling_policy: SkSchedulingPolicy,
|
||||
|
||||
@@ -1622,6 +1622,9 @@ pub struct TimelineInfo {
|
||||
|
||||
/// Whether the timeline is invisible in synthetic size calculations.
|
||||
pub is_invisible: Option<bool>,
|
||||
// HADRON: the largest LSN below which all page updates have been included in the image layers.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_consistent_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
|
||||
@@ -749,7 +749,18 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
trace!("got query {query_string:?}");
|
||||
if let Err(e) = handler.process_query(self, query_string).await {
|
||||
match e {
|
||||
QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
|
||||
err @ QueryError::Shutdown => {
|
||||
// Notify postgres of the connection shutdown at the libpq
|
||||
// protocol level. This avoids postgres having to tell an idle
|
||||
// connection apart from a stale one, which is bug-prone.
|
||||
let shutdown_error = short_error(&err);
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&shutdown_error,
|
||||
Some(err.pg_error_code()),
|
||||
))?;
|
||||
|
||||
return Ok(ProcessMsgResult::Break);
|
||||
}
|
||||
QueryError::SimulatedConnectionError => {
|
||||
return Err(QueryError::SimulatedConnectionError);
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user