mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-31 12:00:42 +00:00
Compare commits
37 Commits
iddm/commu
...
cloneable/
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef737e7d7c | ||
|
|
5c934efb29 | ||
|
|
5c9c3b3317 | ||
|
|
921a4f2009 | ||
|
|
eb93c3e3c6 | ||
|
|
7a7ab2a1d1 | ||
|
|
ff526a1051 | ||
|
|
9a2456bea5 | ||
|
|
a456e818af | ||
|
|
3e6fdb0aa6 | ||
|
|
f8d3f86f58 | ||
|
|
f67a8a173e | ||
|
|
2288efae66 | ||
|
|
4fedcbc0ac | ||
|
|
eb830fa547 | ||
|
|
a203f9829a | ||
|
|
42ab34dc36 | ||
|
|
30b877074c | ||
|
|
f18cc808f0 | ||
|
|
d14d8271b8 | ||
|
|
fecb707b19 | ||
|
|
296c9190b2 | ||
|
|
a5fe67f361 | ||
|
|
ee7bb1a667 | ||
|
|
9bba31bf68 | ||
|
|
380d167b7c | ||
|
|
cb991fba42 | ||
|
|
4566b12a22 | ||
|
|
63ca084696 | ||
|
|
379259bdd7 | ||
|
|
3300207523 | ||
|
|
a0a7733b5a | ||
|
|
f4245403b3 | ||
|
|
a8db7ebffb | ||
|
|
154f6dc59c | ||
|
|
15f633922a | ||
|
|
c34d36d8a2 |
@@ -27,4 +27,4 @@
|
||||
!storage_controller/
|
||||
!vendor/postgres-*/
|
||||
!workspace_hack/
|
||||
!build_tools/patches
|
||||
!build-tools/patches
|
||||
|
||||
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -31,6 +31,7 @@ config-variables:
|
||||
- NEON_PROD_AWS_ACCOUNT_ID
|
||||
- PGREGRESS_PG16_PROJECT_ID
|
||||
- PGREGRESS_PG17_PROJECT_ID
|
||||
- PREWARM_PGBENCH_SIZE
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||
- REMOTE_STORAGE_AZURE_REGION
|
||||
- SLACK_CICD_CHANNEL_ID
|
||||
|
||||
@@ -176,7 +176,11 @@ runs:
|
||||
fi
|
||||
|
||||
if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
# We don't use code coverage for regression tests (the step is disabled),
|
||||
# so there's no need to collect it.
|
||||
# Ref https://github.com/neondatabase/neon/issues/4540
|
||||
# cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
cov_prefix=()
|
||||
else
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
@@ -150,7 +150,7 @@ jobs:
|
||||
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
|
||||
use-fallback: false
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
@@ -162,7 +162,7 @@ jobs:
|
||||
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
|
||||
use-fallback: false
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
@@ -174,7 +174,7 @@ jobs:
|
||||
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
|
||||
use-fallback: false
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
|
||||
|
||||
- name: Cache postgres v17 build
|
||||
id: cache_pg_17
|
||||
@@ -186,7 +186,7 @@ jobs:
|
||||
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
|
||||
use-fallback: false
|
||||
path: pg_install/v17
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
|
||||
|
||||
- name: Build all
|
||||
# Note: the Makefile picks up BUILD_TYPE and CARGO_PROFILE from the env variables
|
||||
|
||||
72
.github/workflows/benchmarking.yml
vendored
72
.github/workflows/benchmarking.yml
vendored
@@ -219,6 +219,7 @@ jobs:
|
||||
--ignore test_runner/performance/test_cumulative_statistics_persistence.py
|
||||
--ignore test_runner/performance/test_perf_many_relations.py
|
||||
--ignore test_runner/performance/test_perf_oltp_large_tenant.py
|
||||
--ignore test_runner/performance/test_lfc_prewarm.py
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -410,6 +411,77 @@ jobs:
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
prewarm-test:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
env:
|
||||
PGBENCH_SIZE: ${{ vars.PREWARM_PGBENCH_SIZE }}
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 17
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: "neon-staging"
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
|
||||
credentials:
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Run prewarm benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance/test_lfc_prewarm.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 5400
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
generate-matrices:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
|
||||
|
||||
@@ -72,7 +72,7 @@ jobs:
|
||||
ARCHS: ${{ inputs.archs || '["x64","arm64"]' }}
|
||||
DEBIANS: ${{ inputs.debians || '["bullseye","bookworm"]' }}
|
||||
IMAGE_TAG: |
|
||||
${{ hashFiles('build-tools.Dockerfile',
|
||||
${{ hashFiles('build-tools/Dockerfile',
|
||||
'.github/workflows/build-build-tools-image.yml') }}
|
||||
run: |
|
||||
echo "archs=${ARCHS}" | tee -a ${GITHUB_OUTPUT}
|
||||
@@ -144,7 +144,7 @@ jobs:
|
||||
|
||||
- uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0
|
||||
with:
|
||||
file: build-tools.Dockerfile
|
||||
file: build-tools/Dockerfile
|
||||
context: .
|
||||
provenance: false
|
||||
push: true
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -15,7 +15,6 @@ neon.iml
|
||||
/.neon
|
||||
/integration_tests/.neon
|
||||
compaction-suite-results.*
|
||||
pgxn/neon/communicator/communicator_bindings.h
|
||||
docker-compose/docker-compose-parallel.yml
|
||||
|
||||
# Coverage
|
||||
|
||||
8
.gitmodules
vendored
8
.gitmodules
vendored
@@ -1,16 +1,16 @@
|
||||
[submodule "vendor/postgres-v14"]
|
||||
path = vendor/postgres-v14
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
url = ../postgres.git
|
||||
branch = REL_14_STABLE_neon
|
||||
[submodule "vendor/postgres-v15"]
|
||||
path = vendor/postgres-v15
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
url = ../postgres.git
|
||||
branch = REL_15_STABLE_neon
|
||||
[submodule "vendor/postgres-v16"]
|
||||
path = vendor/postgres-v16
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
url = ../postgres.git
|
||||
branch = REL_16_STABLE_neon
|
||||
[submodule "vendor/postgres-v17"]
|
||||
path = vendor/postgres-v17
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
url = ../postgres.git
|
||||
branch = REL_17_STABLE_neon
|
||||
|
||||
357
Cargo.lock
generated
357
Cargo.lock
generated
@@ -253,17 +253,6 @@ version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
|
||||
|
||||
[[package]]
|
||||
name = "atomic_enum"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.1.0"
|
||||
@@ -698,40 +687,13 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum"
|
||||
version = "0.7.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"axum-core 0.4.5",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"itoa",
|
||||
"matchit 0.7.3",
|
||||
"memchr",
|
||||
"mime",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustversion",
|
||||
"serde",
|
||||
"sync_wrapper 1.0.1",
|
||||
"tower 0.5.2",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8"
|
||||
dependencies = [
|
||||
"axum-core 0.5.0",
|
||||
"axum-core",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"form_urlencoded",
|
||||
@@ -739,10 +701,10 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"hyper 1.6.0",
|
||||
"hyper 1.4.1",
|
||||
"hyper-util",
|
||||
"itoa",
|
||||
"matchit 0.8.4",
|
||||
"matchit",
|
||||
"memchr",
|
||||
"mime",
|
||||
"percent-encoding",
|
||||
@@ -762,26 +724,6 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum-core"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"mime",
|
||||
"pin-project-lite",
|
||||
"rustversion",
|
||||
"sync_wrapper 1.0.1",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum-core"
|
||||
version = "0.5.0"
|
||||
@@ -808,8 +750,8 @@ version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
|
||||
dependencies = [
|
||||
"axum 0.8.1",
|
||||
"axum-core 0.5.0",
|
||||
"axum",
|
||||
"axum-core",
|
||||
"bytes",
|
||||
"form_urlencoded",
|
||||
"futures-util",
|
||||
@@ -1346,31 +1288,10 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "communicator"
|
||||
version = "0.0.0"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"atomic_enum",
|
||||
"axum 0.8.1",
|
||||
"bytes",
|
||||
"cbindgen",
|
||||
"clashmap",
|
||||
"http 1.1.0",
|
||||
"libc",
|
||||
"metrics",
|
||||
"neon-shmem",
|
||||
"nix 0.30.1",
|
||||
"pageserver_api",
|
||||
"pageserver_client_grpc",
|
||||
"pageserver_page_api",
|
||||
"prometheus",
|
||||
"prost 0.13.5",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-pipe",
|
||||
"tonic 0.12.3",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"uring-common",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
@@ -1400,7 +1321,7 @@ dependencies = [
|
||||
"aws-sdk-kms",
|
||||
"aws-sdk-s3",
|
||||
"aws-smithy-types",
|
||||
"axum 0.8.1",
|
||||
"axum",
|
||||
"axum-extra",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
@@ -1705,9 +1626,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.21"
|
||||
version = "0.8.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
||||
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
|
||||
|
||||
[[package]]
|
||||
name = "crossterm"
|
||||
@@ -2161,7 +2082,7 @@ name = "endpoint_storage"
|
||||
version = "0.0.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum 0.8.1",
|
||||
"axum",
|
||||
"axum-extra",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
@@ -2422,12 +2343,6 @@ version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||
|
||||
[[package]]
|
||||
name = "foldhash"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
|
||||
|
||||
[[package]]
|
||||
name = "form_urlencoded"
|
||||
version = "1.2.1"
|
||||
@@ -2448,7 +2363,7 @@ dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"http-body-util",
|
||||
"hyper 1.6.0",
|
||||
"hyper 1.4.1",
|
||||
"hyper-util",
|
||||
"pin-project",
|
||||
"rand 0.8.5",
|
||||
@@ -2618,18 +2533,6 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"r-efi",
|
||||
"wasi 0.14.2+wasi-0.2.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gettid"
|
||||
version = "0.1.3"
|
||||
@@ -2795,16 +2698,6 @@ version = "0.15.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.15.4"
|
||||
source = "git+https://github.com/quantumish/hashbrown.git?rev=6610e6d#6610e6d2b1f288ef7b0709a3efefbc846395dc5e"
|
||||
dependencies = [
|
||||
"allocator-api2",
|
||||
"equivalent",
|
||||
"foldhash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashlink"
|
||||
version = "0.9.1"
|
||||
@@ -3029,9 +2922,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "httparse"
|
||||
version = "1.10.1"
|
||||
version = "1.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
|
||||
checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904"
|
||||
|
||||
[[package]]
|
||||
name = "httpdate"
|
||||
@@ -3081,9 +2974,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hyper"
|
||||
version = "1.6.0"
|
||||
version = "1.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80"
|
||||
checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-channel",
|
||||
@@ -3123,7 +3016,7 @@ checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"hyper 1.6.0",
|
||||
"hyper 1.4.1",
|
||||
"hyper-util",
|
||||
"rustls 0.22.4",
|
||||
"rustls-pki-types",
|
||||
@@ -3138,7 +3031,7 @@ version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3203a961e5c83b6f5498933e78b6b263e208c197b63e9c6c53cc82ffd3f63793"
|
||||
dependencies = [
|
||||
"hyper 1.6.0",
|
||||
"hyper 1.4.1",
|
||||
"hyper-util",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
@@ -3147,21 +3040,20 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hyper-util"
|
||||
version = "0.1.14"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc2fdfdbff08affe55bb779f33b053aa1fe5dd5b54c257343c17edfa55711bdb"
|
||||
checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"hyper 1.6.0",
|
||||
"libc",
|
||||
"hyper 1.4.1",
|
||||
"pin-project-lite",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tower 0.4.13",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
@@ -3714,9 +3606,9 @@ checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.13"
|
||||
version = "0.4.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765"
|
||||
checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"scopeguard",
|
||||
@@ -3759,12 +3651,6 @@ dependencies = [
|
||||
"regex-automata 0.1.10",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchit"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
|
||||
|
||||
[[package]]
|
||||
name = "matchit"
|
||||
version = "0.8.4"
|
||||
@@ -3872,8 +3758,8 @@ dependencies = [
|
||||
"procfs",
|
||||
"prometheus",
|
||||
"rand 0.8.5",
|
||||
"rand_distr 0.4.3",
|
||||
"twox-hash 1.6.3",
|
||||
"rand_distr",
|
||||
"twox-hash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3960,33 +3846,10 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
|
||||
name = "neon-shmem"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"criterion",
|
||||
"foldhash",
|
||||
"hashbrown 0.15.4",
|
||||
"libc",
|
||||
"lock_api",
|
||||
"nix 0.30.1",
|
||||
"rand 0.9.1",
|
||||
"rand_distr 0.5.1",
|
||||
"rustc-hash 2.1.1",
|
||||
"seahash",
|
||||
"tempfile",
|
||||
"thiserror 1.0.69",
|
||||
"twox-hash 2.1.1",
|
||||
"workspace_hack",
|
||||
"xxhash-rust",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "neonart"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
"rand 0.9.1",
|
||||
"rand_distr 0.5.1",
|
||||
"spin",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4422,20 +4285,18 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"axum 0.8.1",
|
||||
"bytes",
|
||||
"camino",
|
||||
"clap",
|
||||
"futures",
|
||||
"hdrhistogram",
|
||||
"http 1.1.0",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"metrics",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"pageserver_client_grpc",
|
||||
"pageserver_page_api",
|
||||
"pprof",
|
||||
"rand 0.8.5",
|
||||
"reqwest",
|
||||
"serde",
|
||||
@@ -4520,7 +4381,6 @@ dependencies = [
|
||||
"pageserver_client",
|
||||
"pageserver_compaction",
|
||||
"pageserver_page_api",
|
||||
"peekable",
|
||||
"pem",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
@@ -4534,7 +4394,6 @@ dependencies = [
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
"procfs",
|
||||
"prost 0.13.5",
|
||||
"rand 0.8.5",
|
||||
"range-set-blaze",
|
||||
"regex",
|
||||
@@ -4571,7 +4430,7 @@ dependencies = [
|
||||
"tower 0.5.2",
|
||||
"tracing",
|
||||
"tracing-utils",
|
||||
"twox-hash 1.6.3",
|
||||
"twox-hash",
|
||||
"url",
|
||||
"utils",
|
||||
"uuid",
|
||||
@@ -4783,7 +4642,7 @@ dependencies = [
|
||||
"paste",
|
||||
"seq-macro",
|
||||
"thrift",
|
||||
"twox-hash 1.6.3",
|
||||
"twox-hash",
|
||||
"zstd",
|
||||
"zstd-sys",
|
||||
]
|
||||
@@ -4829,15 +4688,6 @@ dependencies = [
|
||||
"sha2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "peekable"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
|
||||
dependencies = [
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pem"
|
||||
version = "3.0.3"
|
||||
@@ -5440,6 +5290,7 @@ dependencies = [
|
||||
"async-trait",
|
||||
"atomic-take",
|
||||
"aws-config",
|
||||
"aws-credential-types",
|
||||
"aws-sdk-iam",
|
||||
"aws-sigv4",
|
||||
"base64 0.22.1",
|
||||
@@ -5452,6 +5303,7 @@ dependencies = [
|
||||
"clashmap",
|
||||
"compute_api",
|
||||
"consumption_metrics",
|
||||
"criterion",
|
||||
"ecdsa 0.16.9",
|
||||
"ed25519-dalek",
|
||||
"env_logger",
|
||||
@@ -5471,7 +5323,7 @@ dependencies = [
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.30",
|
||||
"hyper 1.6.0",
|
||||
"hyper 1.4.1",
|
||||
"hyper-util",
|
||||
"indexmap 2.9.0",
|
||||
"ipnet",
|
||||
@@ -5479,6 +5331,7 @@ dependencies = [
|
||||
"itoa",
|
||||
"jose-jwa",
|
||||
"jose-jwk",
|
||||
"json",
|
||||
"lasso",
|
||||
"measured",
|
||||
"metrics",
|
||||
@@ -5495,7 +5348,7 @@ dependencies = [
|
||||
"postgres_backend",
|
||||
"pq_proto",
|
||||
"rand 0.8.5",
|
||||
"rand_distr 0.4.3",
|
||||
"rand_distr",
|
||||
"rcgen",
|
||||
"redis",
|
||||
"regex",
|
||||
@@ -5599,12 +5452,6 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "r-efi"
|
||||
version = "5.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.7.3"
|
||||
@@ -5629,16 +5476,6 @@ dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
|
||||
dependencies = [
|
||||
"rand_chacha 0.9.0",
|
||||
"rand_core 0.9.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.2.2"
|
||||
@@ -5659,16 +5496,6 @@ dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.9.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.5.1"
|
||||
@@ -5687,15 +5514,6 @@ dependencies = [
|
||||
"getrandom 0.2.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
|
||||
dependencies = [
|
||||
"getrandom 0.3.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_distr"
|
||||
version = "0.4.3"
|
||||
@@ -5706,16 +5524,6 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_distr"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
"rand 0.9.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_hc"
|
||||
version = "0.2.0"
|
||||
@@ -5914,7 +5722,7 @@ dependencies = [
|
||||
"http-body-util",
|
||||
"http-types",
|
||||
"humantime-serde",
|
||||
"hyper 1.6.0",
|
||||
"hyper 1.4.1",
|
||||
"itertools 0.10.5",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
@@ -5954,7 +5762,7 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"hyper 1.6.0",
|
||||
"hyper 1.4.1",
|
||||
"hyper-rustls 0.26.0",
|
||||
"hyper-util",
|
||||
"ipnet",
|
||||
@@ -6011,7 +5819,7 @@ dependencies = [
|
||||
"futures",
|
||||
"getrandom 0.2.11",
|
||||
"http 1.1.0",
|
||||
"hyper 1.6.0",
|
||||
"hyper 1.4.1",
|
||||
"parking_lot 0.11.2",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
@@ -6032,7 +5840,7 @@ dependencies = [
|
||||
"async-trait",
|
||||
"getrandom 0.2.11",
|
||||
"http 1.1.0",
|
||||
"matchit 0.8.4",
|
||||
"matchit",
|
||||
"opentelemetry",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
@@ -6404,6 +6212,7 @@ dependencies = [
|
||||
"postgres-protocol",
|
||||
"postgres_backend",
|
||||
"postgres_ffi",
|
||||
"postgres_ffi_types",
|
||||
"postgres_versioninfo",
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
@@ -6448,7 +6257,7 @@ dependencies = [
|
||||
"anyhow",
|
||||
"const_format",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"postgres_ffi_types",
|
||||
"postgres_versioninfo",
|
||||
"pq_proto",
|
||||
"serde",
|
||||
@@ -6519,12 +6328,6 @@ version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "621e3680f3e07db4c9c2c3fb07c6223ab2fab2e54bd3c04c3ae037990f428c32"
|
||||
|
||||
[[package]]
|
||||
name = "seahash"
|
||||
version = "4.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
|
||||
|
||||
[[package]]
|
||||
name = "sec1"
|
||||
version = "0.3.0"
|
||||
@@ -6986,12 +6789,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "socket2"
|
||||
version = "0.5.10"
|
||||
version = "0.5.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678"
|
||||
checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"windows-sys 0.52.0",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6999,9 +6802,6 @@ name = "spin"
|
||||
version = "0.9.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "spinning_top"
|
||||
@@ -7060,7 +6860,7 @@ dependencies = [
|
||||
"http-body-util",
|
||||
"http-utils",
|
||||
"humantime",
|
||||
"hyper 1.6.0",
|
||||
"hyper 1.4.1",
|
||||
"hyper-util",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
@@ -7196,6 +6996,7 @@ dependencies = [
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"reqwest",
|
||||
"safekeeper_api",
|
||||
"serde_json",
|
||||
"storage_controller_client",
|
||||
"tokio",
|
||||
@@ -7669,16 +7470,6 @@ dependencies = [
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-pipe"
|
||||
version = "0.2.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.10"
|
||||
@@ -7775,6 +7566,7 @@ dependencies = [
|
||||
"futures-core",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7873,25 +7665,16 @@ version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
|
||||
dependencies = [
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"axum 0.7.9",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"h2 0.4.4",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"hyper 1.6.0",
|
||||
"hyper-timeout",
|
||||
"hyper-util",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"prost 0.13.5",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tower 0.4.13",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
@@ -7904,7 +7687,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"axum 0.8.1",
|
||||
"axum",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"flate2",
|
||||
@@ -7912,7 +7695,7 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"hyper 1.6.0",
|
||||
"hyper 1.4.1",
|
||||
"hyper-timeout",
|
||||
"hyper-util",
|
||||
"percent-encoding",
|
||||
@@ -7965,16 +7748,11 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"indexmap 1.9.3",
|
||||
"pin-project",
|
||||
"pin-project-lite",
|
||||
"rand 0.8.5",
|
||||
"slab",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -8245,15 +8023,6 @@ dependencies = [
|
||||
"static_assertions",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "twox-hash"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b907da542cbced5261bd3256de1b3a1bf340a3d37f93425a07362a1d687de56"
|
||||
dependencies = [
|
||||
"rand 0.9.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typed-json"
|
||||
version = "0.1.1"
|
||||
@@ -8467,7 +8236,7 @@ name = "vm_monitor"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum 0.8.1",
|
||||
"axum",
|
||||
"cgroups-rs",
|
||||
"clap",
|
||||
"futures",
|
||||
@@ -8579,15 +8348,6 @@ version = "0.11.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.14.2+wasi-0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
|
||||
dependencies = [
|
||||
"wit-bindgen-rt",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasite"
|
||||
version = "0.1.0"
|
||||
@@ -8945,15 +8705,6 @@ dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen-rt"
|
||||
version = "0.39.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
|
||||
dependencies = [
|
||||
"bitflags 2.8.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "workspace_hack"
|
||||
version = "0.1.0"
|
||||
@@ -8961,8 +8712,8 @@ dependencies = [
|
||||
"ahash",
|
||||
"anstream",
|
||||
"anyhow",
|
||||
"axum 0.8.1",
|
||||
"axum-core 0.5.0",
|
||||
"axum",
|
||||
"axum-core",
|
||||
"base64 0.21.7",
|
||||
"base64ct",
|
||||
"bytes",
|
||||
@@ -8996,7 +8747,7 @@ dependencies = [
|
||||
"hex",
|
||||
"hmac",
|
||||
"hyper 0.14.30",
|
||||
"hyper 1.6.0",
|
||||
"hyper 1.4.1",
|
||||
"hyper-util",
|
||||
"indexmap 2.9.0",
|
||||
"itertools 0.12.1",
|
||||
@@ -9121,12 +8872,6 @@ version = "0.13.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd"
|
||||
|
||||
[[package]]
|
||||
name = "xxhash-rust"
|
||||
version = "0.8.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3"
|
||||
|
||||
[[package]]
|
||||
name = "yasna"
|
||||
version = "0.5.2"
|
||||
|
||||
10
Cargo.toml
10
Cargo.toml
@@ -35,7 +35,6 @@ members = [
|
||||
"libs/pq_proto",
|
||||
"libs/tenant_size_model",
|
||||
"libs/metrics",
|
||||
"libs/neonart",
|
||||
"libs/postgres_connection",
|
||||
"libs/remote_storage",
|
||||
"libs/tracing-utils",
|
||||
@@ -93,7 +92,6 @@ clap = { version = "4.0", features = ["derive", "env"] }
|
||||
clashmap = { version = "1.0", features = ["raw-api"] }
|
||||
comfy-table = "7.1"
|
||||
const_format = "0.2"
|
||||
crossbeam-utils = "0.8.21"
|
||||
crc32c = "0.6"
|
||||
diatomic-waker = { version = "0.2.3" }
|
||||
either = "1.8"
|
||||
@@ -152,7 +150,6 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "53"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pem = "3.0.3"
|
||||
peekable = "0.3.0"
|
||||
pin-project-lite = "0.2"
|
||||
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
|
||||
procfs = "0.16"
|
||||
@@ -189,7 +186,6 @@ smallvec = "1.11"
|
||||
smol_str = { version = "0.2.0", features = ["serde"] }
|
||||
socket2 = "0.5"
|
||||
spki = "0.7.3"
|
||||
spin = "0.9.8"
|
||||
strum = "0.26"
|
||||
strum_macros = "0.26"
|
||||
"subtle" = "2.5.0"
|
||||
@@ -201,10 +197,11 @@ thiserror = "1.0"
|
||||
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
|
||||
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
|
||||
tokio = { version = "1.43.1", features = ["macros"] }
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.12.0"
|
||||
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
|
||||
tokio-stream = "0.1"
|
||||
tokio-stream = { version = "0.1", features = ["sync"] }
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
|
||||
toml = "0.8"
|
||||
@@ -242,9 +239,6 @@ x509-cert = { version = "0.2.5" }
|
||||
env_logger = "0.11"
|
||||
log = "0.4"
|
||||
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
|
||||
@@ -35,7 +35,7 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
|
||||
echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
|
||||
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
|
||||
|
||||
COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
|
||||
COPY build-tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
|
||||
|
||||
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
|
||||
set -e && \
|
||||
@@ -9,7 +9,7 @@
|
||||
#
|
||||
# build-tools: This contains Rust compiler toolchain and other tools needed at compile
|
||||
# time. This is also used for the storage builds. This image is defined in
|
||||
# build-tools.Dockerfile.
|
||||
# build-tools/Dockerfile.
|
||||
#
|
||||
# build-deps: Contains C compiler, other build tools, and compile-time dependencies
|
||||
# needed to compile PostgreSQL and most extensions. (Some extensions need
|
||||
@@ -115,7 +115,7 @@ ARG EXTENSIONS=all
|
||||
FROM $BASE_IMAGE_SHA AS build-deps
|
||||
ARG DEBIAN_VERSION
|
||||
|
||||
# Keep in sync with build-tools.Dockerfile
|
||||
# Keep in sync with build-tools/Dockerfile
|
||||
ENV PROTOC_VERSION=25.1
|
||||
|
||||
# Use strict mode for bash to catch errors early
|
||||
@@ -1790,7 +1790,7 @@ RUN set -e \
|
||||
#########################################################################################
|
||||
FROM build-deps AS exporters
|
||||
ARG TARGETARCH
|
||||
# Keep sql_exporter version same as in build-tools.Dockerfile and
|
||||
# Keep sql_exporter version same as in build-tools/Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py
|
||||
# See comment on the top of the file regading `echo`, `-e` and `\n`
|
||||
RUN if [ "$TARGETARCH" = "amd64" ]; then\
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use anyhow::{Context, Result, anyhow};
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use compute_api::privilege::Privilege;
|
||||
use compute_api::responses::{
|
||||
@@ -6,8 +6,7 @@ use compute_api::responses::{
|
||||
LfcPrewarmState, PromoteState, TlsConfig,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverConnectionInfo,
|
||||
PageserverShardConnectionInfo, PgIdent,
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
|
||||
};
|
||||
use futures::StreamExt;
|
||||
use futures::future::join_all;
|
||||
@@ -226,7 +225,7 @@ pub struct ParsedSpec {
|
||||
pub spec: ComputeSpec,
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub pageserver_conninfo: PageserverConnectionInfo,
|
||||
pub pageserver_connstr: String,
|
||||
pub safekeeper_connstrings: Vec<String>,
|
||||
pub storage_auth_token: Option<String>,
|
||||
/// k8s dns name and port
|
||||
@@ -273,27 +272,6 @@ impl ParsedSpec {
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_pageserver_conninfo_from_guc(
|
||||
pageserver_connstring_guc: &str,
|
||||
) -> PageserverConnectionInfo {
|
||||
PageserverConnectionInfo {
|
||||
shards: pageserver_connstring_guc
|
||||
.split(',')
|
||||
.enumerate()
|
||||
.map(|(i, connstr)| {
|
||||
(
|
||||
i as u32,
|
||||
PageserverShardConnectionInfo {
|
||||
libpq_url: Some(connstr.to_string()),
|
||||
grpc_url: None,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect(),
|
||||
prefer_grpc: false,
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
type Error = String;
|
||||
fn try_from(spec: ComputeSpec) -> Result<Self, String> {
|
||||
@@ -303,17 +281,11 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
// For backwards-compatibility, the top-level fields in the spec file
|
||||
// may be empty. In that case, we need to dig them from the GUCs in the
|
||||
// cluster.settings field.
|
||||
let pageserver_conninfo = match &spec.pageserver_connection_info {
|
||||
Some(x) => x.clone(),
|
||||
None => {
|
||||
if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") {
|
||||
extract_pageserver_conninfo_from_guc(&guc)
|
||||
} else {
|
||||
return Err("pageserver connstr should be provided".to_string());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let pageserver_connstr = spec
|
||||
.pageserver_connstring
|
||||
.clone()
|
||||
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
|
||||
.ok_or("pageserver connstr should be provided")?;
|
||||
let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
|
||||
if matches!(spec.mode, ComputeMode::Primary) {
|
||||
spec.cluster
|
||||
@@ -363,7 +335,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
|
||||
let res = ParsedSpec {
|
||||
spec,
|
||||
pageserver_conninfo,
|
||||
pageserver_connstr,
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token,
|
||||
tenant_id,
|
||||
@@ -453,7 +425,7 @@ impl ComputeNode {
|
||||
|
||||
let mut new_state = ComputeState::new();
|
||||
if let Some(spec) = config.spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow!(msg))?;
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
new_state.pspec = Some(pspec);
|
||||
}
|
||||
|
||||
@@ -1060,13 +1032,16 @@ impl ComputeNode {
|
||||
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
let started = Instant::now();
|
||||
let (connected, size) = if spec.pageserver_conninfo.prefer_grpc {
|
||||
self.try_get_basebackup_grpc(spec, lsn)?
|
||||
} else {
|
||||
self.try_get_basebackup_libpq(spec, lsn)?
|
||||
|
||||
let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? {
|
||||
PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
|
||||
PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
|
||||
};
|
||||
|
||||
self.fix_zenith_signal_neon_signal()?;
|
||||
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.pageserver_connect_micros =
|
||||
connected.duration_since(started).as_micros() as u64;
|
||||
@@ -1076,24 +1051,44 @@ impl ComputeNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Move the Zenith signal file to Neon signal file location.
|
||||
/// This makes Compute compatible with older PageServers that don't yet
|
||||
/// know about the Zenith->Neon rename.
|
||||
fn fix_zenith_signal_neon_signal(&self) -> Result<()> {
|
||||
let datadir = Path::new(&self.params.pgdata);
|
||||
|
||||
let neonsig = datadir.join("neon.signal");
|
||||
|
||||
if neonsig.is_file() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let zenithsig = datadir.join("zenith.signal");
|
||||
|
||||
if zenithsig.is_file() {
|
||||
fs::copy(zenithsig, neonsig)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
|
||||
/// the connection was established, and the (compressed) size of the basebackup.
|
||||
fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
|
||||
let shard0 = spec
|
||||
.pageserver_conninfo
|
||||
.shards
|
||||
.get(&0)
|
||||
.expect("shard 0 connection info missing");
|
||||
let shard0_url = shard0.grpc_url.clone().expect("no grpc_url for shard 0");
|
||||
|
||||
let shard_index = match spec.pageserver_conninfo.shards.len() as u8 {
|
||||
let shard0_connstr = spec
|
||||
.pageserver_connstr
|
||||
.split(',')
|
||||
.next()
|
||||
.unwrap()
|
||||
.to_string();
|
||||
let shard_index = match spec.pageserver_connstr.split(',').count() as u8 {
|
||||
0 | 1 => ShardIndex::unsharded(),
|
||||
count => ShardIndex::new(ShardNumber(0), ShardCount(count)),
|
||||
};
|
||||
|
||||
let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
|
||||
let mut client = page_api::Client::connect(
|
||||
shard0_url,
|
||||
shard0_connstr,
|
||||
spec.tenant_id,
|
||||
spec.timeline_id,
|
||||
shard_index,
|
||||
@@ -1128,13 +1123,8 @@ impl ComputeNode {
|
||||
/// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp
|
||||
/// when the connection was established, and the (compressed) size of the basebackup.
|
||||
fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
|
||||
let shard0 = spec
|
||||
.pageserver_conninfo
|
||||
.shards
|
||||
.get(&0)
|
||||
.expect("shard 0 connection info missing");
|
||||
let shard0_connstr = shard0.libpq_url.clone().expect("no libpq_url for shard 0");
|
||||
let mut config = postgres::Config::from_str(&shard0_connstr)?;
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
let mut config = postgres::Config::from_str(shard0_connstr)?;
|
||||
|
||||
// Use the storage auth token from the config file, if given.
|
||||
// Note: this overrides any password set in the connection string.
|
||||
@@ -1220,7 +1210,10 @@ impl ComputeNode {
|
||||
return result;
|
||||
}
|
||||
Err(ref e) if attempts < max_attempts => {
|
||||
warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
|
||||
warn!(
|
||||
"Failed to get basebackup: {} (attempt {}/{})",
|
||||
e, attempts, max_attempts
|
||||
);
|
||||
std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
|
||||
retry_period_ms *= 1.5;
|
||||
}
|
||||
@@ -1293,9 +1286,7 @@ impl ComputeNode {
|
||||
|
||||
// In case of error, log and fail the check, but don't crash.
|
||||
// We're playing it safe because these errors could be transient
|
||||
// and we don't yet retry. Also being careful here allows us to
|
||||
// be backwards compatible with safekeepers that don't have the
|
||||
// TIMELINE_STATUS API yet.
|
||||
// and we don't yet retry.
|
||||
if responses.len() < quorum {
|
||||
error!(
|
||||
"failed sync safekeepers check {:?} {:?} {:?}",
|
||||
@@ -1429,8 +1420,16 @@ impl ComputeNode {
|
||||
}
|
||||
};
|
||||
|
||||
self.get_basebackup(compute_state, lsn)
|
||||
.with_context(|| format!("failed to get basebackup@{lsn}"))?;
|
||||
info!(
|
||||
"getting basebackup@{} from pageserver {}",
|
||||
lsn, &pspec.pageserver_connstr
|
||||
);
|
||||
self.get_basebackup(compute_state, lsn).with_context(|| {
|
||||
format!(
|
||||
"failed to get basebackup@{} from pageserver {}",
|
||||
lsn, &pspec.pageserver_connstr
|
||||
)
|
||||
})?;
|
||||
|
||||
// Update pg_hba.conf received with basebackup.
|
||||
update_pg_hba(pgdata_path)?;
|
||||
@@ -2096,7 +2095,7 @@ LIMIT 100",
|
||||
self.params
|
||||
.remote_ext_base_url
|
||||
.as_ref()
|
||||
.ok_or(DownloadError::BadInput(anyhow!(
|
||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||
"Remote extensions storage is not configured",
|
||||
)))?;
|
||||
|
||||
@@ -2292,7 +2291,7 @@ LIMIT 100",
|
||||
let remote_extensions = spec
|
||||
.remote_extensions
|
||||
.as_ref()
|
||||
.ok_or(anyhow!("Remote extensions are not configured"))?;
|
||||
.ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;
|
||||
|
||||
info!("parse shared_preload_libraries from spec.cluster.settings");
|
||||
let mut libs_vec = Vec::new();
|
||||
@@ -2371,22 +2370,22 @@ LIMIT 100",
|
||||
/// The operation will time out after a specified duration.
|
||||
pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
|
||||
let state = self.state.lock().unwrap();
|
||||
let old_pageserver_conninfo = state
|
||||
let old_pageserver_connstr = state
|
||||
.pspec
|
||||
.as_ref()
|
||||
.expect("spec must be set")
|
||||
.pageserver_conninfo
|
||||
.pageserver_connstr
|
||||
.clone();
|
||||
let mut unchanged = true;
|
||||
let _ = self
|
||||
.state_changed
|
||||
.wait_timeout_while(state, duration, |s| {
|
||||
let pageserver_conninfo = &s
|
||||
let pageserver_connstr = &s
|
||||
.pspec
|
||||
.as_ref()
|
||||
.expect("spec must be set")
|
||||
.pageserver_conninfo;
|
||||
unchanged = pageserver_conninfo == &old_pageserver_conninfo;
|
||||
.pageserver_connstr;
|
||||
unchanged = pageserver_connstr == &old_pageserver_connstr;
|
||||
unchanged
|
||||
})
|
||||
.unwrap();
|
||||
@@ -2486,7 +2485,7 @@ pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> {
|
||||
serde_json::to_string(&extensions).expect("failed to serialize extensions list")
|
||||
);
|
||||
}
|
||||
Err(err) => error!("could not get installed extensions: {err:?}"),
|
||||
Err(err) => error!("could not get installed extensions: {err}"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -56,51 +56,9 @@ pub fn write_postgres_conf(
|
||||
|
||||
// Add options for connecting to storage
|
||||
writeln!(file, "# Neon storage settings")?;
|
||||
|
||||
if let Some(conninfo) = &spec.pageserver_connection_info {
|
||||
let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
|
||||
let mut grpc_urls: Option<Vec<String>> = Some(Vec::new());
|
||||
|
||||
for shardno in 0..conninfo.shards.len() {
|
||||
let info = conninfo.shards.get(&(shardno as u32)).ok_or_else(|| {
|
||||
anyhow::anyhow!("shard {shardno} missing from pageserver_connection_info shard map")
|
||||
})?;
|
||||
|
||||
if let Some(url) = &info.libpq_url {
|
||||
if let Some(ref mut urls) = libpq_urls {
|
||||
urls.push(url.clone());
|
||||
}
|
||||
} else {
|
||||
libpq_urls = None
|
||||
}
|
||||
if let Some(url) = &info.grpc_url {
|
||||
if let Some(ref mut urls) = grpc_urls {
|
||||
urls.push(url.clone());
|
||||
}
|
||||
} else {
|
||||
grpc_urls = None
|
||||
}
|
||||
}
|
||||
if let Some(libpq_urls) = libpq_urls {
|
||||
writeln!(
|
||||
file,
|
||||
"neon.pageserver_connstring={}",
|
||||
escape_conf_value(&libpq_urls.join(","))
|
||||
)?;
|
||||
} else {
|
||||
writeln!(file, "# no neon.pageserver_connstring")?;
|
||||
}
|
||||
if let Some(grpc_urls) = grpc_urls {
|
||||
writeln!(
|
||||
file,
|
||||
"neon.pageserver_grpc_urls={}",
|
||||
escape_conf_value(&grpc_urls.join(","))
|
||||
)?;
|
||||
} else {
|
||||
writeln!(file, "# no neon.pageserver_grpc_urls")?;
|
||||
}
|
||||
if let Some(s) = &spec.pageserver_connstring {
|
||||
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
|
||||
}
|
||||
|
||||
if let Some(stripe_size) = spec.shard_stripe_size {
|
||||
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::collections::HashMap;
|
||||
|
||||
use anyhow::Result;
|
||||
use compute_api::responses::{InstalledExtension, InstalledExtensions};
|
||||
use tokio_postgres::error::Error as PostgresError;
|
||||
use tokio_postgres::{Client, Config, NoTls};
|
||||
|
||||
use crate::metrics::INSTALLED_EXTENSIONS;
|
||||
@@ -10,7 +11,7 @@ use crate::metrics::INSTALLED_EXTENSIONS;
|
||||
/// and to make database listing query here more explicit.
|
||||
///
|
||||
/// Limit the number of databases to 500 to avoid excessive load.
|
||||
async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
|
||||
async fn list_dbs(client: &mut Client) -> Result<Vec<String>, PostgresError> {
|
||||
// `pg_database.datconnlimit = -2` means that the database is in the
|
||||
// invalid state
|
||||
let databases = client
|
||||
@@ -37,7 +38,9 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
|
||||
/// Same extension can be installed in multiple databases with different versions,
|
||||
/// so we report a separate metric (number of databases where it is installed)
|
||||
/// for each extension version.
|
||||
pub async fn get_installed_extensions(mut conf: Config) -> Result<InstalledExtensions> {
|
||||
pub async fn get_installed_extensions(
|
||||
mut conf: Config,
|
||||
) -> Result<InstalledExtensions, PostgresError> {
|
||||
conf.application_name("compute_ctl:get_installed_extensions");
|
||||
let databases: Vec<String> = {
|
||||
let (mut client, connection) = conf.connect(NoTls).await?;
|
||||
|
||||
@@ -4,7 +4,8 @@ use std::thread;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use anyhow::{Result, bail};
|
||||
use compute_api::spec::{ComputeMode, PageserverConnectionInfo};
|
||||
use compute_api::spec::{ComputeMode, PageserverProtocol};
|
||||
use itertools::Itertools as _;
|
||||
use pageserver_page_api as page_api;
|
||||
use postgres::{NoTls, SimpleQueryMessage};
|
||||
use tracing::{info, warn};
|
||||
@@ -77,16 +78,17 @@ fn acquire_lsn_lease_with_retry(
|
||||
|
||||
loop {
|
||||
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
|
||||
let (conninfo, auth) = {
|
||||
let (connstrings, auth) = {
|
||||
let state = compute.state.lock().unwrap();
|
||||
let spec = state.pspec.as_ref().expect("spec must be set");
|
||||
(
|
||||
spec.pageserver_conninfo.clone(),
|
||||
spec.pageserver_connstr.clone(),
|
||||
spec.storage_auth_token.clone(),
|
||||
)
|
||||
};
|
||||
|
||||
let result = try_acquire_lsn_lease(conninfo, auth.as_deref(), tenant_id, timeline_id, lsn);
|
||||
let result =
|
||||
try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn);
|
||||
match result {
|
||||
Ok(Some(res)) => {
|
||||
return Ok(res);
|
||||
@@ -110,16 +112,17 @@ fn acquire_lsn_lease_with_retry(
|
||||
|
||||
/// Tries to acquire LSN leases on all Pageserver shards.
|
||||
fn try_acquire_lsn_lease(
|
||||
conninfo: PageserverConnectionInfo,
|
||||
connstrings: &str,
|
||||
auth: Option<&str>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Option<SystemTime>> {
|
||||
let shard_count = conninfo.shards.len();
|
||||
let connstrings = connstrings.split(',').collect_vec();
|
||||
let shard_count = connstrings.len();
|
||||
let mut leases = Vec::new();
|
||||
|
||||
for (shard_number, shard) in conninfo.shards.into_iter() {
|
||||
for (shard_number, &connstring) in connstrings.iter().enumerate() {
|
||||
let tenant_shard_id = match shard_count {
|
||||
0 | 1 => TenantShardId::unsharded(tenant_id),
|
||||
shard_count => TenantShardId {
|
||||
@@ -129,22 +132,13 @@ fn try_acquire_lsn_lease(
|
||||
},
|
||||
};
|
||||
|
||||
let lease = if conninfo.prefer_grpc {
|
||||
acquire_lsn_lease_grpc(
|
||||
&shard.grpc_url.unwrap(),
|
||||
auth,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
)?
|
||||
} else {
|
||||
acquire_lsn_lease_libpq(
|
||||
&shard.libpq_url.unwrap(),
|
||||
auth,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
)?
|
||||
let lease = match PageserverProtocol::from_connstring(connstring)? {
|
||||
PageserverProtocol::Libpq => {
|
||||
acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)?
|
||||
}
|
||||
PageserverProtocol::Grpc => {
|
||||
acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)?
|
||||
}
|
||||
};
|
||||
leases.push(lease);
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ use std::time::Duration;
|
||||
use anyhow::{Context, Result, anyhow, bail};
|
||||
use clap::Parser;
|
||||
use compute_api::requests::ComputeClaimsScope;
|
||||
use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverShardConnectionInfo};
|
||||
use compute_api::spec::{ComputeMode, PageserverProtocol};
|
||||
use control_plane::broker::StorageBroker;
|
||||
use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode};
|
||||
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
|
||||
@@ -1516,35 +1516,29 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
)?;
|
||||
}
|
||||
|
||||
let (shards, stripe_size) = if let Some(ps_id) = pageserver_id {
|
||||
let conf = env.get_pageserver_conf(ps_id).unwrap();
|
||||
let libpq_url = Some({
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
|
||||
let port = port.unwrap_or(5432);
|
||||
format!("postgres://no_user@{host}:{port}")
|
||||
});
|
||||
let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
|
||||
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
|
||||
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
|
||||
// Use gRPC if requested.
|
||||
let pageserver = if endpoint.grpc {
|
||||
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
|
||||
let (host, port) = parse_host_port(grpc_addr)?;
|
||||
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
|
||||
Some(format!("grpc://no_user@{host}:{port}"))
|
||||
(PageserverProtocol::Grpc, host, port)
|
||||
} else {
|
||||
None
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
|
||||
let port = port.unwrap_or(5432);
|
||||
(PageserverProtocol::Libpq, host, port)
|
||||
};
|
||||
let pageserver = PageserverShardConnectionInfo {
|
||||
libpq_url,
|
||||
grpc_url,
|
||||
};
|
||||
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// fully managed by storage controller, therefore not sharded.
|
||||
(vec![(0, pageserver)], DEFAULT_STRIPE_SIZE)
|
||||
(vec![pageserver], DEFAULT_STRIPE_SIZE)
|
||||
} else {
|
||||
// Look up the currently attached location of the tenant, and its striping metadata,
|
||||
// to pass these on to postgres.
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
|
||||
let shards = futures::future::try_join_all(locate_result.shards.into_iter().map(
|
||||
|shard| async move {
|
||||
let pageservers = futures::future::try_join_all(
|
||||
locate_result.shards.into_iter().map(|shard| async move {
|
||||
if let ComputeMode::Static(lsn) = endpoint.mode {
|
||||
// Initialize LSN leases for static computes.
|
||||
let conf = env.get_pageserver_conf(shard.node_id).unwrap();
|
||||
@@ -1556,34 +1550,28 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.await?;
|
||||
}
|
||||
|
||||
let libpq_host = Host::parse(&shard.listen_pg_addr)?;
|
||||
let libpq_port = shard.listen_pg_port;
|
||||
let libpq_url =
|
||||
Some(format!("postgres://no_user@{libpq_host}:{libpq_port}"));
|
||||
|
||||
let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr {
|
||||
let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
|
||||
Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
|
||||
let pageserver = if endpoint.grpc {
|
||||
(
|
||||
PageserverProtocol::Grpc,
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))?,
|
||||
shard.listen_grpc_port.expect("no gRPC port"),
|
||||
)
|
||||
} else {
|
||||
None
|
||||
(
|
||||
PageserverProtocol::Libpq,
|
||||
Host::parse(&shard.listen_pg_addr)?,
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
};
|
||||
let pageserver = PageserverShardConnectionInfo {
|
||||
libpq_url,
|
||||
grpc_url,
|
||||
};
|
||||
anyhow::Ok((shard.shard_id.shard_number.0 as u32, pageserver))
|
||||
},
|
||||
))
|
||||
anyhow::Ok(pageserver)
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
let stripe_size = locate_result.shard_params.stripe_size;
|
||||
|
||||
(shards, stripe_size)
|
||||
};
|
||||
assert!(!shards.is_empty());
|
||||
let pageserver_conninfo = PageserverConnectionInfo {
|
||||
shards: shards.into_iter().collect(),
|
||||
prefer_grpc: endpoint.grpc,
|
||||
(pageservers, stripe_size)
|
||||
};
|
||||
assert!(!pageservers.is_empty());
|
||||
|
||||
let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
|
||||
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
|
||||
@@ -1613,7 +1601,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
endpoint_storage_addr,
|
||||
safekeepers_generation,
|
||||
safekeepers,
|
||||
pageserver_conninfo,
|
||||
pageservers,
|
||||
remote_ext_base_url: remote_ext_base_url.clone(),
|
||||
shard_stripe_size: stripe_size.0 as usize,
|
||||
create_test_user: args.create_test_user,
|
||||
@@ -1632,27 +1620,20 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.endpoints
|
||||
.get(endpoint_id.as_str())
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
let shards = if let Some(ps_id) = args.endpoint_pageserver_id {
|
||||
let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
|
||||
let conf = env.get_pageserver_conf(ps_id)?;
|
||||
let libpq_url = Some({
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
|
||||
let port = port.unwrap_or(5432);
|
||||
format!("postgres://no_user@{host}:{port}")
|
||||
});
|
||||
let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
|
||||
// Use gRPC if requested.
|
||||
let pageserver = if endpoint.grpc {
|
||||
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
|
||||
let (host, port) = parse_host_port(grpc_addr)?;
|
||||
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
|
||||
Some(format!("grpc://no_user@{host}:{port}"))
|
||||
(PageserverProtocol::Grpc, host, port)
|
||||
} else {
|
||||
None
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
|
||||
let port = port.unwrap_or(5432);
|
||||
(PageserverProtocol::Libpq, host, port)
|
||||
};
|
||||
let pageserver = PageserverShardConnectionInfo {
|
||||
libpq_url,
|
||||
grpc_url,
|
||||
};
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// fully managed by storage controller, therefore not sharded.
|
||||
vec![(0, pageserver)]
|
||||
vec![pageserver]
|
||||
} else {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
@@ -1662,36 +1643,28 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
// Use gRPC if requested.
|
||||
let libpq_host = Host::parse(&shard.listen_pg_addr).expect("bad hostname");
|
||||
let libpq_port = shard.listen_pg_port;
|
||||
let libpq_url =
|
||||
Some(format!("postgres://no_user@{libpq_host}:{libpq_port}"));
|
||||
|
||||
let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr {
|
||||
let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
|
||||
Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
|
||||
if endpoint.grpc {
|
||||
(
|
||||
PageserverProtocol::Grpc,
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))
|
||||
.expect("bad hostname"),
|
||||
shard.listen_grpc_port.expect("no gRPC port"),
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
(
|
||||
shard.shard_id.shard_number.0 as u32,
|
||||
PageserverShardConnectionInfo {
|
||||
libpq_url,
|
||||
grpc_url,
|
||||
},
|
||||
)
|
||||
(
|
||||
PageserverProtocol::Libpq,
|
||||
Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
let pageserver_conninfo = PageserverConnectionInfo {
|
||||
shards: shards.into_iter().collect(),
|
||||
prefer_grpc: endpoint.grpc,
|
||||
};
|
||||
// If --safekeepers argument is given, use only the listed
|
||||
// safekeeper nodes; otherwise all from the env.
|
||||
let safekeepers = parse_safekeepers(&args.safekeepers)?;
|
||||
endpoint
|
||||
.reconfigure(Some(pageserver_conninfo), None, safekeepers, None)
|
||||
.reconfigure(Some(pageservers), None, safekeepers, None)
|
||||
.await?;
|
||||
}
|
||||
EndpointCmd::Stop(args) => {
|
||||
|
||||
@@ -36,7 +36,7 @@ impl StorageBroker {
|
||||
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
let broker = &self.env.broker;
|
||||
|
||||
print!("Starting neon broker at {}", broker.client_url());
|
||||
println!("Starting neon broker at {}", broker.client_url());
|
||||
|
||||
let mut args = Vec::new();
|
||||
|
||||
|
||||
@@ -32,7 +32,8 @@
|
||||
//! config.json - passed to `compute_ctl`
|
||||
//! pgdata/
|
||||
//! postgresql.conf - copy of postgresql.conf created by `compute_ctl`
|
||||
//! zenith.signal
|
||||
//! neon.signal
|
||||
//! zenith.signal - copy of neon.signal, for backward compatibility
|
||||
//! <other PostgreSQL files>
|
||||
//! ```
|
||||
//!
|
||||
@@ -56,13 +57,9 @@ use compute_api::responses::{
|
||||
TlsConfig,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
|
||||
RemoteExtSpec, Role,
|
||||
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
|
||||
PgIdent, RemoteExtSpec, Role,
|
||||
};
|
||||
|
||||
// re-export these, because they're used in the reconfigure() function
|
||||
pub use compute_api::spec::{PageserverConnectionInfo, PageserverShardConnectionInfo};
|
||||
|
||||
use jsonwebtoken::jwk::{
|
||||
AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
|
||||
OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
|
||||
@@ -78,6 +75,7 @@ use sha2::{Digest, Sha256};
|
||||
use spki::der::Decode;
|
||||
use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
|
||||
use tracing::debug;
|
||||
use url::Host;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
@@ -382,7 +380,7 @@ pub struct EndpointStartArgs {
|
||||
pub endpoint_storage_addr: String,
|
||||
pub safekeepers_generation: Option<SafekeeperGeneration>,
|
||||
pub safekeepers: Vec<NodeId>,
|
||||
pub pageserver_conninfo: PageserverConnectionInfo,
|
||||
pub pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
pub remote_ext_base_url: Option<String>,
|
||||
pub shard_stripe_size: usize,
|
||||
pub create_test_user: bool,
|
||||
@@ -466,7 +464,7 @@ impl Endpoint {
|
||||
conf.append("max_connections", "100");
|
||||
conf.append("wal_level", "logical");
|
||||
// wal_sender_timeout is the maximum time to wait for WAL replication.
|
||||
// It also defines how often the walreciever will send a feedback message to the wal sender.
|
||||
// It also defines how often the walreceiver will send a feedback message to the wal sender.
|
||||
conf.append("wal_sender_timeout", "5s");
|
||||
conf.append("listen_addresses", &self.pg_address.ip().to_string());
|
||||
conf.append("port", &self.pg_address.port().to_string());
|
||||
@@ -656,6 +654,14 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
|
||||
pageservers
|
||||
.iter()
|
||||
.map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}"))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
}
|
||||
|
||||
/// Map safekeepers ids to the actual connection strings.
|
||||
fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
|
||||
let mut safekeeper_connstrings = Vec::new();
|
||||
@@ -701,6 +707,9 @@ impl Endpoint {
|
||||
std::fs::remove_dir_all(self.pgdata())?;
|
||||
}
|
||||
|
||||
let pageserver_connstring = Self::build_pageserver_connstr(&args.pageservers);
|
||||
assert!(!pageserver_connstring.is_empty());
|
||||
|
||||
let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?;
|
||||
|
||||
// check for file remote_extensions_spec.json
|
||||
@@ -759,7 +768,7 @@ impl Endpoint {
|
||||
branch_id: None,
|
||||
endpoint_id: Some(self.endpoint_id.clone()),
|
||||
mode: self.mode,
|
||||
pageserver_connection_info: Some(args.pageserver_conninfo),
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: args.auth_token.clone(),
|
||||
@@ -973,7 +982,7 @@ impl Endpoint {
|
||||
|
||||
pub async fn reconfigure(
|
||||
&self,
|
||||
pageserver_conninfo: Option<PageserverConnectionInfo>,
|
||||
pageservers: Option<Vec<(PageserverProtocol, Host, u16)>>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
safekeepers: Option<Vec<NodeId>>,
|
||||
safekeeper_generation: Option<SafekeeperGeneration>,
|
||||
@@ -989,17 +998,15 @@ impl Endpoint {
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
|
||||
if let Some(pageserver_conninfo) = pageserver_conninfo {
|
||||
// If pageservers are provided, we need to ensure that they are not empty.
|
||||
// This is a requirement for the compute_ctl configuration.
|
||||
anyhow::ensure!(
|
||||
!pageserver_conninfo.shards.is_empty(),
|
||||
"no pageservers provided"
|
||||
);
|
||||
spec.pageserver_connection_info = Some(pageserver_conninfo);
|
||||
}
|
||||
if stripe_size.is_some() {
|
||||
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
|
||||
// If pageservers are not specified, don't change them.
|
||||
if let Some(pageservers) = pageservers {
|
||||
anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
|
||||
|
||||
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
|
||||
spec.pageserver_connstring = Some(pageserver_connstr);
|
||||
if stripe_size.is_some() {
|
||||
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
|
||||
}
|
||||
}
|
||||
|
||||
// If safekeepers are not specified, don't change them.
|
||||
@@ -1048,7 +1055,7 @@ impl Endpoint {
|
||||
|
||||
pub async fn reconfigure_pageservers(
|
||||
&self,
|
||||
pageservers: PageserverConnectionInfo,
|
||||
pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
) -> Result<()> {
|
||||
self.reconfigure(Some(pageservers), stripe_size, None, None)
|
||||
|
||||
@@ -217,6 +217,9 @@ pub struct NeonStorageControllerConf {
|
||||
pub posthog_config: Option<PostHogConfig>,
|
||||
|
||||
pub kick_secondary_downloads: Option<bool>,
|
||||
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub shard_split_request_timeout: Option<Duration>,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerConf {
|
||||
@@ -250,6 +253,7 @@ impl Default for NeonStorageControllerConf {
|
||||
timeline_safekeeper_count: None,
|
||||
posthog_config: None,
|
||||
kick_secondary_downloads: None,
|
||||
shard_split_request_timeout: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -303,7 +303,7 @@ impl PageServerNode {
|
||||
async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
println!(
|
||||
"Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
|
||||
self.conf.id,
|
||||
self.pg_connection_config.raw_address(),
|
||||
|
||||
@@ -127,7 +127,7 @@ impl SafekeeperNode {
|
||||
extra_opts: &[String],
|
||||
retry_timeout: &Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
print!(
|
||||
println!(
|
||||
"Starting safekeeper at '{}' in '{}', retrying for {:?}",
|
||||
self.pg_connection_config.raw_address(),
|
||||
self.datadir_path().display(),
|
||||
|
||||
@@ -648,6 +648,13 @@ impl StorageController {
|
||||
args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
|
||||
}
|
||||
|
||||
if let Some(duration) = self.config.shard_split_request_timeout {
|
||||
args.push(format!(
|
||||
"--shard-split-request-timeout={}",
|
||||
humantime::Duration::from(duration)
|
||||
));
|
||||
}
|
||||
|
||||
let mut envs = vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
@@ -660,7 +667,7 @@ impl StorageController {
|
||||
));
|
||||
}
|
||||
|
||||
println!("Starting storage controller");
|
||||
println!("Starting storage controller at {scheme}://{host}:{listen_port}");
|
||||
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
|
||||
@@ -14,6 +14,7 @@ humantime.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
reqwest.workspace = true
|
||||
safekeeper_api.workspace=true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
storage_controller_client.workspace = true
|
||||
tokio.workspace = true
|
||||
|
||||
@@ -11,7 +11,7 @@ use pageserver_api::controller_api::{
|
||||
PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest,
|
||||
ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
|
||||
SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse, TimelineSafekeeperMigrateRequest,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig,
|
||||
@@ -21,6 +21,7 @@ use pageserver_api::models::{
|
||||
use pageserver_api::shard::{ShardStripeSize, TenantShardId};
|
||||
use pageserver_client::mgmt_api::{self};
|
||||
use reqwest::{Certificate, Method, StatusCode, Url};
|
||||
use safekeeper_api::models::TimelineLocateResponse;
|
||||
use storage_controller_client::control_api::Client;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
@@ -279,6 +280,23 @@ enum Command {
|
||||
#[arg(long)]
|
||||
concurrency: Option<usize>,
|
||||
},
|
||||
/// Locate safekeepers for a timeline from the storcon DB.
|
||||
TimelineLocate {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
timeline_id: TimelineId,
|
||||
},
|
||||
/// Migrate a timeline to a new set of safekeepers
|
||||
TimelineSafekeeperMigrate {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
timeline_id: TimelineId,
|
||||
/// Example: --new-sk-set 1,2,3
|
||||
#[arg(long, required = true, value_delimiter = ',')]
|
||||
new_sk_set: Vec<NodeId>,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -458,6 +476,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
listen_http_port,
|
||||
listen_https_port,
|
||||
availability_zone_id: AvailabilityZone(availability_zone_id),
|
||||
node_ip_addr: None,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
@@ -1324,7 +1343,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
concurrency,
|
||||
} => {
|
||||
let mut path = format!(
|
||||
"/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
|
||||
"v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
|
||||
);
|
||||
|
||||
if let Some(c) = concurrency {
|
||||
@@ -1335,6 +1354,41 @@ async fn main() -> anyhow::Result<()> {
|
||||
.dispatch::<(), ()>(Method::POST, path, None)
|
||||
.await?;
|
||||
}
|
||||
Command::TimelineLocate {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} => {
|
||||
let path = format!("debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate");
|
||||
|
||||
let resp = storcon_client
|
||||
.dispatch::<(), TimelineLocateResponse>(Method::GET, path, None)
|
||||
.await?;
|
||||
|
||||
let sk_set = resp.sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
|
||||
let new_sk_set = resp
|
||||
.new_sk_set
|
||||
.as_ref()
|
||||
.map(|ids| ids.iter().map(|id| id.0 as i64).collect::<Vec<_>>());
|
||||
|
||||
println!("generation = {}", resp.generation);
|
||||
println!("sk_set = {sk_set:?}");
|
||||
println!("new_sk_set = {new_sk_set:?}");
|
||||
}
|
||||
Command::TimelineSafekeeperMigrate {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
new_sk_set,
|
||||
} => {
|
||||
let path = format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate");
|
||||
|
||||
storcon_client
|
||||
.dispatch::<_, ()>(
|
||||
Method::POST,
|
||||
path,
|
||||
Some(TimelineSafekeeperMigrateRequest { new_sk_set }),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -129,9 +129,10 @@ segment to bootstrap the WAL writing, but it doesn't contain the checkpoint reco
|
||||
changes in xlog.c, to allow starting the compute node without reading the last checkpoint record
|
||||
from WAL.
|
||||
|
||||
This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start
|
||||
at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last
|
||||
checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo.
|
||||
This includes code to read the `neon.signal` (also `zenith.signal`) file, which tells the startup
|
||||
code the LSN to start at. When the `neon.signal` file is present, the startup uses that LSN
|
||||
instead of the last checkpoint's LSN. The system is known to be consistent at that LSN, without
|
||||
any WAL redo.
|
||||
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
@@ -75,7 +75,7 @@ CLI examples:
|
||||
* AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"`
|
||||
|
||||
For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS.
|
||||
For local S3 installations, refer to the their documentation for name format and credentials.
|
||||
For local S3 installations, refer to their documentation for name format and credentials.
|
||||
|
||||
Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets.
|
||||
Required sections are:
|
||||
|
||||
@@ -105,11 +105,7 @@ pub struct ComputeSpec {
|
||||
// updated to fill these fields, we can make these non optional.
|
||||
pub tenant_id: Option<TenantId>,
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
|
||||
// Pageserver information can be passed in two different ways:
|
||||
// 1. Here
|
||||
// 2. in cluster.settings. This is legacy, we are switching to method 1.
|
||||
pub pageserver_connection_info: Option<PageserverConnectionInfo>,
|
||||
pub pageserver_connstring: Option<String>,
|
||||
|
||||
// More neon ids that we expose to the compute_ctl
|
||||
// and to postgres as neon extension GUCs.
|
||||
@@ -218,20 +214,6 @@ pub enum ComputeFeature {
|
||||
UnknownFeature,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
|
||||
pub struct PageserverConnectionInfo {
|
||||
pub shards: HashMap<u32, PageserverShardConnectionInfo>,
|
||||
|
||||
pub prefer_grpc: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
|
||||
pub struct PageserverShardConnectionInfo {
|
||||
pub libpq_url: Option<String>,
|
||||
pub grpc_url: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
|
||||
pub struct RemoteExtSpec {
|
||||
pub public_extensions: Option<Vec<String>>,
|
||||
@@ -349,12 +331,6 @@ impl ComputeMode {
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for ComputeMode {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.to_type_str())
|
||||
}
|
||||
}
|
||||
|
||||
/// Log level for audit logging
|
||||
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
|
||||
pub enum ComputeAudit {
|
||||
|
||||
@@ -6,27 +6,8 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
thiserror.workspace = true
|
||||
nix.workspace = true
|
||||
nix.workspace=true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
rustc-hash = { version = "2.1.1" }
|
||||
rand = "0.9.1"
|
||||
libc.workspace = true
|
||||
lock_api = "0.4.13"
|
||||
|
||||
[dev-dependencies]
|
||||
criterion = { workspace = true, features = ["html_reports"] }
|
||||
rand_distr = "0.5.1"
|
||||
xxhash-rust = { version = "0.8.15", features = ["xxh3"] }
|
||||
ahash.workspace = true
|
||||
twox-hash = { version = "2.1.1" }
|
||||
seahash = "4.1.0"
|
||||
hashbrown = { git = "https://github.com/quantumish/hashbrown.git", rev = "6610e6d" }
|
||||
foldhash = "0.1.5"
|
||||
|
||||
|
||||
[target.'cfg(target_os = "macos")'.dependencies]
|
||||
tempfile = "3.14.0"
|
||||
|
||||
[[bench]]
|
||||
name = "hmap_resize"
|
||||
harness = false
|
||||
|
||||
@@ -1,330 +0,0 @@
|
||||
use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main};
|
||||
use neon_shmem::hash::HashMapAccess;
|
||||
use neon_shmem::hash::HashMapInit;
|
||||
use neon_shmem::hash::entry::Entry;
|
||||
use rand::distr::{Distribution, StandardUniform};
|
||||
use rand::prelude::*;
|
||||
use std::default::Default;
|
||||
use std::hash::BuildHasher;
|
||||
|
||||
// Taken from bindings to C code
|
||||
|
||||
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
|
||||
#[repr(C)]
|
||||
pub struct FileCacheKey {
|
||||
pub _spc_id: u32,
|
||||
pub _db_id: u32,
|
||||
pub _rel_number: u32,
|
||||
pub _fork_num: u32,
|
||||
pub _block_num: u32,
|
||||
}
|
||||
|
||||
impl Distribution<FileCacheKey> for StandardUniform {
|
||||
// questionable, but doesn't need to be good randomness
|
||||
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> FileCacheKey {
|
||||
FileCacheKey {
|
||||
_spc_id: rng.random(),
|
||||
_db_id: rng.random(),
|
||||
_rel_number: rng.random(),
|
||||
_fork_num: rng.random(),
|
||||
_block_num: rng.random(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
#[repr(C)]
|
||||
pub struct FileCacheEntry {
|
||||
pub _offset: u32,
|
||||
pub _access_count: u32,
|
||||
pub _prev: *mut FileCacheEntry,
|
||||
pub _next: *mut FileCacheEntry,
|
||||
pub _state: [u32; 8],
|
||||
}
|
||||
|
||||
impl FileCacheEntry {
|
||||
fn dummy() -> Self {
|
||||
Self {
|
||||
_offset: 0,
|
||||
_access_count: 0,
|
||||
_prev: std::ptr::null_mut(),
|
||||
_next: std::ptr::null_mut(),
|
||||
_state: [0; 8],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Utilities for applying operations.
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct TestOp<K, V>(K, Option<V>);
|
||||
|
||||
fn apply_op<K: Clone + std::hash::Hash + Eq, V, S: std::hash::BuildHasher>(
|
||||
op: TestOp<K, V>,
|
||||
map: &mut HashMapAccess<K, V, S>,
|
||||
) {
|
||||
let entry = map.entry(op.0);
|
||||
|
||||
match op.1 {
|
||||
Some(new) => match entry {
|
||||
Entry::Occupied(mut e) => Some(e.insert(new)),
|
||||
Entry::Vacant(e) => {
|
||||
_ = e.insert(new).unwrap();
|
||||
None
|
||||
}
|
||||
},
|
||||
None => match entry {
|
||||
Entry::Occupied(e) => Some(e.remove()),
|
||||
Entry::Vacant(_) => None,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
// Hash utilities
|
||||
|
||||
struct SeaRandomState {
|
||||
k1: u64,
|
||||
k2: u64,
|
||||
k3: u64,
|
||||
k4: u64,
|
||||
}
|
||||
|
||||
impl std::hash::BuildHasher for SeaRandomState {
|
||||
type Hasher = seahash::SeaHasher;
|
||||
|
||||
fn build_hasher(&self) -> Self::Hasher {
|
||||
seahash::SeaHasher::with_seeds(self.k1, self.k2, self.k3, self.k4)
|
||||
}
|
||||
}
|
||||
|
||||
impl SeaRandomState {
|
||||
fn new() -> Self {
|
||||
let mut rng = rand::rng();
|
||||
Self {
|
||||
k1: rng.random(),
|
||||
k2: rng.random(),
|
||||
k3: rng.random(),
|
||||
k4: rng.random(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn small_benchs(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("Small maps");
|
||||
group.sample_size(10);
|
||||
|
||||
group.bench_function("small_rehash", |b| {
|
||||
let ideal_filled = 4_000_000;
|
||||
let size = 5_000_000;
|
||||
let mut writer = HashMapInit::new_resizeable(size, size * 2).attach_writer();
|
||||
let mut rng = rand::rng();
|
||||
while writer.get_num_buckets_in_use() < ideal_filled as usize {
|
||||
let key: FileCacheKey = rng.random();
|
||||
let val = FileCacheEntry::dummy();
|
||||
apply_op(TestOp(key, Some(val)), &mut writer);
|
||||
}
|
||||
b.iter(|| writer.shuffle());
|
||||
});
|
||||
|
||||
group.bench_function("small_rehash_xxhash", |b| {
|
||||
let ideal_filled = 4_000_000;
|
||||
let size = 5_000_000;
|
||||
let mut writer = HashMapInit::new_resizeable(size, size * 2)
|
||||
.with_hasher(twox_hash::xxhash64::RandomState::default())
|
||||
.attach_writer();
|
||||
let mut rng = rand::rng();
|
||||
while writer.get_num_buckets_in_use() < ideal_filled as usize {
|
||||
let key: FileCacheKey = rng.random();
|
||||
let val = FileCacheEntry::dummy();
|
||||
apply_op(TestOp(key, Some(val)), &mut writer);
|
||||
}
|
||||
b.iter(|| writer.shuffle());
|
||||
});
|
||||
|
||||
group.bench_function("small_rehash_ahash", |b| {
|
||||
let ideal_filled = 4_000_000;
|
||||
let size = 5_000_000;
|
||||
let mut writer = HashMapInit::new_resizeable(size, size * 2)
|
||||
.with_hasher(ahash::RandomState::default())
|
||||
.attach_writer();
|
||||
let mut rng = rand::rng();
|
||||
while writer.get_num_buckets_in_use() < ideal_filled as usize {
|
||||
let key: FileCacheKey = rng.random();
|
||||
let val = FileCacheEntry::dummy();
|
||||
apply_op(TestOp(key, Some(val)), &mut writer);
|
||||
}
|
||||
b.iter(|| writer.shuffle());
|
||||
});
|
||||
|
||||
group.bench_function("small_rehash_seahash", |b| {
|
||||
let ideal_filled = 4_000_000;
|
||||
let size = 5_000_000;
|
||||
let mut writer = HashMapInit::new_resizeable(size, size * 2)
|
||||
.with_hasher(SeaRandomState::new())
|
||||
.attach_writer();
|
||||
let mut rng = rand::rng();
|
||||
while writer.get_num_buckets_in_use() < ideal_filled as usize {
|
||||
let key: FileCacheKey = rng.random();
|
||||
let val = FileCacheEntry::dummy();
|
||||
apply_op(TestOp(key, Some(val)), &mut writer);
|
||||
}
|
||||
b.iter(|| writer.shuffle());
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn real_benchs(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("Realistic workloads");
|
||||
group.sample_size(10);
|
||||
group.bench_function("real_bulk_insert", |b| {
|
||||
let size = 125_000_000;
|
||||
let ideal_filled = 100_000_000;
|
||||
let mut rng = rand::rng();
|
||||
b.iter_batched(
|
||||
|| HashMapInit::new_resizeable(size, size * 2).attach_writer(),
|
||||
|writer| {
|
||||
for _ in 0..ideal_filled {
|
||||
let key: FileCacheKey = rng.random();
|
||||
let val = FileCacheEntry::dummy();
|
||||
let entry = writer.entry(key);
|
||||
std::hint::black_box(match entry {
|
||||
Entry::Occupied(mut e) => {
|
||||
e.insert(val);
|
||||
}
|
||||
Entry::Vacant(e) => {
|
||||
_ = e.insert(val).unwrap();
|
||||
}
|
||||
})
|
||||
}
|
||||
},
|
||||
BatchSize::SmallInput,
|
||||
)
|
||||
});
|
||||
|
||||
group.bench_function("real_rehash", |b| {
|
||||
let size = 125_000_000;
|
||||
let ideal_filled = 100_000_000;
|
||||
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
|
||||
let mut rng = rand::rng();
|
||||
while writer.get_num_buckets_in_use() < ideal_filled {
|
||||
let key: FileCacheKey = rng.random();
|
||||
let val = FileCacheEntry::dummy();
|
||||
apply_op(TestOp(key, Some(val)), &mut writer);
|
||||
}
|
||||
b.iter(|| writer.shuffle());
|
||||
});
|
||||
|
||||
group.bench_function("real_rehash_hashbrown", |b| {
|
||||
let size = 125_000_000;
|
||||
let ideal_filled = 100_000_000;
|
||||
let mut writer = hashbrown::raw::RawTable::new();
|
||||
let mut rng = rand::rng();
|
||||
let hasher = rustc_hash::FxBuildHasher::default();
|
||||
unsafe {
|
||||
writer
|
||||
.resize(
|
||||
size,
|
||||
|(k, _)| hasher.hash_one(&k),
|
||||
hashbrown::raw::Fallibility::Infallible,
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
while writer.len() < ideal_filled as usize {
|
||||
let key: FileCacheKey = rng.random();
|
||||
let val = FileCacheEntry::dummy();
|
||||
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
|
||||
hasher.hash_one(&k)
|
||||
});
|
||||
}
|
||||
b.iter(|| unsafe {
|
||||
writer.table.rehash_in_place(
|
||||
&|table, index| {
|
||||
hasher.hash_one(
|
||||
&table
|
||||
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
|
||||
.as_ref()
|
||||
.0,
|
||||
)
|
||||
},
|
||||
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
|
||||
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
|
||||
Some(|ptr| std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry)))
|
||||
} else {
|
||||
None
|
||||
},
|
||||
)
|
||||
});
|
||||
});
|
||||
|
||||
for elems in [2, 4, 8, 16, 32, 64, 96, 112] {
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("real_rehash_varied", elems),
|
||||
&elems,
|
||||
|b, &size| {
|
||||
let ideal_filled = size * 1_000_000;
|
||||
let size = 125_000_000;
|
||||
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
|
||||
let mut rng = rand::rng();
|
||||
while writer.get_num_buckets_in_use() < ideal_filled as usize {
|
||||
let key: FileCacheKey = rng.random();
|
||||
let val = FileCacheEntry::dummy();
|
||||
apply_op(TestOp(key, Some(val)), &mut writer);
|
||||
}
|
||||
b.iter(|| writer.shuffle());
|
||||
},
|
||||
);
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("real_rehash_varied_hashbrown", elems),
|
||||
&elems,
|
||||
|b, &size| {
|
||||
let ideal_filled = size * 1_000_000;
|
||||
let size = 125_000_000;
|
||||
let mut writer = hashbrown::raw::RawTable::new();
|
||||
let mut rng = rand::rng();
|
||||
let hasher = rustc_hash::FxBuildHasher::default();
|
||||
unsafe {
|
||||
writer
|
||||
.resize(
|
||||
size,
|
||||
|(k, _)| hasher.hash_one(&k),
|
||||
hashbrown::raw::Fallibility::Infallible,
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
while writer.len() < ideal_filled as usize {
|
||||
let key: FileCacheKey = rng.random();
|
||||
let val = FileCacheEntry::dummy();
|
||||
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
|
||||
hasher.hash_one(&k)
|
||||
});
|
||||
}
|
||||
b.iter(|| unsafe {
|
||||
writer.table.rehash_in_place(
|
||||
&|table, index| {
|
||||
hasher.hash_one(
|
||||
&table
|
||||
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
|
||||
.as_ref()
|
||||
.0,
|
||||
)
|
||||
},
|
||||
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
|
||||
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
|
||||
Some(|ptr| {
|
||||
std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry))
|
||||
})
|
||||
} else {
|
||||
None
|
||||
},
|
||||
)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, small_benchs, real_benchs);
|
||||
criterion_main!(benches);
|
||||
@@ -1,598 +0,0 @@
|
||||
//! Resizable hash table implementation on top of byte-level storage (either a [`ShmemHandle`] or a fixed byte array).
|
||||
//!
|
||||
//! This hash table has two major components: the bucket array and the dictionary. Each bucket within the
|
||||
//! bucket array contains a `Option<(K, V)>` and an index of another bucket. In this way there is both an
|
||||
//! implicit freelist within the bucket array (`None` buckets point to other `None` entries) and various hash
|
||||
//! chains within the bucket array (a Some bucket will point to other Some buckets that had the same hash).
|
||||
//!
|
||||
//! Buckets are never moved unless they are within a region that is being shrunk, and so the actual hash-
|
||||
//! dependent component is done with the dictionary. When a new key is inserted into the map, a position
|
||||
//! within the dictionary is decided based on its hash, the data is inserted into an empty bucket based
|
||||
//! off of the freelist, and then the index of said bucket is placed in the dictionary.
|
||||
//!
|
||||
//! This map is resizable (if initialized on top of a [`ShmemHandle`]). Both growing and shrinking happen
|
||||
//! in-place and are at a high level achieved by expanding/reducing the bucket array and rebuilding the
|
||||
//! dictionary by rehashing all keys.
|
||||
|
||||
use std::fmt::Debug;
|
||||
use std::hash::{BuildHasher, Hash};
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
use crate::shmem::ShmemHandle;
|
||||
use crate::{shmem, sync::*};
|
||||
|
||||
mod core;
|
||||
pub mod entry;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use core::{Bucket, CoreHashMap, INVALID_POS};
|
||||
use entry::{Entry, OccupiedEntry, PrevPos, VacantEntry};
|
||||
|
||||
/// This represents a hash table that (possibly) lives in shared memory.
|
||||
/// If a new process is launched with fork(), the child process inherits
|
||||
/// this struct.
|
||||
#[must_use]
|
||||
pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
|
||||
shmem_handle: Option<ShmemHandle>,
|
||||
shared_ptr: *mut HashMapShared<'a, K, V>,
|
||||
shared_size: usize,
|
||||
hasher: S,
|
||||
num_buckets: u32,
|
||||
}
|
||||
|
||||
impl<'a, K, V, S> Debug for HashMapInit<'a, K, V, S>
|
||||
where
|
||||
K: Debug,
|
||||
V: Debug,
|
||||
{
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("HashMapInit")
|
||||
.field("shmem_handle", &self.shmem_handle)
|
||||
.field("shared_ptr", &self.shared_ptr)
|
||||
.field("shared_size", &self.shared_size)
|
||||
// .field("hasher", &self.hasher)
|
||||
.field("num_buckets", &self.num_buckets)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// This is a per-process handle to a hash table that (possibly) lives in shared memory.
|
||||
/// If a child process is launched with fork(), the child process should
|
||||
/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
|
||||
///
|
||||
/// XXX: We're not making use of it at the moment, but this struct could
|
||||
/// hold process-local information in the future.
|
||||
pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
|
||||
shmem_handle: Option<ShmemHandle>,
|
||||
shared_ptr: *mut HashMapShared<'a, K, V>,
|
||||
hasher: S,
|
||||
}
|
||||
|
||||
unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
|
||||
unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
|
||||
|
||||
impl<'a, K, V, S> Debug for HashMapAccess<'a, K, V, S>
|
||||
where
|
||||
K: Debug,
|
||||
V: Debug,
|
||||
{
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("HashMapAccess")
|
||||
.field("shmem_handle", &self.shmem_handle)
|
||||
.field("shared_ptr", &self.shared_ptr)
|
||||
// .field("hasher", &self.hasher)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
|
||||
/// Change the 'hasher' used by the hash table.
|
||||
///
|
||||
/// NOTE: This must be called right after creating the hash table,
|
||||
/// before inserting any entries and before calling attach_writer/reader.
|
||||
/// Otherwise different accessors could be using different hash function,
|
||||
/// with confusing results.
|
||||
pub fn with_hasher<T: BuildHasher>(self, hasher: T) -> HashMapInit<'a, K, V, T> {
|
||||
HashMapInit {
|
||||
hasher,
|
||||
shmem_handle: self.shmem_handle,
|
||||
shared_ptr: self.shared_ptr,
|
||||
shared_size: self.shared_size,
|
||||
num_buckets: self.num_buckets,
|
||||
}
|
||||
}
|
||||
|
||||
/// Loosely (over)estimate the size needed to store a hash table with `num_buckets` buckets.
|
||||
pub fn estimate_size(num_buckets: u32) -> usize {
|
||||
// add some margin to cover alignment etc.
|
||||
CoreHashMap::<K, V>::estimate_size(num_buckets) + size_of::<HashMapShared<K, V>>() + 1000
|
||||
}
|
||||
|
||||
fn new(
|
||||
num_buckets: u32,
|
||||
shmem_handle: Option<ShmemHandle>,
|
||||
area_ptr: *mut u8,
|
||||
area_size: usize,
|
||||
hasher: S,
|
||||
) -> Self {
|
||||
let mut ptr: *mut u8 = area_ptr;
|
||||
let end_ptr: *mut u8 = unsafe { ptr.add(area_size) };
|
||||
|
||||
// carve out area for the One Big Lock (TM) and the HashMapShared.
|
||||
ptr = unsafe { ptr.add(ptr.align_offset(align_of::<libc::pthread_rwlock_t>())) };
|
||||
let raw_lock_ptr = ptr;
|
||||
ptr = unsafe { ptr.add(size_of::<libc::pthread_rwlock_t>()) };
|
||||
ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
|
||||
let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
|
||||
ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };
|
||||
|
||||
// carve out the buckets
|
||||
ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<core::Bucket<K, V>>())) };
|
||||
let buckets_ptr = ptr;
|
||||
ptr = unsafe { ptr.add(size_of::<core::Bucket<K, V>>() * num_buckets as usize) };
|
||||
|
||||
// use remaining space for the dictionary
|
||||
ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<u32>())) };
|
||||
assert!(ptr.addr() < end_ptr.addr());
|
||||
let dictionary_ptr = ptr;
|
||||
let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::<u32>() as isize };
|
||||
assert!(dictionary_size > 0);
|
||||
|
||||
let buckets =
|
||||
unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), num_buckets as usize) };
|
||||
let dictionary = unsafe {
|
||||
std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize)
|
||||
};
|
||||
|
||||
let hashmap = CoreHashMap::new(buckets, dictionary);
|
||||
let lock = RwLock::from_raw(PthreadRwLock::new(raw_lock_ptr.cast()), hashmap);
|
||||
unsafe {
|
||||
std::ptr::write(shared_ptr, lock);
|
||||
}
|
||||
|
||||
Self {
|
||||
num_buckets,
|
||||
shmem_handle,
|
||||
shared_ptr,
|
||||
shared_size: area_size,
|
||||
hasher,
|
||||
}
|
||||
}
|
||||
|
||||
/// Attach to a hash table for writing.
|
||||
pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> {
|
||||
HashMapAccess {
|
||||
shmem_handle: self.shmem_handle,
|
||||
shared_ptr: self.shared_ptr,
|
||||
hasher: self.hasher,
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize a table for reading. Currently identical to [`HashMapInit::attach_writer`].
|
||||
pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> {
|
||||
self.attach_writer()
|
||||
}
|
||||
}
|
||||
|
||||
/// Hash table data that is actually stored in the shared memory area.
|
||||
///
|
||||
/// NOTE: We carve out the parts from a contiguous chunk. Growing and shrinking the hash table
|
||||
/// relies on the memory layout! The data structures are laid out in the contiguous shared memory
|
||||
/// area as follows:
|
||||
///
|
||||
/// [`libc::pthread_rwlock_t`]
|
||||
/// [`HashMapShared`]
|
||||
/// [buckets]
|
||||
/// [dictionary]
|
||||
///
|
||||
/// In between the above parts, there can be padding bytes to align the parts correctly.
|
||||
type HashMapShared<'a, K, V> = RwLock<CoreHashMap<'a, K, V>>;
|
||||
|
||||
impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher>
|
||||
where
|
||||
K: Clone + Hash + Eq,
|
||||
{
|
||||
/// Place the hash table within a user-supplied fixed memory area.
|
||||
pub fn with_fixed(num_buckets: u32, area: &'a mut [MaybeUninit<u8>]) -> Self {
|
||||
Self::new(
|
||||
num_buckets,
|
||||
None,
|
||||
area.as_mut_ptr().cast(),
|
||||
area.len(),
|
||||
rustc_hash::FxBuildHasher,
|
||||
)
|
||||
}
|
||||
|
||||
/// Place a new hash map in the given shared memory area
|
||||
///
|
||||
/// # Panics
|
||||
/// Will panic on failure to resize area to expected map size.
|
||||
pub fn with_shmem(num_buckets: u32, shmem: ShmemHandle) -> Self {
|
||||
let size = Self::estimate_size(num_buckets);
|
||||
shmem
|
||||
.set_size(size)
|
||||
.expect("could not resize shared memory area");
|
||||
let ptr = shmem.data_ptr.as_ptr().cast();
|
||||
Self::new(
|
||||
num_buckets,
|
||||
Some(shmem),
|
||||
ptr,
|
||||
size,
|
||||
rustc_hash::FxBuildHasher,
|
||||
)
|
||||
}
|
||||
|
||||
/// Make a resizable hash map within a new shared memory area with the given name.
|
||||
pub fn new_resizeable_named(num_buckets: u32, max_buckets: u32, name: &str) -> Self {
|
||||
let size = Self::estimate_size(num_buckets);
|
||||
let max_size = Self::estimate_size(max_buckets);
|
||||
let shmem =
|
||||
ShmemHandle::new(name, size, max_size).expect("failed to make shared memory area");
|
||||
let ptr = shmem.data_ptr.as_ptr().cast();
|
||||
|
||||
Self::new(
|
||||
num_buckets,
|
||||
Some(shmem),
|
||||
ptr,
|
||||
size,
|
||||
rustc_hash::FxBuildHasher,
|
||||
)
|
||||
}
|
||||
|
||||
/// Make a resizable hash map within a new anonymous shared memory area.
|
||||
pub fn new_resizeable(num_buckets: u32, max_buckets: u32) -> Self {
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
static COUNTER: AtomicUsize = AtomicUsize::new(0);
|
||||
let val = COUNTER.fetch_add(1, Ordering::Relaxed);
|
||||
let name = format!("neon_shmem_hmap{val}");
|
||||
Self::new_resizeable_named(num_buckets, max_buckets, &name)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S>
|
||||
where
|
||||
K: Clone + Hash + Eq,
|
||||
{
|
||||
/// Hash a key using the map's hasher.
|
||||
#[inline]
|
||||
fn get_hash_value(&self, key: &K) -> u64 {
|
||||
self.hasher.hash_one(key)
|
||||
}
|
||||
|
||||
fn entry_with_hash(&self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
|
||||
let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
|
||||
let dict_pos = hash as usize % map.dictionary.len();
|
||||
let first = map.dictionary[dict_pos];
|
||||
if first == INVALID_POS {
|
||||
// no existing entry
|
||||
return Entry::Vacant(VacantEntry {
|
||||
map,
|
||||
key,
|
||||
dict_pos: dict_pos as u32,
|
||||
});
|
||||
}
|
||||
|
||||
let mut prev_pos = PrevPos::First(dict_pos as u32);
|
||||
let mut next = first;
|
||||
loop {
|
||||
let bucket = &mut map.buckets[next as usize];
|
||||
let (bucket_key, _bucket_value) = bucket.inner.as_mut().expect("entry is in use");
|
||||
if *bucket_key == key {
|
||||
// found existing entry
|
||||
return Entry::Occupied(OccupiedEntry {
|
||||
map,
|
||||
_key: key,
|
||||
prev_pos,
|
||||
bucket_pos: next,
|
||||
});
|
||||
}
|
||||
|
||||
if bucket.next == INVALID_POS {
|
||||
// No existing entry
|
||||
return Entry::Vacant(VacantEntry {
|
||||
map,
|
||||
key,
|
||||
dict_pos: dict_pos as u32,
|
||||
});
|
||||
}
|
||||
prev_pos = PrevPos::Chained(next);
|
||||
next = bucket.next;
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a reference to the corresponding value for a key.
|
||||
pub fn get<'e>(&'e self, key: &K) -> Option<ValueReadGuard<'e, V>> {
|
||||
let hash = self.get_hash_value(key);
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
|
||||
RwLockReadGuard::try_map(map, |m| m.get_with_hash(key, hash)).ok()
|
||||
}
|
||||
|
||||
/// Get a reference to the entry containing a key.
|
||||
pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> {
|
||||
let hash = self.get_hash_value(&key);
|
||||
self.entry_with_hash(key, hash)
|
||||
}
|
||||
|
||||
/// Remove a key given its hash. Returns the associated value if it existed.
|
||||
pub fn remove(&self, key: &K) -> Option<V> {
|
||||
let hash = self.get_hash_value(&key);
|
||||
match self.entry_with_hash(key.clone(), hash) {
|
||||
Entry::Occupied(e) => Some(e.remove()),
|
||||
Entry::Vacant(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert/update a key. Returns the previous associated value if it existed.
|
||||
///
|
||||
/// # Errors
|
||||
/// Will return [`core::FullError`] if there is no more space left in the map.
|
||||
pub fn insert(&self, key: K, value: V) -> Result<Option<V>, core::FullError> {
|
||||
let hash = self.get_hash_value(&key);
|
||||
match self.entry_with_hash(key.clone(), hash) {
|
||||
Entry::Occupied(mut e) => Ok(Some(e.insert(value))),
|
||||
Entry::Vacant(e) => {
|
||||
_ = e.insert(value)?;
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Optionally return the entry for a bucket at a given index if it exists.
|
||||
///
|
||||
/// Has more overhead than one would intuitively expect: performs both a clone of the key
|
||||
/// due to the [`OccupiedEntry`] type owning the key and also a hash of the key in order
|
||||
/// to enable repairing the hash chain if the entry is removed.
|
||||
pub fn entry_at_bucket(&self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
|
||||
let map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
|
||||
if pos >= map.buckets.len() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let entry = map.buckets[pos].inner.as_ref();
|
||||
match entry {
|
||||
Some((key, _)) => Some(OccupiedEntry {
|
||||
_key: key.clone(),
|
||||
bucket_pos: pos as u32,
|
||||
prev_pos: entry::PrevPos::Unknown(self.get_hash_value(&key)),
|
||||
map,
|
||||
}),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of buckets in the table.
|
||||
pub fn get_num_buckets(&self) -> usize {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
|
||||
map.get_num_buckets()
|
||||
}
|
||||
|
||||
/// Return the key and value stored in bucket with given index. This can be used to
|
||||
/// iterate through the hash map.
|
||||
// TODO: An Iterator might be nicer. The communicator's clock algorithm needs to
|
||||
// _slowly_ iterate through all buckets with its clock hand, without holding a lock.
|
||||
// If we switch to an Iterator, it must not hold the lock.
|
||||
pub fn get_at_bucket(&self, pos: usize) -> Option<ValueReadGuard<(K, V)>> {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
|
||||
if pos >= map.buckets.len() {
|
||||
return None;
|
||||
}
|
||||
RwLockReadGuard::try_map(map, |m| m.buckets[pos].inner.as_ref()).ok()
|
||||
}
|
||||
|
||||
/// Returns the index of the bucket a given value corresponds to.
|
||||
pub fn get_bucket_for_value(&self, val_ptr: *const V) -> usize {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
|
||||
|
||||
let origin = map.buckets.as_ptr();
|
||||
let idx = (val_ptr as usize - origin as usize) / size_of::<Bucket<K, V>>();
|
||||
assert!(idx < map.buckets.len());
|
||||
|
||||
idx
|
||||
}
|
||||
|
||||
/// Returns the number of occupied buckets in the table.
|
||||
pub fn get_num_buckets_in_use(&self) -> usize {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
|
||||
map.buckets_in_use as usize
|
||||
}
|
||||
|
||||
/// Clears all entries in a table. Does not reset any shrinking operations.
|
||||
pub fn clear(&self) {
|
||||
let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
|
||||
map.clear();
|
||||
}
|
||||
|
||||
/// Perform an in-place rehash of some region (0..`rehash_buckets`) of the table and reset
|
||||
/// the `buckets` and `dictionary` slices to be as long as `num_buckets`. Resets the freelist
|
||||
/// in the process.
|
||||
fn rehash_dict(
|
||||
&self,
|
||||
inner: &mut CoreHashMap<'a, K, V>,
|
||||
buckets_ptr: *mut core::Bucket<K, V>,
|
||||
end_ptr: *mut u8,
|
||||
num_buckets: u32,
|
||||
rehash_buckets: u32,
|
||||
) {
|
||||
inner.free_head = INVALID_POS;
|
||||
|
||||
let buckets;
|
||||
let dictionary;
|
||||
unsafe {
|
||||
let buckets_end_ptr = buckets_ptr.add(num_buckets as usize);
|
||||
let dictionary_ptr: *mut u32 = buckets_end_ptr
|
||||
.byte_add(buckets_end_ptr.align_offset(align_of::<u32>()))
|
||||
.cast();
|
||||
let dictionary_size: usize =
|
||||
end_ptr.byte_offset_from(buckets_end_ptr) as usize / size_of::<u32>();
|
||||
|
||||
buckets = std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize);
|
||||
dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size);
|
||||
}
|
||||
for e in dictionary.iter_mut() {
|
||||
*e = INVALID_POS;
|
||||
}
|
||||
|
||||
for (i, bucket) in buckets.iter_mut().enumerate().take(rehash_buckets as usize) {
|
||||
if bucket.inner.is_none() {
|
||||
bucket.next = inner.free_head;
|
||||
inner.free_head = i as u32;
|
||||
continue;
|
||||
}
|
||||
|
||||
let hash = self.hasher.hash_one(&bucket.inner.as_ref().unwrap().0);
|
||||
let pos: usize = (hash % dictionary.len() as u64) as usize;
|
||||
bucket.next = dictionary[pos];
|
||||
dictionary[pos] = i as u32;
|
||||
}
|
||||
|
||||
inner.dictionary = dictionary;
|
||||
inner.buckets = buckets;
|
||||
}
|
||||
|
||||
/// Rehash the map without growing or shrinking.
|
||||
pub fn shuffle(&self) {
|
||||
let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
|
||||
let num_buckets = map.get_num_buckets() as u32;
|
||||
let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
|
||||
let end_ptr: *mut u8 = unsafe { self.shared_ptr.byte_add(size_bytes).cast() };
|
||||
let buckets_ptr = map.buckets.as_mut_ptr();
|
||||
self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets);
|
||||
}
|
||||
|
||||
/// Grow the number of buckets within the table.
|
||||
///
|
||||
/// 1. Grows the underlying shared memory area
|
||||
/// 2. Initializes new buckets and overwrites the current dictionary
|
||||
/// 3. Rehashes the dictionary
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if called on a map initialized with [`HashMapInit::with_fixed`].
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an [`shmem::Error`] if any errors occur resizing the memory region.
|
||||
pub fn grow(&self, num_buckets: u32) -> Result<(), shmem::Error> {
|
||||
let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
|
||||
let old_num_buckets = map.buckets.len() as u32;
|
||||
|
||||
assert!(
|
||||
num_buckets >= old_num_buckets,
|
||||
"grow called with a smaller number of buckets"
|
||||
);
|
||||
if num_buckets == old_num_buckets {
|
||||
return Ok(());
|
||||
}
|
||||
let shmem_handle = self
|
||||
.shmem_handle
|
||||
.as_ref()
|
||||
.expect("grow called on a fixed-size hash table");
|
||||
|
||||
let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
|
||||
shmem_handle.set_size(size_bytes)?;
|
||||
let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
|
||||
|
||||
// Initialize new buckets. The new buckets are linked to the free list.
|
||||
// NB: This overwrites the dictionary!
|
||||
let buckets_ptr = map.buckets.as_mut_ptr();
|
||||
unsafe {
|
||||
for i in old_num_buckets..num_buckets {
|
||||
let bucket = buckets_ptr.add(i as usize);
|
||||
bucket.write(core::Bucket {
|
||||
next: if i < num_buckets - 1 {
|
||||
i + 1
|
||||
} else {
|
||||
map.free_head
|
||||
},
|
||||
inner: None,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, old_num_buckets);
|
||||
map.free_head = old_num_buckets;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Begin a shrink, limiting all new allocations to be in buckets with index below `num_buckets`.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if called on a map initialized with [`HashMapInit::with_fixed`] or if `num_buckets` is
|
||||
/// greater than the number of buckets in the map.
|
||||
pub fn begin_shrink(&mut self, num_buckets: u32) {
|
||||
let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
|
||||
assert!(
|
||||
num_buckets <= map.get_num_buckets() as u32,
|
||||
"shrink called with a larger number of buckets"
|
||||
);
|
||||
_ = self
|
||||
.shmem_handle
|
||||
.as_ref()
|
||||
.expect("shrink called on a fixed-size hash table");
|
||||
map.alloc_limit = num_buckets;
|
||||
}
|
||||
|
||||
/// If a shrink operation is underway, returns the target size of the map. Otherwise, returns None.
|
||||
pub fn shrink_goal(&self) -> Option<usize> {
|
||||
let map = unsafe { self.shared_ptr.as_mut() }.unwrap().read();
|
||||
let goal = map.alloc_limit;
|
||||
if goal == INVALID_POS {
|
||||
None
|
||||
} else {
|
||||
Some(goal as usize)
|
||||
}
|
||||
}
|
||||
|
||||
/// Complete a shrink after caller has evicted entries, removing the unused buckets and rehashing.
|
||||
///
|
||||
/// # Panics
|
||||
/// The following cases result in a panic:
|
||||
/// - Calling this function on a map initialized with [`HashMapInit::with_fixed`].
|
||||
/// - Calling this function on a map when no shrink operation is in progress.
|
||||
/// - Calling this function on a map with `shrink_mode` set to [`HashMapShrinkMode::Remap`] and
|
||||
/// there are more buckets in use than the value returned by [`HashMapAccess::shrink_goal`].
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an [`shmem::Error`] if any errors occur resizing the memory region.
|
||||
pub fn finish_shrink(&self) -> Result<(), shmem::Error> {
|
||||
let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
|
||||
assert!(
|
||||
map.alloc_limit != INVALID_POS,
|
||||
"called finish_shrink when no shrink is in progress"
|
||||
);
|
||||
|
||||
let num_buckets = map.alloc_limit;
|
||||
|
||||
if map.get_num_buckets() == num_buckets as usize {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
assert!(
|
||||
map.buckets_in_use <= num_buckets,
|
||||
"called finish_shrink before enough entries were removed"
|
||||
);
|
||||
|
||||
for i in (num_buckets as usize)..map.buckets.len() {
|
||||
if let Some((k, v)) = map.buckets[i].inner.take() {
|
||||
// alloc_bucket increases count, so need to decrease since we're just moving
|
||||
map.buckets_in_use -= 1;
|
||||
map.alloc_bucket(k, v).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
let shmem_handle = self
|
||||
.shmem_handle
|
||||
.as_ref()
|
||||
.expect("shrink called on a fixed-size hash table");
|
||||
|
||||
let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
|
||||
shmem_handle.set_size(size_bytes)?;
|
||||
let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
|
||||
let buckets_ptr = map.buckets.as_mut_ptr();
|
||||
self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets);
|
||||
map.alloc_limit = INVALID_POS;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,208 +0,0 @@
|
||||
//! Simple hash table with chaining.
|
||||
|
||||
use std::fmt::Debug;
|
||||
use std::hash::Hash;
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
use crate::hash::entry::*;
|
||||
|
||||
/// Invalid position within the map (either within the dictionary or bucket array).
|
||||
pub(crate) const INVALID_POS: u32 = u32::MAX;
|
||||
|
||||
/// Fundamental storage unit within the hash table. Either empty or contains a key-value pair.
|
||||
/// Always part of a chain of some kind (either a freelist if empty or a hash chain if full).
|
||||
pub(crate) struct Bucket<K, V> {
|
||||
/// Index of next bucket in the chain.
|
||||
pub(crate) next: u32,
|
||||
/// Key-value pair contained within bucket.
|
||||
pub(crate) inner: Option<(K, V)>,
|
||||
}
|
||||
|
||||
impl<K, V> Debug for Bucket<K, V>
|
||||
where
|
||||
K: Debug,
|
||||
V: Debug,
|
||||
{
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("Bucket")
|
||||
.field("next", &self.next)
|
||||
.field("inner", &self.inner)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// Core hash table implementation.
|
||||
pub(crate) struct CoreHashMap<'a, K, V> {
|
||||
/// Dictionary used to map hashes to bucket indices.
|
||||
pub(crate) dictionary: &'a mut [u32],
|
||||
/// Buckets containing key-value pairs.
|
||||
pub(crate) buckets: &'a mut [Bucket<K, V>],
|
||||
/// Head of the freelist.
|
||||
pub(crate) free_head: u32,
|
||||
/// Maximum index of a bucket allowed to be allocated. [`INVALID_POS`] if no limit.
|
||||
pub(crate) alloc_limit: u32,
|
||||
/// The number of currently occupied buckets.
|
||||
pub(crate) buckets_in_use: u32,
|
||||
// pub(crate) lock: libc::pthread_mutex_t,
|
||||
// Unclear what the purpose of this is.
|
||||
pub(crate) _user_list_head: u32,
|
||||
}
|
||||
|
||||
impl<'a, K, V> Debug for CoreHashMap<'a, K, V>
|
||||
where
|
||||
K: Debug,
|
||||
V: Debug,
|
||||
{
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("CoreHashMap")
|
||||
.field("dictionary", &self.dictionary)
|
||||
.field("buckets", &self.buckets)
|
||||
.field("free_head", &self.free_head)
|
||||
.field("alloc_limit", &self.alloc_limit)
|
||||
.field("buckets_in_use", &self.buckets_in_use)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// Error for when there are no empty buckets left but one is needed.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct FullError();
|
||||
|
||||
impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> {
|
||||
const FILL_FACTOR: f32 = 0.60;
|
||||
|
||||
/// Estimate the size of data contained within the the hash map.
|
||||
pub fn estimate_size(num_buckets: u32) -> usize {
|
||||
let mut size = 0;
|
||||
|
||||
// buckets
|
||||
size += size_of::<Bucket<K, V>>() * num_buckets as usize;
|
||||
|
||||
// dictionary
|
||||
size += (f32::ceil((size_of::<u32>() * num_buckets as usize) as f32 / Self::FILL_FACTOR))
|
||||
as usize;
|
||||
|
||||
size
|
||||
}
|
||||
|
||||
pub fn new(
|
||||
buckets: &'a mut [MaybeUninit<Bucket<K, V>>],
|
||||
dictionary: &'a mut [MaybeUninit<u32>],
|
||||
) -> Self {
|
||||
// Initialize the buckets
|
||||
for i in 0..buckets.len() {
|
||||
buckets[i].write(Bucket {
|
||||
next: if i < buckets.len() - 1 {
|
||||
i as u32 + 1
|
||||
} else {
|
||||
INVALID_POS
|
||||
},
|
||||
inner: None,
|
||||
});
|
||||
}
|
||||
|
||||
// Initialize the dictionary
|
||||
for e in dictionary.iter_mut() {
|
||||
e.write(INVALID_POS);
|
||||
}
|
||||
|
||||
// TODO: use std::slice::assume_init_mut() once it stabilizes
|
||||
let buckets =
|
||||
unsafe { std::slice::from_raw_parts_mut(buckets.as_mut_ptr().cast(), buckets.len()) };
|
||||
let dictionary = unsafe {
|
||||
std::slice::from_raw_parts_mut(dictionary.as_mut_ptr().cast(), dictionary.len())
|
||||
};
|
||||
|
||||
Self {
|
||||
dictionary,
|
||||
buckets,
|
||||
free_head: 0,
|
||||
buckets_in_use: 0,
|
||||
_user_list_head: INVALID_POS,
|
||||
alloc_limit: INVALID_POS,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the value associated with a key (if it exists) given its hash.
|
||||
pub fn get_with_hash(&self, key: &K, hash: u64) -> Option<&V> {
|
||||
let mut next = self.dictionary[hash as usize % self.dictionary.len()];
|
||||
loop {
|
||||
if next == INVALID_POS {
|
||||
return None;
|
||||
}
|
||||
|
||||
let bucket = &self.buckets[next as usize];
|
||||
let (bucket_key, bucket_value) = bucket.inner.as_ref().expect("entry is in use");
|
||||
if bucket_key == key {
|
||||
return Some(bucket_value);
|
||||
}
|
||||
next = bucket.next;
|
||||
}
|
||||
}
|
||||
|
||||
/// Get number of buckets in map.
|
||||
pub fn get_num_buckets(&self) -> usize {
|
||||
self.buckets.len()
|
||||
}
|
||||
|
||||
/// Clears all entries from the hashmap.
|
||||
///
|
||||
/// Does not reset any allocation limits, but does clear any entries beyond them.
|
||||
pub fn clear(&mut self) {
|
||||
for i in 0..self.buckets.len() {
|
||||
self.buckets[i] = Bucket {
|
||||
next: if i < self.buckets.len() - 1 {
|
||||
i as u32 + 1
|
||||
} else {
|
||||
INVALID_POS
|
||||
},
|
||||
inner: None,
|
||||
}
|
||||
}
|
||||
for i in 0..self.dictionary.len() {
|
||||
self.dictionary[i] = INVALID_POS;
|
||||
}
|
||||
|
||||
self.free_head = 0;
|
||||
self.buckets_in_use = 0;
|
||||
}
|
||||
|
||||
/// Find the position of an unused bucket via the freelist and initialize it.
|
||||
pub(crate) fn alloc_bucket(&mut self, key: K, value: V) -> Result<u32, FullError> {
|
||||
let mut pos = self.free_head;
|
||||
|
||||
// Find the first bucket we're *allowed* to use.
|
||||
let mut prev = PrevPos::First(self.free_head);
|
||||
while pos != INVALID_POS && pos >= self.alloc_limit {
|
||||
let bucket = &mut self.buckets[pos as usize];
|
||||
prev = PrevPos::Chained(pos);
|
||||
pos = bucket.next;
|
||||
}
|
||||
if pos == INVALID_POS {
|
||||
return Err(FullError());
|
||||
}
|
||||
|
||||
// Repair the freelist.
|
||||
match prev {
|
||||
PrevPos::First(_) => {
|
||||
let next_pos = self.buckets[pos as usize].next;
|
||||
self.free_head = next_pos;
|
||||
}
|
||||
PrevPos::Chained(p) => {
|
||||
if p != INVALID_POS {
|
||||
let next_pos = self.buckets[pos as usize].next;
|
||||
self.buckets[p as usize].next = next_pos;
|
||||
}
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
|
||||
// Initialize the bucket.
|
||||
let bucket = &mut self.buckets[pos as usize];
|
||||
self.buckets_in_use += 1;
|
||||
bucket.next = INVALID_POS;
|
||||
bucket.inner = Some((key, value));
|
||||
|
||||
Ok(pos)
|
||||
}
|
||||
}
|
||||
@@ -1,138 +0,0 @@
|
||||
//! Equivalent of [`std::collections::hash_map::Entry`] for this hashmap.
|
||||
|
||||
use crate::hash::core::{CoreHashMap, FullError, INVALID_POS};
|
||||
use crate::sync::{RwLockWriteGuard, ValueWriteGuard};
|
||||
|
||||
use std::hash::Hash;
|
||||
use std::mem;
|
||||
|
||||
pub enum Entry<'a, 'b, K, V> {
|
||||
Occupied(OccupiedEntry<'a, 'b, K, V>),
|
||||
Vacant(VacantEntry<'a, 'b, K, V>),
|
||||
}
|
||||
|
||||
/// Enum representing the previous position within a chain.
|
||||
#[derive(Clone, Copy)]
|
||||
pub(crate) enum PrevPos {
|
||||
/// Starting index within the dictionary.
|
||||
First(u32),
|
||||
/// Regular index within the buckets.
|
||||
Chained(u32),
|
||||
/// Unknown - e.g. the associated entry was retrieved by index instead of chain.
|
||||
Unknown(u64),
|
||||
}
|
||||
|
||||
pub struct OccupiedEntry<'a, 'b, K, V> {
|
||||
/// Mutable reference to the map containing this entry.
|
||||
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
|
||||
/// The key of the occupied entry
|
||||
pub(crate) _key: K,
|
||||
/// The index of the previous entry in the chain.
|
||||
pub(crate) prev_pos: PrevPos,
|
||||
/// The position of the bucket in the [`CoreHashMap`] bucket array.
|
||||
pub(crate) bucket_pos: u32,
|
||||
}
|
||||
|
||||
impl<K, V> OccupiedEntry<'_, '_, K, V> {
|
||||
pub fn get(&self) -> &V {
|
||||
&self.map.buckets[self.bucket_pos as usize]
|
||||
.inner
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.1
|
||||
}
|
||||
|
||||
pub fn get_mut(&mut self) -> &mut V {
|
||||
&mut self.map.buckets[self.bucket_pos as usize]
|
||||
.inner
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.1
|
||||
}
|
||||
|
||||
/// Inserts a value into the entry, replacing (and returning) the existing value.
|
||||
pub fn insert(&mut self, value: V) -> V {
|
||||
let bucket = &mut self.map.buckets[self.bucket_pos as usize];
|
||||
// This assumes inner is Some, which it must be for an OccupiedEntry
|
||||
mem::replace(&mut bucket.inner.as_mut().unwrap().1, value)
|
||||
}
|
||||
|
||||
/// Removes the entry from the hash map, returning the value originally stored within it.
|
||||
///
|
||||
/// This may result in multiple bucket accesses if the entry was obtained by index as the
|
||||
/// previous chain entry needs to be discovered in this case.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if the `prev_pos` field is equal to [`PrevPos::Unknown`]. In practice, this means
|
||||
/// the entry was obtained via calling something like [`CoreHashMap::entry_at_bucket`].
|
||||
pub fn remove(mut self) -> V {
|
||||
// If this bucket was queried by index, go ahead and follow its chain from the start.
|
||||
let prev = if let PrevPos::Unknown(hash) = self.prev_pos {
|
||||
let dict_idx = hash as usize % self.map.dictionary.len();
|
||||
let mut prev = PrevPos::First(dict_idx as u32);
|
||||
let mut curr = self.map.dictionary[dict_idx];
|
||||
while curr != self.bucket_pos {
|
||||
assert!(curr != INVALID_POS);
|
||||
prev = PrevPos::Chained(curr);
|
||||
curr = self.map.buckets[curr as usize].next;
|
||||
}
|
||||
prev
|
||||
} else {
|
||||
self.prev_pos
|
||||
};
|
||||
|
||||
// CoreHashMap::remove returns Option<(K, V)>. We know it's Some for an OccupiedEntry.
|
||||
let bucket = &mut self.map.buckets[self.bucket_pos as usize];
|
||||
|
||||
// unlink it from the chain
|
||||
match prev {
|
||||
PrevPos::First(dict_pos) => {
|
||||
self.map.dictionary[dict_pos as usize] = bucket.next;
|
||||
}
|
||||
PrevPos::Chained(bucket_pos) => {
|
||||
// println!("we think prev of {} is {bucket_pos}", self.bucket_pos);
|
||||
self.map.buckets[bucket_pos as usize].next = bucket.next;
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
|
||||
// and add it to the freelist
|
||||
let free = self.map.free_head;
|
||||
let bucket = &mut self.map.buckets[self.bucket_pos as usize];
|
||||
let old_value = bucket.inner.take();
|
||||
bucket.next = free;
|
||||
self.map.free_head = self.bucket_pos;
|
||||
self.map.buckets_in_use -= 1;
|
||||
|
||||
old_value.unwrap().1
|
||||
}
|
||||
}
|
||||
|
||||
/// An abstract view into a vacant entry within the map.
|
||||
pub struct VacantEntry<'a, 'b, K, V> {
|
||||
/// Mutable reference to the map containing this entry.
|
||||
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
|
||||
/// The key to be inserted into this entry.
|
||||
pub(crate) key: K,
|
||||
/// The position within the dictionary corresponding to the key's hash.
|
||||
pub(crate) dict_pos: u32,
|
||||
}
|
||||
|
||||
impl<'b, K: Clone + Hash + Eq, V> VacantEntry<'_, 'b, K, V> {
|
||||
/// Insert a value into the vacant entry, finding and populating an empty bucket in the process.
|
||||
///
|
||||
/// # Errors
|
||||
/// Will return [`FullError`] if there are no unoccupied buckets in the map.
|
||||
pub fn insert(mut self, value: V) -> Result<ValueWriteGuard<'b, V>, FullError> {
|
||||
let pos = self.map.alloc_bucket(self.key, value)?;
|
||||
if pos == INVALID_POS {
|
||||
return Err(FullError());
|
||||
}
|
||||
self.map.buckets[pos as usize].next = self.map.dictionary[self.dict_pos as usize];
|
||||
self.map.dictionary[self.dict_pos as usize] = pos;
|
||||
|
||||
Ok(RwLockWriteGuard::map(self.map, |m| {
|
||||
&mut m.buckets[pos as usize].inner.as_mut().unwrap().1
|
||||
}))
|
||||
}
|
||||
}
|
||||
@@ -1,429 +0,0 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::Debug;
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
use crate::hash::Entry;
|
||||
use crate::hash::HashMapAccess;
|
||||
use crate::hash::HashMapInit;
|
||||
use crate::hash::core::FullError;
|
||||
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::{Rng, RngCore};
|
||||
use rand_distr::Zipf;
|
||||
|
||||
const TEST_KEY_LEN: usize = 16;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
|
||||
struct TestKey([u8; TEST_KEY_LEN]);
|
||||
|
||||
impl From<&TestKey> for u128 {
|
||||
fn from(val: &TestKey) -> u128 {
|
||||
u128::from_be_bytes(val.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u128> for TestKey {
|
||||
fn from(val: u128) -> TestKey {
|
||||
TestKey(val.to_be_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u8]> for TestKey {
|
||||
fn from(bytes: &'a [u8]) -> TestKey {
|
||||
TestKey(bytes.try_into().unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
|
||||
let w = HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_inserts")
|
||||
.attach_writer();
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let res = w.entry((*k).into());
|
||||
match res {
|
||||
Entry::Occupied(mut e) => {
|
||||
e.insert(idx);
|
||||
}
|
||||
Entry::Vacant(e) => {
|
||||
let res = e.insert(idx);
|
||||
assert!(res.is_ok());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let x = w.get(&(*k).into());
|
||||
let value = x.as_deref().copied();
|
||||
assert_eq!(value, Some(idx));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dense() {
|
||||
// This exercises splitting a node with prefix
|
||||
let keys: &[u128] = &[0, 1, 2, 3, 256];
|
||||
test_inserts(keys);
|
||||
|
||||
// Dense keys
|
||||
let mut keys: Vec<u128> = (0..10000).collect();
|
||||
test_inserts(&keys);
|
||||
|
||||
// Do the same in random orders
|
||||
for _ in 1..10 {
|
||||
keys.shuffle(&mut rand::rng());
|
||||
test_inserts(&keys);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sparse() {
|
||||
// sparse keys
|
||||
let mut keys: Vec<TestKey> = Vec::new();
|
||||
let mut used_keys = HashSet::new();
|
||||
for _ in 0..10000 {
|
||||
loop {
|
||||
let key = rand::random::<u128>();
|
||||
if used_keys.contains(&key) {
|
||||
continue;
|
||||
}
|
||||
used_keys.insert(key);
|
||||
keys.push(key.into());
|
||||
break;
|
||||
}
|
||||
}
|
||||
test_inserts(&keys);
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct TestOp(TestKey, Option<usize>);
|
||||
|
||||
fn apply_op(
|
||||
op: &TestOp,
|
||||
map: &mut HashMapAccess<TestKey, usize>,
|
||||
shadow: &mut BTreeMap<TestKey, usize>,
|
||||
) {
|
||||
// apply the change to the shadow tree first
|
||||
let shadow_existing = if let Some(v) = op.1 {
|
||||
shadow.insert(op.0, v)
|
||||
} else {
|
||||
shadow.remove(&op.0)
|
||||
};
|
||||
|
||||
let entry = map.entry(op.0);
|
||||
let hash_existing = match op.1 {
|
||||
Some(new) => match entry {
|
||||
Entry::Occupied(mut e) => Some(e.insert(new)),
|
||||
Entry::Vacant(e) => {
|
||||
_ = e.insert(new).unwrap();
|
||||
None
|
||||
}
|
||||
},
|
||||
None => match entry {
|
||||
Entry::Occupied(e) => Some(e.remove()),
|
||||
Entry::Vacant(_) => None,
|
||||
},
|
||||
};
|
||||
|
||||
assert_eq!(shadow_existing, hash_existing);
|
||||
}
|
||||
|
||||
fn do_random_ops(
|
||||
num_ops: usize,
|
||||
size: u32,
|
||||
del_prob: f64,
|
||||
writer: &mut HashMapAccess<TestKey, usize>,
|
||||
shadow: &mut BTreeMap<TestKey, usize>,
|
||||
rng: &mut rand::rngs::ThreadRng,
|
||||
) {
|
||||
for i in 0..num_ops {
|
||||
let key: TestKey = ((rng.next_u32() % size) as u128).into();
|
||||
let op = TestOp(
|
||||
key,
|
||||
if rng.random_bool(del_prob) {
|
||||
Some(i)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
);
|
||||
apply_op(&op, writer, shadow);
|
||||
}
|
||||
}
|
||||
|
||||
fn do_deletes(
|
||||
num_ops: usize,
|
||||
writer: &mut HashMapAccess<TestKey, usize>,
|
||||
shadow: &mut BTreeMap<TestKey, usize>,
|
||||
) {
|
||||
for _ in 0..num_ops {
|
||||
let (k, _) = shadow.pop_first().unwrap();
|
||||
writer.remove(&k);
|
||||
}
|
||||
}
|
||||
|
||||
fn do_shrink(
|
||||
writer: &mut HashMapAccess<TestKey, usize>,
|
||||
shadow: &mut BTreeMap<TestKey, usize>,
|
||||
to: u32,
|
||||
) {
|
||||
assert!(writer.shrink_goal().is_none());
|
||||
writer.begin_shrink(to);
|
||||
assert_eq!(writer.shrink_goal(), Some(to as usize));
|
||||
while writer.get_num_buckets_in_use() > to as usize {
|
||||
let (k, _) = shadow.pop_first().unwrap();
|
||||
let entry = writer.entry(k);
|
||||
if let Entry::Occupied(e) = entry {
|
||||
e.remove();
|
||||
}
|
||||
}
|
||||
let old_usage = writer.get_num_buckets_in_use();
|
||||
writer.finish_shrink().unwrap();
|
||||
assert!(writer.shrink_goal().is_none());
|
||||
assert_eq!(writer.get_num_buckets_in_use(), old_usage);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn random_ops() {
|
||||
let mut writer =
|
||||
HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_random")
|
||||
.attach_writer();
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
|
||||
let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
|
||||
let mut rng = rand::rng();
|
||||
for i in 0..100000 {
|
||||
let key: TestKey = (rng.sample(distribution) as u128).into();
|
||||
|
||||
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
|
||||
|
||||
apply_op(&op, &mut writer, &mut shadow);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_shuffle() {
|
||||
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_shuf")
|
||||
.attach_writer();
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
let mut rng = rand::rng();
|
||||
|
||||
do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
|
||||
writer.shuffle();
|
||||
do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_grow() {
|
||||
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 2000, "test_grow")
|
||||
.attach_writer();
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
let mut rng = rand::rng();
|
||||
|
||||
do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
|
||||
let old_usage = writer.get_num_buckets_in_use();
|
||||
writer.grow(1500).unwrap();
|
||||
assert_eq!(writer.get_num_buckets_in_use(), old_usage);
|
||||
assert_eq!(writer.get_num_buckets(), 1500);
|
||||
do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_clear() {
|
||||
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_clear")
|
||||
.attach_writer();
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
let mut rng = rand::rng();
|
||||
do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
|
||||
writer.clear();
|
||||
assert_eq!(writer.get_num_buckets_in_use(), 0);
|
||||
assert_eq!(writer.get_num_buckets(), 1500);
|
||||
while let Some((key, _)) = shadow.pop_first() {
|
||||
assert!(writer.get(&key).is_none());
|
||||
}
|
||||
do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
|
||||
for i in 0..(1500 - writer.get_num_buckets_in_use()) {
|
||||
writer.insert((1500 + i as u128).into(), 0).unwrap();
|
||||
}
|
||||
assert_eq!(writer.insert(5000.into(), 0), Err(FullError {}));
|
||||
writer.clear();
|
||||
assert!(writer.insert(5000.into(), 0).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_idx_remove() {
|
||||
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_clear")
|
||||
.attach_writer();
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
let mut rng = rand::rng();
|
||||
do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
|
||||
for _ in 0..100 {
|
||||
let idx = (rng.next_u32() % 1500) as usize;
|
||||
if let Some(e) = writer.entry_at_bucket(idx) {
|
||||
shadow.remove(&e._key);
|
||||
e.remove();
|
||||
}
|
||||
}
|
||||
while let Some((key, val)) = shadow.pop_first() {
|
||||
assert_eq!(*writer.get(&key).unwrap(), val);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_idx_get() {
|
||||
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_clear")
|
||||
.attach_writer();
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
let mut rng = rand::rng();
|
||||
do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
|
||||
for _ in 0..100 {
|
||||
let idx = (rng.next_u32() % 1500) as usize;
|
||||
if let Some(pair) = writer.get_at_bucket(idx) {
|
||||
{
|
||||
let v: *const usize = &pair.1;
|
||||
assert_eq!(writer.get_bucket_for_value(v), idx);
|
||||
}
|
||||
{
|
||||
let v: *const usize = &pair.1;
|
||||
assert_eq!(writer.get_bucket_for_value(v), idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_shrink() {
|
||||
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink")
|
||||
.attach_writer();
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
let mut rng = rand::rng();
|
||||
|
||||
do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
|
||||
do_shrink(&mut writer, &mut shadow, 1000);
|
||||
assert_eq!(writer.get_num_buckets(), 1000);
|
||||
do_deletes(500, &mut writer, &mut shadow);
|
||||
do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng);
|
||||
assert!(writer.get_num_buckets_in_use() <= 1000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_shrink_grow_seq() {
|
||||
let mut writer =
|
||||
HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 20000, "test_grow_seq")
|
||||
.attach_writer();
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
let mut rng = rand::rng();
|
||||
|
||||
do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng);
|
||||
eprintln!("Shrinking to 750");
|
||||
do_shrink(&mut writer, &mut shadow, 750);
|
||||
do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng);
|
||||
eprintln!("Growing to 1500");
|
||||
writer.grow(1500).unwrap();
|
||||
do_random_ops(600, 1500, 0.1, &mut writer, &mut shadow, &mut rng);
|
||||
eprintln!("Shrinking to 200");
|
||||
while shadow.len() > 100 {
|
||||
do_deletes(1, &mut writer, &mut shadow);
|
||||
}
|
||||
do_shrink(&mut writer, &mut shadow, 200);
|
||||
do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
|
||||
eprintln!("Growing to 10k");
|
||||
writer.grow(10000).unwrap();
|
||||
do_random_ops(10000, 5000, 0.25, &mut writer, &mut shadow, &mut rng);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bucket_ops() {
|
||||
let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_bucket_ops")
|
||||
.attach_writer();
|
||||
match writer.entry(1.into()) {
|
||||
Entry::Occupied(mut e) => {
|
||||
e.insert(2);
|
||||
}
|
||||
Entry::Vacant(e) => {
|
||||
_ = e.insert(2).unwrap();
|
||||
}
|
||||
}
|
||||
assert_eq!(writer.get_num_buckets_in_use(), 1);
|
||||
assert_eq!(writer.get_num_buckets(), 1000);
|
||||
assert_eq!(*writer.get(&1.into()).unwrap(), 2);
|
||||
let pos = match writer.entry(1.into()) {
|
||||
Entry::Occupied(e) => {
|
||||
assert_eq!(e._key, 1.into());
|
||||
let pos = e.bucket_pos as usize;
|
||||
pos
|
||||
}
|
||||
Entry::Vacant(_) => {
|
||||
panic!("Insert didn't affect entry");
|
||||
}
|
||||
};
|
||||
assert_eq!(writer.entry_at_bucket(pos).unwrap()._key, 1.into());
|
||||
assert_eq!(*writer.get_at_bucket(pos).unwrap(), (1.into(), 2));
|
||||
{
|
||||
let ptr: *const usize = &*writer.get(&1.into()).unwrap();
|
||||
assert_eq!(writer.get_bucket_for_value(ptr), pos);
|
||||
}
|
||||
writer.remove(&1.into());
|
||||
assert!(writer.get(&1.into()).is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_shrink_zero() {
|
||||
let mut writer =
|
||||
HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink_zero")
|
||||
.attach_writer();
|
||||
writer.begin_shrink(0);
|
||||
for i in 0..1500 {
|
||||
writer.entry_at_bucket(i).map(|x| x.remove());
|
||||
}
|
||||
writer.finish_shrink().unwrap();
|
||||
assert_eq!(writer.get_num_buckets_in_use(), 0);
|
||||
let entry = writer.entry(1.into());
|
||||
if let Entry::Vacant(v) = entry {
|
||||
assert!(v.insert(2).is_err());
|
||||
} else {
|
||||
panic!("Somehow got non-vacant entry in empty map.")
|
||||
}
|
||||
writer.grow(50).unwrap();
|
||||
let entry = writer.entry(1.into());
|
||||
if let Entry::Vacant(v) = entry {
|
||||
assert!(v.insert(2).is_ok());
|
||||
} else {
|
||||
panic!("Somehow got non-vacant entry in empty map.")
|
||||
}
|
||||
assert_eq!(writer.get_num_buckets_in_use(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_grow_oom() {
|
||||
let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_grow_oom")
|
||||
.attach_writer();
|
||||
writer.grow(20000).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_shrink_bigger() {
|
||||
let mut writer =
|
||||
HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_bigger")
|
||||
.attach_writer();
|
||||
writer.begin_shrink(2000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_shrink_early_finish() {
|
||||
let writer =
|
||||
HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_early_finish")
|
||||
.attach_writer();
|
||||
writer.finish_shrink().unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_shrink_fixed_size() {
|
||||
let mut area = [MaybeUninit::uninit(); 10000];
|
||||
let init_struct = HashMapInit::<TestKey, usize>::with_fixed(3, &mut area);
|
||||
let mut writer = init_struct.attach_writer();
|
||||
writer.begin_shrink(1);
|
||||
}
|
||||
@@ -1,5 +1,418 @@
|
||||
//! Shared memory utilities for neon communicator
|
||||
|
||||
pub mod hash;
|
||||
pub mod shmem;
|
||||
pub mod sync;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::mman::MapFlags;
|
||||
use nix::sys::mman::ProtFlags;
|
||||
use nix::sys::mman::mmap as nix_mmap;
|
||||
use nix::sys::mman::munmap as nix_munmap;
|
||||
use nix::unistd::ftruncate as nix_ftruncate;
|
||||
|
||||
/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
|
||||
/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
|
||||
/// specified at creation.
|
||||
///
|
||||
/// The area is backed by an anonymous file created with memfd_create(). The full address space for
|
||||
/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
|
||||
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
|
||||
/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
|
||||
/// future.
|
||||
pub struct ShmemHandle {
|
||||
/// memfd file descriptor
|
||||
fd: OwnedFd,
|
||||
|
||||
max_size: usize,
|
||||
|
||||
// Pointer to the beginning of the shared memory area. The header is stored there.
|
||||
shared_ptr: NonNull<SharedStruct>,
|
||||
|
||||
// Pointer to the beginning of the user data
|
||||
pub data_ptr: NonNull<u8>,
|
||||
}
|
||||
|
||||
/// This is stored at the beginning in the shared memory area.
|
||||
struct SharedStruct {
|
||||
max_size: usize,
|
||||
|
||||
/// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
|
||||
current_size: AtomicUsize,
|
||||
}
|
||||
|
||||
const RESIZE_IN_PROGRESS: usize = 1 << 63;
|
||||
|
||||
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
|
||||
|
||||
/// Error type returned by the ShmemHandle functions.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("{msg}: {errno}")]
|
||||
pub struct Error {
|
||||
pub msg: String,
|
||||
pub errno: Errno,
|
||||
}
|
||||
|
||||
impl Error {
|
||||
fn new(msg: &str, errno: Errno) -> Error {
|
||||
Error {
|
||||
msg: msg.to_string(),
|
||||
errno,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShmemHandle {
|
||||
/// Create a new shared memory area. To communicate between processes, the processes need to be
|
||||
/// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
|
||||
///
|
||||
/// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
|
||||
/// processes can continue using it, however.
|
||||
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
|
||||
// create the backing anonymous file.
|
||||
let fd = create_backing_file(name)?;
|
||||
|
||||
Self::new_with_fd(fd, initial_size, max_size)
|
||||
}
|
||||
|
||||
fn new_with_fd(
|
||||
fd: OwnedFd,
|
||||
initial_size: usize,
|
||||
max_size: usize,
|
||||
) -> Result<ShmemHandle, Error> {
|
||||
// We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
|
||||
// is a little larger than this because of the SharedStruct header. Make the upper limit
|
||||
// somewhat smaller than that, because with anything close to that, you'll run out of
|
||||
// memory anyway.
|
||||
if max_size >= 1 << 48 {
|
||||
panic!("max size {max_size} too large");
|
||||
}
|
||||
if initial_size > max_size {
|
||||
panic!("initial size {initial_size} larger than max size {max_size}");
|
||||
}
|
||||
|
||||
// The actual initial / max size is the one given by the caller, plus the size of
|
||||
// 'SharedStruct'.
|
||||
let initial_size = HEADER_SIZE + initial_size;
|
||||
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
|
||||
|
||||
// Reserve address space for it with mmap
|
||||
//
|
||||
// TODO: Use MAP_HUGETLB if possible
|
||||
let start_ptr = unsafe {
|
||||
nix_mmap(
|
||||
None,
|
||||
max_size,
|
||||
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
|
||||
MapFlags::MAP_SHARED,
|
||||
&fd,
|
||||
0,
|
||||
)
|
||||
}
|
||||
.map_err(|e| Error::new("mmap failed: {e}", e))?;
|
||||
|
||||
// Reserve space for the initial size
|
||||
enlarge_file(fd.as_fd(), initial_size as u64)?;
|
||||
|
||||
// Initialize the header
|
||||
let shared: NonNull<SharedStruct> = start_ptr.cast();
|
||||
unsafe {
|
||||
shared.write(SharedStruct {
|
||||
max_size: max_size.into(),
|
||||
current_size: AtomicUsize::new(initial_size),
|
||||
})
|
||||
};
|
||||
|
||||
// The user data begins after the header
|
||||
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
|
||||
|
||||
Ok(ShmemHandle {
|
||||
fd,
|
||||
max_size: max_size.into(),
|
||||
shared_ptr: shared,
|
||||
data_ptr,
|
||||
})
|
||||
}
|
||||
|
||||
// return reference to the header
|
||||
fn shared(&self) -> &SharedStruct {
|
||||
unsafe { self.shared_ptr.as_ref() }
|
||||
}
|
||||
|
||||
/// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
|
||||
/// when creating the area.
|
||||
///
|
||||
/// This may only be called from one process/thread concurrently. We detect that case
|
||||
/// and return an Error.
|
||||
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
|
||||
let new_size = new_size + HEADER_SIZE;
|
||||
let shared = self.shared();
|
||||
|
||||
if new_size > self.max_size {
|
||||
panic!(
|
||||
"new size ({} is greater than max size ({})",
|
||||
new_size, self.max_size
|
||||
);
|
||||
}
|
||||
assert_eq!(self.max_size, shared.max_size);
|
||||
|
||||
// Lock the area by setting the bit in 'current_size'
|
||||
//
|
||||
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
|
||||
// and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
|
||||
// since this is not performance-critical, better safe than sorry .
|
||||
let mut old_size = shared.current_size.load(Ordering::Acquire);
|
||||
loop {
|
||||
if (old_size & RESIZE_IN_PROGRESS) != 0 {
|
||||
return Err(Error::new(
|
||||
"concurrent resize detected",
|
||||
Errno::UnknownErrno,
|
||||
));
|
||||
}
|
||||
match shared.current_size.compare_exchange(
|
||||
old_size,
|
||||
new_size,
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => break,
|
||||
Err(x) => old_size = x,
|
||||
}
|
||||
}
|
||||
|
||||
// Ok, we got the lock.
|
||||
//
|
||||
// NB: If anything goes wrong, we *must* clear the bit!
|
||||
let result = {
|
||||
use std::cmp::Ordering::{Equal, Greater, Less};
|
||||
match new_size.cmp(&old_size) {
|
||||
Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
|
||||
Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
|
||||
}),
|
||||
Equal => Ok(()),
|
||||
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
|
||||
}
|
||||
};
|
||||
|
||||
// Unlock
|
||||
shared.current_size.store(
|
||||
if result.is_ok() { new_size } else { old_size },
|
||||
Ordering::Release,
|
||||
);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Returns the current user-visible size of the shared memory segment.
|
||||
///
|
||||
/// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
|
||||
/// responsibility not to access the area beyond the current size.
|
||||
pub fn current_size(&self) -> usize {
|
||||
let total_current_size =
|
||||
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
|
||||
total_current_size - HEADER_SIZE
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ShmemHandle {
|
||||
fn drop(&mut self) {
|
||||
// SAFETY: The pointer was obtained from mmap() with the given size.
|
||||
// We unmap the entire region.
|
||||
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
|
||||
// The fd is dropped automatically by OwnedFd.
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
|
||||
/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
|
||||
/// development and testing, but in production we want the file to stay in memory.
|
||||
///
|
||||
/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
|
||||
#[allow(unused_variables)]
|
||||
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
|
||||
.map_err(|e| Error::new("memfd_create failed: {e}", e))
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
let file = tempfile::tempfile().map_err(|e| {
|
||||
Error::new(
|
||||
"could not create temporary file to back shmem area: {e}",
|
||||
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
|
||||
)
|
||||
})?;
|
||||
Ok(OwnedFd::from(file))
|
||||
}
|
||||
}
|
||||
|
||||
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
|
||||
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
|
||||
// we don't get a segfault later when trying to actually use it.
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
|
||||
Error::new(
|
||||
"could not grow shmem segment, posix_fallocate failed: {e}",
|
||||
e,
|
||||
)
|
||||
})
|
||||
}
|
||||
// As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
nix::unistd::ftruncate(fd, size as i64)
|
||||
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use nix::unistd::ForkResult;
|
||||
use std::ops::Range;
|
||||
|
||||
/// check that all bytes in given range have the expected value.
|
||||
fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
|
||||
for i in range {
|
||||
let b = unsafe { *(ptr.add(i)) };
|
||||
assert_eq!(expected, b, "unexpected byte at offset {i}");
|
||||
}
|
||||
}
|
||||
|
||||
/// Write 'b' to all bytes in the given range
|
||||
fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
|
||||
unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
|
||||
}
|
||||
|
||||
// simple single-process test of growing and shrinking
|
||||
#[test]
|
||||
fn test_shmem_resize() -> Result<(), Error> {
|
||||
let max_size = 1024 * 1024;
|
||||
let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
|
||||
|
||||
assert_eq!(init_struct.current_size(), 0);
|
||||
|
||||
// Initial grow
|
||||
let size1 = 10000;
|
||||
init_struct.set_size(size1).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size1);
|
||||
|
||||
// Write some data
|
||||
let data_ptr = init_struct.data_ptr.as_ptr();
|
||||
write_range(data_ptr, 0xAA, 0..size1);
|
||||
assert_range(data_ptr, 0xAA, 0..size1);
|
||||
|
||||
// Shrink
|
||||
let size2 = 5000;
|
||||
init_struct.set_size(size2).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size2);
|
||||
|
||||
// Grow again
|
||||
let size3 = 20000;
|
||||
init_struct.set_size(size3).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size3);
|
||||
|
||||
// Try to read it. The area that was shrunk and grown again should read as all zeros now
|
||||
assert_range(data_ptr, 0xAA, 0..5000);
|
||||
assert_range(data_ptr, 0, 5000..size1);
|
||||
|
||||
// Try to grow beyond max_size
|
||||
//let size4 = max_size + 1;
|
||||
//assert!(init_struct.set_size(size4).is_err());
|
||||
|
||||
// Dropping init_struct should unmap the memory
|
||||
drop(init_struct);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
|
||||
/// but is stored in the shared memory area and works across processes. It's implemented by
|
||||
/// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
|
||||
struct SimpleBarrier {
|
||||
num_procs: usize,
|
||||
count: AtomicUsize,
|
||||
}
|
||||
|
||||
impl SimpleBarrier {
|
||||
unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
|
||||
unsafe {
|
||||
*ptr = SimpleBarrier {
|
||||
num_procs,
|
||||
count: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn wait(&self) {
|
||||
let old = self.count.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
let generation = old / self.num_procs;
|
||||
|
||||
let mut current = old + 1;
|
||||
while current < (generation + 1) * self.num_procs {
|
||||
std::thread::sleep(std::time::Duration::from_millis(10));
|
||||
current = self.count.load(Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multi_process() {
|
||||
// Initialize
|
||||
let max_size = 1_000_000_000_000;
|
||||
let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
|
||||
let ptr = init_struct.data_ptr.as_ptr();
|
||||
|
||||
// Store the SimpleBarrier in the first 1k of the area.
|
||||
init_struct.set_size(10000).unwrap();
|
||||
let barrier_ptr: *mut SimpleBarrier = unsafe {
|
||||
ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
|
||||
.cast()
|
||||
};
|
||||
unsafe { SimpleBarrier::init(barrier_ptr, 2) };
|
||||
let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
|
||||
|
||||
// Fork another test process. The code after this runs in both processes concurrently.
|
||||
let fork_result = unsafe { nix::unistd::fork().unwrap() };
|
||||
|
||||
// In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
|
||||
if fork_result.is_parent() {
|
||||
write_range(ptr, 0xAA, 1000..2000);
|
||||
} else {
|
||||
write_range(ptr, 0xBB, 2000..3000);
|
||||
}
|
||||
barrier.wait();
|
||||
// Verify the contents. (in both processes)
|
||||
assert_range(ptr, 0xAA, 1000..2000);
|
||||
assert_range(ptr, 0xBB, 2000..3000);
|
||||
|
||||
// Grow, from the child this time
|
||||
let size = 10_000_000;
|
||||
if !fork_result.is_parent() {
|
||||
init_struct.set_size(size).unwrap();
|
||||
}
|
||||
barrier.wait();
|
||||
|
||||
// make some writes at the end
|
||||
if fork_result.is_parent() {
|
||||
write_range(ptr, 0xAA, (size - 10)..size);
|
||||
} else {
|
||||
write_range(ptr, 0xBB, (size - 20)..(size - 10));
|
||||
}
|
||||
barrier.wait();
|
||||
|
||||
// Verify the contents. (This runs in both processes)
|
||||
assert_range(ptr, 0, (size - 1000)..(size - 20));
|
||||
assert_range(ptr, 0xBB, (size - 20)..(size - 10));
|
||||
assert_range(ptr, 0xAA, (size - 10)..size);
|
||||
|
||||
if let ForkResult::Parent { child } = fork_result {
|
||||
nix::sys::wait::waitpid(child, None).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,411 +0,0 @@
|
||||
//! Dynamically resizable contiguous chunk of shared memory
|
||||
|
||||
use std::num::NonZeroUsize;
|
||||
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::mman::MapFlags;
|
||||
use nix::sys::mman::ProtFlags;
|
||||
use nix::sys::mman::mmap as nix_mmap;
|
||||
use nix::sys::mman::munmap as nix_munmap;
|
||||
use nix::unistd::ftruncate as nix_ftruncate;
|
||||
|
||||
/// `ShmemHandle` represents a shared memory area that can be shared by processes over `fork()`.
|
||||
/// Unlike shared memory allocated by Postgres, this area is resizable, up to `max_size` that's
|
||||
/// specified at creation.
|
||||
///
|
||||
/// The area is backed by an anonymous file created with `memfd_create()`. The full address space for
|
||||
/// `max_size` is reserved up-front with `mmap()`, but whenever you call [`ShmemHandle::set_size`],
|
||||
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
|
||||
/// will cause the file to be expanded, but we might use `mprotect()` etc. to enforce that in the
|
||||
/// future.
|
||||
#[derive(Debug)]
|
||||
pub struct ShmemHandle {
|
||||
/// memfd file descriptor
|
||||
fd: OwnedFd,
|
||||
|
||||
max_size: usize,
|
||||
|
||||
// Pointer to the beginning of the shared memory area. The header is stored there.
|
||||
shared_ptr: NonNull<SharedStruct>,
|
||||
|
||||
// Pointer to the beginning of the user data
|
||||
pub data_ptr: NonNull<u8>,
|
||||
}
|
||||
|
||||
/// This is stored at the beginning in the shared memory area.
|
||||
#[derive(Debug)]
|
||||
struct SharedStruct {
|
||||
max_size: usize,
|
||||
|
||||
/// Current size of the backing file. The high-order bit is used for the [`RESIZE_IN_PROGRESS`] flag.
|
||||
current_size: AtomicUsize,
|
||||
}
|
||||
|
||||
const RESIZE_IN_PROGRESS: usize = 1 << 63;
|
||||
|
||||
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
|
||||
|
||||
/// Error type returned by the [`ShmemHandle`] functions.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("{msg}: {errno}")]
|
||||
pub struct Error {
|
||||
pub msg: String,
|
||||
pub errno: Errno,
|
||||
}
|
||||
|
||||
impl Error {
|
||||
fn new(msg: &str, errno: Errno) -> Self {
|
||||
Self {
|
||||
msg: msg.to_string(),
|
||||
errno,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShmemHandle {
|
||||
/// Create a new shared memory area. To communicate between processes, the processes need to be
|
||||
/// `fork()`'d after calling this, so that the `ShmemHandle` is inherited by all processes.
|
||||
///
|
||||
/// If the `ShmemHandle` is dropped, the memory is unmapped from the current process. Other
|
||||
/// processes can continue using it, however.
|
||||
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<Self, Error> {
|
||||
// create the backing anonymous file.
|
||||
let fd = create_backing_file(name)?;
|
||||
|
||||
Self::new_with_fd(fd, initial_size, max_size)
|
||||
}
|
||||
|
||||
fn new_with_fd(fd: OwnedFd, initial_size: usize, max_size: usize) -> Result<Self, Error> {
|
||||
// We reserve the high-order bit for the `RESIZE_IN_PROGRESS` flag, and the actual size
|
||||
// is a little larger than this because of the SharedStruct header. Make the upper limit
|
||||
// somewhat smaller than that, because with anything close to that, you'll run out of
|
||||
// memory anyway.
|
||||
assert!(max_size < 1 << 48, "max size {max_size} too large");
|
||||
|
||||
assert!(
|
||||
initial_size <= max_size,
|
||||
"initial size {initial_size} larger than max size {max_size}"
|
||||
);
|
||||
|
||||
// The actual initial / max size is the one given by the caller, plus the size of
|
||||
// 'SharedStruct'.
|
||||
let initial_size = HEADER_SIZE + initial_size;
|
||||
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
|
||||
|
||||
// Reserve address space for it with mmap
|
||||
//
|
||||
// TODO: Use MAP_HUGETLB if possible
|
||||
let start_ptr = unsafe {
|
||||
nix_mmap(
|
||||
None,
|
||||
max_size,
|
||||
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
|
||||
MapFlags::MAP_SHARED,
|
||||
&fd,
|
||||
0,
|
||||
)
|
||||
}
|
||||
.map_err(|e| Error::new("mmap failed", e))?;
|
||||
|
||||
// Reserve space for the initial size
|
||||
enlarge_file(fd.as_fd(), initial_size as u64)?;
|
||||
|
||||
// Initialize the header
|
||||
let shared: NonNull<SharedStruct> = start_ptr.cast();
|
||||
unsafe {
|
||||
shared.write(SharedStruct {
|
||||
max_size: max_size.into(),
|
||||
current_size: AtomicUsize::new(initial_size),
|
||||
});
|
||||
}
|
||||
|
||||
// The user data begins after the header
|
||||
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
|
||||
|
||||
Ok(Self {
|
||||
fd,
|
||||
max_size: max_size.into(),
|
||||
shared_ptr: shared,
|
||||
data_ptr,
|
||||
})
|
||||
}
|
||||
|
||||
// return reference to the header
|
||||
fn shared(&self) -> &SharedStruct {
|
||||
unsafe { self.shared_ptr.as_ref() }
|
||||
}
|
||||
|
||||
/// Resize the shared memory area. `new_size` must not be larger than the `max_size` specified
|
||||
/// when creating the area.
|
||||
///
|
||||
/// This may only be called from one process/thread concurrently. We detect that case
|
||||
/// and return an [`shmem::Error`](Error).
|
||||
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
|
||||
let new_size = new_size + HEADER_SIZE;
|
||||
let shared = self.shared();
|
||||
|
||||
assert!(
|
||||
new_size <= self.max_size,
|
||||
"new size ({new_size}) is greater than max size ({})",
|
||||
self.max_size
|
||||
);
|
||||
|
||||
assert_eq!(self.max_size, shared.max_size);
|
||||
|
||||
// Lock the area by setting the bit in `current_size`
|
||||
//
|
||||
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
|
||||
// and the `posix_fallocate`/`ftruncate` call is surely a synchronization point anyway. But
|
||||
// since this is not performance-critical, better safe than sorry.
|
||||
let mut old_size = shared.current_size.load(Ordering::Acquire);
|
||||
loop {
|
||||
if (old_size & RESIZE_IN_PROGRESS) != 0 {
|
||||
return Err(Error::new(
|
||||
"concurrent resize detected",
|
||||
Errno::UnknownErrno,
|
||||
));
|
||||
}
|
||||
match shared.current_size.compare_exchange(
|
||||
old_size,
|
||||
new_size,
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => break,
|
||||
Err(x) => old_size = x,
|
||||
}
|
||||
}
|
||||
|
||||
// Ok, we got the lock.
|
||||
//
|
||||
// NB: If anything goes wrong, we *must* clear the bit!
|
||||
let result = {
|
||||
use std::cmp::Ordering::{Equal, Greater, Less};
|
||||
match new_size.cmp(&old_size) {
|
||||
Less => nix_ftruncate(&self.fd, new_size as i64)
|
||||
.map_err(|e| Error::new("could not shrink shmem segment, ftruncate failed", e)),
|
||||
Equal => Ok(()),
|
||||
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
|
||||
}
|
||||
};
|
||||
|
||||
// Unlock
|
||||
shared.current_size.store(
|
||||
if result.is_ok() { new_size } else { old_size },
|
||||
Ordering::Release,
|
||||
);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Returns the current user-visible size of the shared memory segment.
|
||||
///
|
||||
/// NOTE: a concurrent [`ShmemHandle::set_size()`] call can change the size at any time.
|
||||
/// It is the caller's responsibility not to access the area beyond the current size.
|
||||
pub fn current_size(&self) -> usize {
|
||||
let total_current_size =
|
||||
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
|
||||
total_current_size - HEADER_SIZE
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ShmemHandle {
|
||||
fn drop(&mut self) {
|
||||
// SAFETY: The pointer was obtained from mmap() with the given size.
|
||||
// We unmap the entire region.
|
||||
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
|
||||
// The fd is dropped automatically by OwnedFd.
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a "backing file" for the shared memory area. On Linux, use `memfd_create()`, to create an
|
||||
/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
|
||||
/// development and testing, but in production we want the file to stay in memory.
|
||||
///
|
||||
/// Disable unused variables warnings because `name` is unused in the macos path.
|
||||
#[allow(unused_variables)]
|
||||
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
|
||||
.map_err(|e| Error::new("memfd_create failed", e))
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
let file = tempfile::tempfile().map_err(|e| {
|
||||
Error::new(
|
||||
"could not create temporary file to back shmem area",
|
||||
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
|
||||
)
|
||||
})?;
|
||||
Ok(OwnedFd::from(file))
|
||||
}
|
||||
}
|
||||
|
||||
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
|
||||
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
|
||||
// we don't get a segfault later when trying to actually use it.
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::fcntl::posix_fallocate(fd, 0, size as i64)
|
||||
.map_err(|e| Error::new("could not grow shmem segment, posix_fallocate failed", e))
|
||||
}
|
||||
// As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
nix::unistd::ftruncate(fd, size as i64)
|
||||
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use nix::unistd::ForkResult;
|
||||
use std::ops::Range;
|
||||
|
||||
/// check that all bytes in given range have the expected value.
|
||||
fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
|
||||
for i in range {
|
||||
let b = unsafe { *(ptr.add(i)) };
|
||||
assert_eq!(expected, b, "unexpected byte at offset {i}");
|
||||
}
|
||||
}
|
||||
|
||||
/// Write 'b' to all bytes in the given range
|
||||
fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
|
||||
unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
|
||||
}
|
||||
|
||||
// simple single-process test of growing and shrinking
|
||||
#[test]
|
||||
fn test_shmem_resize() -> Result<(), Error> {
|
||||
let max_size = 1024 * 1024;
|
||||
let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
|
||||
|
||||
assert_eq!(init_struct.current_size(), 0);
|
||||
|
||||
// Initial grow
|
||||
let size1 = 10000;
|
||||
init_struct.set_size(size1).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size1);
|
||||
|
||||
// Write some data
|
||||
let data_ptr = init_struct.data_ptr.as_ptr();
|
||||
write_range(data_ptr, 0xAA, 0..size1);
|
||||
assert_range(data_ptr, 0xAA, 0..size1);
|
||||
|
||||
// Shrink
|
||||
let size2 = 5000;
|
||||
init_struct.set_size(size2).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size2);
|
||||
|
||||
// Grow again
|
||||
let size3 = 20000;
|
||||
init_struct.set_size(size3).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size3);
|
||||
|
||||
// Try to read it. The area that was shrunk and grown again should read as all zeros now
|
||||
assert_range(data_ptr, 0xAA, 0..5000);
|
||||
assert_range(data_ptr, 0, 5000..size1);
|
||||
|
||||
// Try to grow beyond max_size
|
||||
//let size4 = max_size + 1;
|
||||
//assert!(init_struct.set_size(size4).is_err());
|
||||
|
||||
// Dropping init_struct should unmap the memory
|
||||
drop(init_struct);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is used in tests to coordinate between test processes. It's like `std::sync::Barrier`,
|
||||
/// but is stored in the shared memory area and works across processes. It's implemented by
|
||||
/// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
|
||||
struct SimpleBarrier {
|
||||
num_procs: usize,
|
||||
count: AtomicUsize,
|
||||
}
|
||||
|
||||
impl SimpleBarrier {
|
||||
unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
|
||||
unsafe {
|
||||
*ptr = SimpleBarrier {
|
||||
num_procs,
|
||||
count: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn wait(&self) {
|
||||
let old = self.count.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
let generation = old / self.num_procs;
|
||||
|
||||
let mut current = old + 1;
|
||||
while current < (generation + 1) * self.num_procs {
|
||||
std::thread::sleep(std::time::Duration::from_millis(10));
|
||||
current = self.count.load(Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multi_process() {
|
||||
// Initialize
|
||||
let max_size = 1_000_000_000_000;
|
||||
let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
|
||||
let ptr = init_struct.data_ptr.as_ptr();
|
||||
|
||||
// Store the SimpleBarrier in the first 1k of the area.
|
||||
init_struct.set_size(10000).unwrap();
|
||||
let barrier_ptr: *mut SimpleBarrier = unsafe {
|
||||
ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
|
||||
.cast()
|
||||
};
|
||||
unsafe { SimpleBarrier::init(barrier_ptr, 2) };
|
||||
let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
|
||||
|
||||
// Fork another test process. The code after this runs in both processes concurrently.
|
||||
let fork_result = unsafe { nix::unistd::fork().unwrap() };
|
||||
|
||||
// In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
|
||||
if fork_result.is_parent() {
|
||||
write_range(ptr, 0xAA, 1000..2000);
|
||||
} else {
|
||||
write_range(ptr, 0xBB, 2000..3000);
|
||||
}
|
||||
barrier.wait();
|
||||
// Verify the contents. (in both processes)
|
||||
assert_range(ptr, 0xAA, 1000..2000);
|
||||
assert_range(ptr, 0xBB, 2000..3000);
|
||||
|
||||
// Grow, from the child this time
|
||||
let size = 10_000_000;
|
||||
if !fork_result.is_parent() {
|
||||
init_struct.set_size(size).unwrap();
|
||||
}
|
||||
barrier.wait();
|
||||
|
||||
// make some writes at the end
|
||||
if fork_result.is_parent() {
|
||||
write_range(ptr, 0xAA, (size - 10)..size);
|
||||
} else {
|
||||
write_range(ptr, 0xBB, (size - 20)..(size - 10));
|
||||
}
|
||||
barrier.wait();
|
||||
|
||||
// Verify the contents. (This runs in both processes)
|
||||
assert_range(ptr, 0, (size - 1000)..(size - 20));
|
||||
assert_range(ptr, 0xBB, (size - 20)..(size - 10));
|
||||
assert_range(ptr, 0xAA, (size - 10)..size);
|
||||
|
||||
if let ForkResult::Parent { child } = fork_result {
|
||||
nix::sys::wait::waitpid(child, None).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,104 +0,0 @@
|
||||
//! Simple utilities akin to what's in [`std::sync`] but designed to work with shared memory.
|
||||
|
||||
use std::mem::MaybeUninit;
|
||||
use std::ptr::NonNull;
|
||||
|
||||
use nix::errno::Errno;
|
||||
|
||||
pub type RwLock<T> = lock_api::RwLock<PthreadRwLock, T>;
|
||||
pub(crate) type RwLockReadGuard<'a, T> = lock_api::RwLockReadGuard<'a, PthreadRwLock, T>;
|
||||
pub type RwLockWriteGuard<'a, T> = lock_api::RwLockWriteGuard<'a, PthreadRwLock, T>;
|
||||
pub type ValueReadGuard<'a, T> = lock_api::MappedRwLockReadGuard<'a, PthreadRwLock, T>;
|
||||
pub type ValueWriteGuard<'a, T> = lock_api::MappedRwLockWriteGuard<'a, PthreadRwLock, T>;
|
||||
|
||||
/// Shared memory read-write lock.
|
||||
pub struct PthreadRwLock(Option<NonNull<libc::pthread_rwlock_t>>);
|
||||
|
||||
impl PthreadRwLock {
|
||||
pub fn new(lock: *mut libc::pthread_rwlock_t) -> Self {
|
||||
unsafe {
|
||||
let mut attrs = MaybeUninit::uninit();
|
||||
// Ignoring return value here - only possible error is OOM.
|
||||
libc::pthread_rwlockattr_init(attrs.as_mut_ptr());
|
||||
libc::pthread_rwlockattr_setpshared(attrs.as_mut_ptr(), libc::PTHREAD_PROCESS_SHARED);
|
||||
// TODO(quantumish): worth making this function return Result?
|
||||
libc::pthread_rwlock_init(lock, attrs.as_mut_ptr());
|
||||
// Safety: POSIX specifies that "any function affecting the attributes
|
||||
// object (including destruction) shall not affect any previously
|
||||
// initialized read-write locks".
|
||||
libc::pthread_rwlockattr_destroy(attrs.as_mut_ptr());
|
||||
Self(Some(NonNull::new_unchecked(lock)))
|
||||
}
|
||||
}
|
||||
|
||||
fn inner(&self) -> NonNull<libc::pthread_rwlock_t> {
|
||||
match self.0 {
|
||||
None => {
|
||||
panic!("PthreadRwLock constructed badly - something likely used RawMutex::INIT")
|
||||
}
|
||||
Some(x) => x,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl lock_api::RawRwLock for PthreadRwLock {
|
||||
type GuardMarker = lock_api::GuardSend;
|
||||
const INIT: Self = Self(None);
|
||||
|
||||
fn lock_shared(&self) {
|
||||
unsafe {
|
||||
let res = libc::pthread_rwlock_rdlock(self.inner().as_ptr());
|
||||
if res != 0 {
|
||||
panic!("rdlock failed with {}", Errno::from_raw(res));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn try_lock_shared(&self) -> bool {
|
||||
unsafe {
|
||||
let res = libc::pthread_rwlock_tryrdlock(self.inner().as_ptr());
|
||||
match res {
|
||||
0 => true,
|
||||
libc::EAGAIN => false,
|
||||
_ => panic!("try_rdlock failed with {}", Errno::from_raw(res)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn lock_exclusive(&self) {
|
||||
unsafe {
|
||||
let res = libc::pthread_rwlock_wrlock(self.inner().as_ptr());
|
||||
if res != 0 {
|
||||
panic!("wrlock failed with {}", Errno::from_raw(res));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn try_lock_exclusive(&self) -> bool {
|
||||
unsafe {
|
||||
let res = libc::pthread_rwlock_trywrlock(self.inner().as_ptr());
|
||||
match res {
|
||||
0 => true,
|
||||
libc::EAGAIN => false,
|
||||
_ => panic!("try_wrlock failed with {}", Errno::from_raw(res)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn unlock_exclusive(&self) {
|
||||
unsafe {
|
||||
let res = libc::pthread_rwlock_unlock(self.inner().as_ptr());
|
||||
if res != 0 {
|
||||
panic!("unlock failed with {}", Errno::from_raw(res));
|
||||
}
|
||||
}
|
||||
}
|
||||
unsafe fn unlock_shared(&self) {
|
||||
unsafe {
|
||||
let res = libc::pthread_rwlock_unlock(self.inner().as_ptr());
|
||||
if res != 0 {
|
||||
panic!("unlock failed with {}", Errno::from_raw(res));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,14 +0,0 @@
|
||||
[package]
|
||||
name = "neonart"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
crossbeam-utils.workspace = true
|
||||
spin.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.9.1"
|
||||
rand_distr = "0.5.1"
|
||||
@@ -1,599 +0,0 @@
|
||||
mod lock_and_version;
|
||||
pub(crate) mod node_ptr;
|
||||
mod node_ref;
|
||||
|
||||
use std::vec::Vec;
|
||||
|
||||
use crate::algorithm::lock_and_version::ConcurrentUpdateError;
|
||||
use crate::algorithm::node_ptr::MAX_PREFIX_LEN;
|
||||
use crate::algorithm::node_ref::{NewNodeRef, NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
|
||||
use crate::allocator::OutOfMemoryError;
|
||||
|
||||
use crate::TreeWriteGuard;
|
||||
use crate::UpdateAction;
|
||||
use crate::allocator::ArtAllocator;
|
||||
use crate::epoch::EpochPin;
|
||||
use crate::{Key, Value};
|
||||
|
||||
pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum ArtError {
|
||||
ConcurrentUpdate, // need to retry
|
||||
OutOfMemory,
|
||||
}
|
||||
|
||||
impl From<ConcurrentUpdateError> for ArtError {
|
||||
fn from(_: ConcurrentUpdateError) -> ArtError {
|
||||
ArtError::ConcurrentUpdate
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OutOfMemoryError> for ArtError {
|
||||
fn from(_: OutOfMemoryError) -> ArtError {
|
||||
ArtError::OutOfMemory
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_root<V: Value>(
|
||||
allocator: &impl ArtAllocator<V>,
|
||||
) -> Result<RootPtr<V>, OutOfMemoryError> {
|
||||
node_ptr::new_root(allocator)
|
||||
}
|
||||
|
||||
pub(crate) fn search<'e, K: Key, V: Value>(
|
||||
key: &K,
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Option<&'e V> {
|
||||
loop {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
if let Ok(result) = lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) {
|
||||
break result;
|
||||
}
|
||||
// retry
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn iter_next<'e, V: Value>(
|
||||
key: &[u8],
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Option<(Vec<u8>, &'e V)> {
|
||||
loop {
|
||||
let mut path = Vec::new();
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
|
||||
match next_recurse(key, &mut path, root_ref, epoch_pin) {
|
||||
Ok(Some(v)) => {
|
||||
assert_eq!(path.len(), key.len());
|
||||
break Some((path, v));
|
||||
}
|
||||
Ok(None) => break None,
|
||||
Err(ConcurrentUpdateError()) => {
|
||||
// retry
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn update_fn<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
|
||||
key: &K,
|
||||
value_fn: F,
|
||||
root: RootPtr<V>,
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let value_fn_cell = std::cell::Cell::new(Some(value_fn));
|
||||
loop {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
|
||||
let key_bytes = key.as_bytes();
|
||||
|
||||
match update_recurse(
|
||||
key_bytes,
|
||||
this_value_fn,
|
||||
root_ref,
|
||||
None,
|
||||
None,
|
||||
guard,
|
||||
0,
|
||||
key_bytes,
|
||||
) {
|
||||
Ok(()) => break Ok(()),
|
||||
Err(ArtError::ConcurrentUpdate) => {
|
||||
continue; // retry
|
||||
}
|
||||
Err(ArtError::OutOfMemory) => break Err(OutOfMemoryError()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Error means you must retry.
|
||||
//
|
||||
// This corresponds to the 'lookupOpt' function in the paper
|
||||
#[allow(clippy::only_used_in_recursion)]
|
||||
fn lookup_recurse<'e, V: Value>(
|
||||
key: &[u8],
|
||||
node: NodeRef<'e, V>,
|
||||
parent: Option<ReadLockedNodeRef<V>>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Result<Option<&'e V>, ConcurrentUpdateError> {
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
if let Some(parent) = parent {
|
||||
parent.read_unlock_or_restart()?;
|
||||
}
|
||||
|
||||
// check if the prefix matches, may increment level
|
||||
let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
|
||||
prefix_len
|
||||
} else {
|
||||
rnode.read_unlock_or_restart()?;
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
if rnode.is_leaf() {
|
||||
assert_eq!(key.len(), prefix_len);
|
||||
let vptr = rnode.get_leaf_value_ptr()?;
|
||||
// safety: It's OK to return a ref of the pointer because we checked the version
|
||||
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
|
||||
// as long as the epoch is pinned.
|
||||
let v = unsafe { vptr.as_ref().unwrap() };
|
||||
return Ok(Some(v));
|
||||
}
|
||||
|
||||
let key = &key[prefix_len..];
|
||||
|
||||
// find child (or leaf value)
|
||||
let next_node = rnode.find_child_or_restart(key[0])?;
|
||||
|
||||
match next_node {
|
||||
None => Ok(None), // key not found
|
||||
Some(child) => lookup_recurse(&key[1..], child, Some(rnode), epoch_pin),
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::only_used_in_recursion)]
|
||||
fn next_recurse<'e, V: Value>(
|
||||
min_key: &[u8],
|
||||
path: &mut Vec<u8>,
|
||||
node: NodeRef<'e, V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Result<Option<&'e V>, ConcurrentUpdateError> {
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
let prefix = rnode.get_prefix();
|
||||
if !prefix.is_empty() {
|
||||
path.extend_from_slice(prefix);
|
||||
}
|
||||
|
||||
use std::cmp::Ordering;
|
||||
let comparison = path.as_slice().cmp(&min_key[0..path.len()]);
|
||||
if comparison == Ordering::Less {
|
||||
rnode.read_unlock_or_restart()?;
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if rnode.is_leaf() {
|
||||
assert_eq!(path.len(), min_key.len());
|
||||
let vptr = rnode.get_leaf_value_ptr()?;
|
||||
// safety: It's OK to return a ref of the pointer because we checked the version
|
||||
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
|
||||
// as long as the epoch is pinned.
|
||||
let v = unsafe { vptr.as_ref().unwrap() };
|
||||
return Ok(Some(v));
|
||||
}
|
||||
|
||||
let mut min_key_byte = match comparison {
|
||||
Ordering::Less => unreachable!(), // checked this above already
|
||||
Ordering::Equal => min_key[path.len()],
|
||||
Ordering::Greater => 0,
|
||||
};
|
||||
|
||||
loop {
|
||||
match rnode.find_next_child_or_restart(min_key_byte)? {
|
||||
None => {
|
||||
return Ok(None);
|
||||
}
|
||||
Some((key_byte, child_ref)) => {
|
||||
let path_len = path.len();
|
||||
path.push(key_byte);
|
||||
let result = next_recurse(min_key, path, child_ref, epoch_pin)?;
|
||||
if result.is_some() {
|
||||
return Ok(result);
|
||||
}
|
||||
if key_byte == u8::MAX {
|
||||
return Ok(None);
|
||||
}
|
||||
path.truncate(path_len);
|
||||
min_key_byte = key_byte + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This corresponds to the 'insertOpt' function in the paper
|
||||
#[allow(clippy::only_used_in_recursion)]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn update_recurse<'e, K: Key, V: Value, A: ArtAllocator<V>, F>(
|
||||
key: &[u8],
|
||||
value_fn: F,
|
||||
node: NodeRef<'e, V>,
|
||||
rparent: Option<(ReadLockedNodeRef<V>, u8)>,
|
||||
rgrandparent: Option<(ReadLockedNodeRef<V>, u8)>,
|
||||
guard: &'_ mut TreeWriteGuard<'e, K, V, A>,
|
||||
level: usize,
|
||||
orig_key: &[u8],
|
||||
) -> Result<(), ArtError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
|
||||
let prefix_match_len = rnode.prefix_matches(key);
|
||||
if prefix_match_len.is_none() {
|
||||
let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
match value_fn(None) {
|
||||
UpdateAction::Nothing => {}
|
||||
UpdateAction::Insert(new_value) => {
|
||||
insert_split_prefix(key, new_value, &mut wnode, &mut wparent, parent_key, guard)?;
|
||||
}
|
||||
UpdateAction::Remove => {
|
||||
panic!("unexpected Remove action on insertion");
|
||||
}
|
||||
}
|
||||
wnode.write_unlock();
|
||||
wparent.write_unlock();
|
||||
return Ok(());
|
||||
}
|
||||
let prefix_match_len = prefix_match_len.unwrap();
|
||||
let key = &key[prefix_match_len..];
|
||||
let level = level + prefix_match_len;
|
||||
|
||||
if rnode.is_leaf() {
|
||||
assert_eq!(key.len(), 0);
|
||||
let (rparent, parent_key) = rparent.expect("root cannot be leaf");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
// safety: Now that we have acquired the write lock, we have exclusive access to the
|
||||
// value. XXX: There might be concurrent reads though?
|
||||
let value_mut = wnode.get_leaf_value_mut();
|
||||
|
||||
match value_fn(Some(value_mut)) {
|
||||
UpdateAction::Nothing => {
|
||||
wparent.write_unlock();
|
||||
wnode.write_unlock();
|
||||
}
|
||||
UpdateAction::Insert(_) => panic!("cannot insert over existing value"),
|
||||
UpdateAction::Remove => {
|
||||
guard.remember_obsolete_node(wnode.as_ptr());
|
||||
wparent.delete_child(parent_key);
|
||||
wnode.write_unlock_obsolete();
|
||||
|
||||
if let Some(rgrandparent) = rgrandparent {
|
||||
// FIXME: Ignore concurrency error. It doesn't lead to
|
||||
// corruption, but it means we might leak something. Until
|
||||
// another update cleans it up.
|
||||
let _ = cleanup_parent(wparent, rgrandparent, guard);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let next_node = rnode.find_child_or_restart(key[0])?;
|
||||
|
||||
if next_node.is_none() {
|
||||
if rnode.is_full() {
|
||||
let (rparent, parent_key) = rparent.expect("root node cannot become full");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
match value_fn(None) {
|
||||
UpdateAction::Nothing => {
|
||||
wnode.write_unlock();
|
||||
wparent.write_unlock();
|
||||
}
|
||||
UpdateAction::Insert(new_value) => {
|
||||
insert_and_grow(key, new_value, wnode, &mut wparent, parent_key, guard)?;
|
||||
wparent.write_unlock();
|
||||
}
|
||||
UpdateAction::Remove => {
|
||||
panic!("unexpected Remove action on insertion");
|
||||
}
|
||||
};
|
||||
} else {
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
if let Some((rparent, _)) = rparent {
|
||||
rparent.read_unlock_or_restart()?;
|
||||
}
|
||||
match value_fn(None) {
|
||||
UpdateAction::Nothing => {}
|
||||
UpdateAction::Insert(new_value) => {
|
||||
insert_to_node(&mut wnode, key, new_value, guard)?;
|
||||
}
|
||||
UpdateAction::Remove => {
|
||||
panic!("unexpected Remove action on insertion");
|
||||
}
|
||||
};
|
||||
wnode.write_unlock();
|
||||
}
|
||||
Ok(())
|
||||
} else {
|
||||
let next_child = next_node.unwrap(); // checked above it's not None
|
||||
if let Some((ref rparent, _)) = rparent {
|
||||
rparent.check_or_restart()?;
|
||||
}
|
||||
|
||||
// recurse to next level
|
||||
update_recurse(
|
||||
&key[1..],
|
||||
value_fn,
|
||||
next_child,
|
||||
Some((rnode, key[0])),
|
||||
rparent,
|
||||
guard,
|
||||
level + 1,
|
||||
orig_key,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
enum PathElement {
|
||||
Prefix(Vec<u8>),
|
||||
KeyByte(u8),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for PathElement {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
match self {
|
||||
PathElement::Prefix(prefix) => write!(fmt, "{prefix:?}"),
|
||||
PathElement::KeyByte(key_byte) => write!(fmt, "{key_byte}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn dump_tree<V: Value + std::fmt::Debug>(
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'_ EpochPin,
|
||||
dst: &mut dyn std::io::Write,
|
||||
) {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
|
||||
let _ = dump_recurse(&[], root_ref, epoch_pin, 0, dst);
|
||||
}
|
||||
|
||||
// TODO: return an Err if writeln!() returns error, instead of unwrapping
|
||||
#[allow(clippy::only_used_in_recursion)]
|
||||
fn dump_recurse<'e, V: Value + std::fmt::Debug>(
|
||||
path: &[PathElement],
|
||||
node: NodeRef<'e, V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
level: usize,
|
||||
dst: &mut dyn std::io::Write,
|
||||
) -> Result<(), ConcurrentUpdateError> {
|
||||
let indent = str::repeat(" ", level);
|
||||
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
let mut path = Vec::from(path);
|
||||
let prefix = rnode.get_prefix();
|
||||
if !prefix.is_empty() {
|
||||
path.push(PathElement::Prefix(Vec::from(prefix)));
|
||||
}
|
||||
|
||||
if rnode.is_leaf() {
|
||||
let vptr = rnode.get_leaf_value_ptr()?;
|
||||
// safety: It's OK to return a ref of the pointer because we checked the version
|
||||
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
|
||||
// as long as the epoch is pinned.
|
||||
let val = unsafe { vptr.as_ref().unwrap() };
|
||||
writeln!(dst, "{indent} {path:?}: {val:?}").unwrap();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
for key_byte in 0..=u8::MAX {
|
||||
match rnode.find_child_or_restart(key_byte)? {
|
||||
None => continue,
|
||||
Some(child_ref) => {
|
||||
let rchild = child_ref.read_lock_or_restart()?;
|
||||
writeln!(
|
||||
dst,
|
||||
"{} {:?}, {}: prefix {:?}",
|
||||
indent,
|
||||
&path,
|
||||
key_byte,
|
||||
rchild.get_prefix()
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let mut child_path = path.clone();
|
||||
child_path.push(PathElement::KeyByte(key_byte));
|
||||
|
||||
dump_recurse(&child_path, child_ref, epoch_pin, level + 1, dst)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///```text
|
||||
/// [fooba]r -> value
|
||||
///
|
||||
/// [foo]b -> [a]r -> value
|
||||
/// e -> [ls]e -> value
|
||||
///```
|
||||
fn insert_split_prefix<K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
node: &mut WriteLockedNodeRef<V>,
|
||||
parent: &mut WriteLockedNodeRef<V>,
|
||||
parent_key: u8,
|
||||
guard: &'_ TreeWriteGuard<K, V, A>,
|
||||
) -> Result<(), OutOfMemoryError> {
|
||||
let old_node = node;
|
||||
let old_prefix = old_node.get_prefix();
|
||||
let common_prefix_len = common_prefix(key, old_prefix);
|
||||
|
||||
// Allocate a node for the new value.
|
||||
let new_value_node = allocate_node_for_value(
|
||||
&key[common_prefix_len + 1..],
|
||||
value,
|
||||
guard.tree_writer.allocator,
|
||||
)?;
|
||||
|
||||
// Allocate a new internal node with the common prefix
|
||||
// FIXME: deallocate 'new_value_node' on OOM
|
||||
let mut prefix_node =
|
||||
node_ref::new_internal(&key[..common_prefix_len], guard.tree_writer.allocator)?;
|
||||
|
||||
// Add the old node and the new nodes to the new internal node
|
||||
prefix_node.insert_old_child(old_prefix[common_prefix_len], old_node);
|
||||
prefix_node.insert_new_child(key[common_prefix_len], new_value_node);
|
||||
|
||||
// Modify the prefix of the old child in place
|
||||
old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
|
||||
|
||||
// replace the pointer in the parent
|
||||
parent.replace_child(parent_key, prefix_node.into_ptr());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_to_node<K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
wnode: &mut WriteLockedNodeRef<V>,
|
||||
key: &[u8],
|
||||
value: V,
|
||||
guard: &'_ TreeWriteGuard<K, V, A>,
|
||||
) -> Result<(), OutOfMemoryError> {
|
||||
let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
|
||||
wnode.insert_child(key[0], value_child.into_ptr());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// On entry: 'parent' and 'node' are locked
|
||||
fn insert_and_grow<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
wnode: WriteLockedNodeRef<V>,
|
||||
parent: &mut WriteLockedNodeRef<V>,
|
||||
parent_key_byte: u8,
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
) -> Result<(), ArtError> {
|
||||
let mut bigger_node = wnode.grow(guard.tree_writer.allocator)?;
|
||||
|
||||
// FIXME: deallocate 'bigger_node' on OOM
|
||||
let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
|
||||
bigger_node.insert_new_child(key[0], value_child);
|
||||
|
||||
// Replace the pointer in the parent
|
||||
parent.replace_child(parent_key_byte, bigger_node.into_ptr());
|
||||
|
||||
guard.remember_obsolete_node(wnode.as_ptr());
|
||||
wnode.write_unlock_obsolete();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cleanup_parent<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
wparent: WriteLockedNodeRef<V>,
|
||||
rgrandparent: (ReadLockedNodeRef<V>, u8),
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
) -> Result<(), ArtError> {
|
||||
let (rgrandparent, grandparent_key_byte) = rgrandparent;
|
||||
|
||||
// If the parent becomes completely empty after the deletion, remove the parent from the
|
||||
// grandparent. (This case is possible because we reserve only 8 bytes for the prefix.)
|
||||
// TODO: not implemented.
|
||||
|
||||
// If the parent has only one child, replace the parent with the remaining child. (This is not
|
||||
// possible if the child's prefix field cannot absorb the parent's)
|
||||
if wparent.num_children() == 1 {
|
||||
// Try to lock the remaining child. This can fail if the child is updated
|
||||
// concurrently.
|
||||
let (key_byte, remaining_child) = wparent.find_remaining_child();
|
||||
|
||||
let mut wremaining_child = remaining_child.write_lock_or_restart()?;
|
||||
|
||||
if 1 + wremaining_child.get_prefix().len() + wparent.get_prefix().len() <= MAX_PREFIX_LEN {
|
||||
let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
// Ok, we have locked the leaf, the parent, the grandparent, and the parent's only
|
||||
// remaining leaf. Proceed with the updates.
|
||||
|
||||
// Update the prefix on the remaining leaf
|
||||
wremaining_child.prepend_prefix(wparent.get_prefix(), key_byte);
|
||||
|
||||
// Replace the pointer in the grandparent to point directly to the remaining leaf
|
||||
wgrandparent.replace_child(grandparent_key_byte, wremaining_child.as_ptr());
|
||||
|
||||
// Mark the parent as deleted.
|
||||
guard.remember_obsolete_node(wparent.as_ptr());
|
||||
wparent.write_unlock_obsolete();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
// If the parent's children would fit on a smaller node type after the deletion, replace it with
|
||||
// a smaller node.
|
||||
if wparent.can_shrink() {
|
||||
let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
|
||||
let smaller_node = wparent.shrink(guard.tree_writer.allocator)?;
|
||||
|
||||
// Replace the pointer in the grandparent
|
||||
wgrandparent.replace_child(grandparent_key_byte, smaller_node.into_ptr());
|
||||
|
||||
guard.remember_obsolete_node(wparent.as_ptr());
|
||||
wparent.write_unlock_obsolete();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// nothing to do
|
||||
wparent.write_unlock();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Allocate a new leaf node to hold 'value'. If the key is long, we
|
||||
// may need to allocate new internal nodes to hold it too
|
||||
fn allocate_node_for_value<'a, V: Value, A: ArtAllocator<V>>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError> {
|
||||
let mut prefix_off = key.len().saturating_sub(MAX_PREFIX_LEN);
|
||||
|
||||
let leaf_node = node_ref::new_leaf(&key[prefix_off..key.len()], value, allocator)?;
|
||||
|
||||
let mut node = leaf_node;
|
||||
while prefix_off > 0 {
|
||||
// Need another internal node
|
||||
let remain_prefix = &key[0..prefix_off];
|
||||
|
||||
prefix_off = remain_prefix.len().saturating_sub(MAX_PREFIX_LEN + 1);
|
||||
let mut internal_node = node_ref::new_internal(
|
||||
&remain_prefix[prefix_off..remain_prefix.len() - 1],
|
||||
allocator,
|
||||
)?;
|
||||
internal_node.insert_new_child(*remain_prefix.last().unwrap(), node);
|
||||
node = internal_node;
|
||||
}
|
||||
|
||||
Ok(node)
|
||||
}
|
||||
|
||||
fn common_prefix(a: &[u8], b: &[u8]) -> usize {
|
||||
for i in 0..MAX_PREFIX_LEN {
|
||||
if a[i] != b[i] {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
panic!("prefixes are equal");
|
||||
}
|
||||
@@ -1,117 +0,0 @@
|
||||
//! Each node in the tree has contains one atomic word that stores three things:
|
||||
//!
|
||||
//! Bit 0: set if the node is "obsolete". An obsolete node has been removed from the tree,
|
||||
//! but might still be accessed by concurrent readers until the epoch expires.
|
||||
//! Bit 1: set if the node is currently write-locked. Used as a spinlock.
|
||||
//! Bits 2-63: Version number, incremented every time the node is modified.
|
||||
//!
|
||||
//! AtomicLockAndVersion represents that.
|
||||
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
pub(crate) struct ConcurrentUpdateError();
|
||||
|
||||
pub(crate) struct AtomicLockAndVersion {
|
||||
inner: AtomicU64,
|
||||
}
|
||||
|
||||
impl AtomicLockAndVersion {
|
||||
pub(crate) fn new() -> AtomicLockAndVersion {
|
||||
AtomicLockAndVersion {
|
||||
inner: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AtomicLockAndVersion {
|
||||
pub(crate) fn read_lock_or_restart(&self) -> Result<u64, ConcurrentUpdateError> {
|
||||
let version = self.await_node_unlocked();
|
||||
if is_obsolete(version) {
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(version)
|
||||
}
|
||||
|
||||
pub(crate) fn check_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
|
||||
self.read_unlock_or_restart(version)
|
||||
}
|
||||
|
||||
pub(crate) fn read_unlock_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
|
||||
if self.inner.load(Ordering::Acquire) != version {
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn upgrade_to_write_lock_or_restart(
|
||||
&self,
|
||||
version: u64,
|
||||
) -> Result<(), ConcurrentUpdateError> {
|
||||
if self
|
||||
.inner
|
||||
.compare_exchange(
|
||||
version,
|
||||
set_locked_bit(version),
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
)
|
||||
.is_err()
|
||||
{
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn write_lock_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
|
||||
let old = self.inner.load(Ordering::Relaxed);
|
||||
if is_obsolete(old) || is_locked(old) {
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
if self
|
||||
.inner
|
||||
.compare_exchange(
|
||||
old,
|
||||
set_locked_bit(old),
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
)
|
||||
.is_err()
|
||||
{
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock(&self) {
|
||||
// reset locked bit and overflow into version
|
||||
self.inner.fetch_add(2, Ordering::Release);
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock_obsolete(&self) {
|
||||
// set obsolete, reset locked, overflow into version
|
||||
self.inner.fetch_add(3, Ordering::Release);
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
fn await_node_unlocked(&self) -> u64 {
|
||||
let mut version = self.inner.load(Ordering::Acquire);
|
||||
while is_locked(version) {
|
||||
// spinlock
|
||||
std::thread::yield_now();
|
||||
version = self.inner.load(Ordering::Acquire)
|
||||
}
|
||||
version
|
||||
}
|
||||
}
|
||||
|
||||
fn set_locked_bit(version: u64) -> u64 {
|
||||
version + 2
|
||||
}
|
||||
|
||||
fn is_obsolete(version: u64) -> bool {
|
||||
(version & 1) == 1
|
||||
}
|
||||
|
||||
fn is_locked(version: u64) -> bool {
|
||||
(version & 2) == 2
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,349 +0,0 @@
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use super::node_ptr;
|
||||
use super::node_ptr::NodePtr;
|
||||
use crate::EpochPin;
|
||||
use crate::Value;
|
||||
use crate::algorithm::lock_and_version::AtomicLockAndVersion;
|
||||
use crate::algorithm::lock_and_version::ConcurrentUpdateError;
|
||||
use crate::allocator::ArtAllocator;
|
||||
use crate::allocator::OutOfMemoryError;
|
||||
|
||||
pub struct NodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
|
||||
phantom: PhantomData<&'e EpochPin<'e>>,
|
||||
}
|
||||
|
||||
impl<'e, V> Debug for NodeRef<'e, V> {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
write!(fmt, "{:?}", self.ptr)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'e, V: Value> NodeRef<'e, V> {
|
||||
pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
|
||||
NodeRef {
|
||||
ptr: root_ptr,
|
||||
phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn read_lock_or_restart(
|
||||
&self,
|
||||
) -> Result<ReadLockedNodeRef<'e, V>, ConcurrentUpdateError> {
|
||||
let version = self.lockword().read_lock_or_restart()?;
|
||||
Ok(ReadLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
version,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn write_lock_or_restart(
|
||||
&self,
|
||||
) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
|
||||
self.lockword().write_lock_or_restart()?;
|
||||
Ok(WriteLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
fn lockword(&self) -> &AtomicLockAndVersion {
|
||||
self.ptr.lockword()
|
||||
}
|
||||
}
|
||||
|
||||
/// A reference to a node that has been optimistically read-locked. The functions re-check
|
||||
/// the version after each read.
|
||||
pub struct ReadLockedNodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
version: u64,
|
||||
|
||||
phantom: PhantomData<&'e EpochPin<'e>>,
|
||||
}
|
||||
|
||||
impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
|
||||
pub(crate) fn is_leaf(&self) -> bool {
|
||||
self.ptr.is_leaf()
|
||||
}
|
||||
|
||||
pub(crate) fn is_full(&self) -> bool {
|
||||
self.ptr.is_full()
|
||||
}
|
||||
|
||||
pub(crate) fn get_prefix(&self) -> &[u8] {
|
||||
self.ptr.get_prefix()
|
||||
}
|
||||
|
||||
/// Note: because we're only holding a read lock, the prefix can change concurrently.
|
||||
/// You must be prepared to restart, if read_unlock() returns error later.
|
||||
///
|
||||
/// Returns the length of the prefix, or None if it's not a match
|
||||
pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
|
||||
self.ptr.prefix_matches(key)
|
||||
}
|
||||
|
||||
pub(crate) fn find_child_or_restart(
|
||||
&self,
|
||||
key_byte: u8,
|
||||
) -> Result<Option<NodeRef<'e, V>>, ConcurrentUpdateError> {
|
||||
let child_or_value = self.ptr.find_child(key_byte);
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
match child_or_value {
|
||||
None => Ok(None),
|
||||
Some(child_ptr) => Ok(Some(NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
})),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn find_next_child_or_restart(
|
||||
&self,
|
||||
min_key_byte: u8,
|
||||
) -> Result<Option<(u8, NodeRef<'e, V>)>, ConcurrentUpdateError> {
|
||||
let child_or_value = self.ptr.find_next_child(min_key_byte);
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
match child_or_value {
|
||||
None => Ok(None),
|
||||
Some((k, child_ptr)) => Ok(Some((
|
||||
k,
|
||||
NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
},
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_leaf_value_ptr(&self) -> Result<*const V, ConcurrentUpdateError> {
|
||||
let result = self.ptr.get_leaf_value();
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
// Extend the lifetime.
|
||||
let result = std::ptr::from_ref(result);
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub(crate) fn upgrade_to_write_lock_or_restart(
|
||||
self,
|
||||
) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
|
||||
self.ptr
|
||||
.lockword()
|
||||
.upgrade_to_write_lock_or_restart(self.version)?;
|
||||
|
||||
Ok(WriteLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn read_unlock_or_restart(self) -> Result<(), ConcurrentUpdateError> {
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn check_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A reference to a node that has been optimistically read-locked. The functions re-check
|
||||
/// the version after each read.
|
||||
pub struct WriteLockedNodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
phantom: PhantomData<&'e EpochPin<'e>>,
|
||||
}
|
||||
|
||||
impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
|
||||
pub(crate) fn can_shrink(&self) -> bool {
|
||||
self.ptr.can_shrink()
|
||||
}
|
||||
|
||||
pub(crate) fn num_children(&self) -> usize {
|
||||
self.ptr.num_children()
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock(mut self) {
|
||||
self.ptr.lockword().write_unlock();
|
||||
self.ptr = NodePtr::null();
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock_obsolete(mut self) {
|
||||
self.ptr.lockword().write_unlock_obsolete();
|
||||
self.ptr = NodePtr::null();
|
||||
}
|
||||
|
||||
pub(crate) fn get_prefix(&self) -> &[u8] {
|
||||
self.ptr.get_prefix()
|
||||
}
|
||||
|
||||
pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
self.ptr.truncate_prefix(new_prefix_len)
|
||||
}
|
||||
|
||||
pub(crate) fn prepend_prefix(&mut self, prefix: &[u8], prefix_byte: u8) {
|
||||
self.ptr.prepend_prefix(prefix, prefix_byte)
|
||||
}
|
||||
|
||||
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
self.ptr.insert_child(key_byte, child)
|
||||
}
|
||||
|
||||
pub(crate) fn get_leaf_value_mut(&mut self) -> &mut V {
|
||||
self.ptr.get_leaf_value_mut()
|
||||
}
|
||||
|
||||
pub(crate) fn grow<'a, A>(
|
||||
&self,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
let new_node = self.ptr.grow(allocator)?;
|
||||
Ok(NewNodeRef {
|
||||
ptr: new_node,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn shrink<'a, A>(
|
||||
&self,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
let new_node = self.ptr.shrink(allocator)?;
|
||||
Ok(NewNodeRef {
|
||||
ptr: new_node,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn as_ptr(&self) -> NodePtr<V> {
|
||||
self.ptr
|
||||
}
|
||||
|
||||
pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
self.ptr.replace_child(key_byte, replacement);
|
||||
}
|
||||
|
||||
pub(crate) fn delete_child(&mut self, key_byte: u8) {
|
||||
self.ptr.delete_child(key_byte);
|
||||
}
|
||||
|
||||
pub(crate) fn find_remaining_child(&self) -> (u8, NodeRef<'e, V>) {
|
||||
assert_eq!(self.num_children(), 1);
|
||||
let child_or_value = self.ptr.find_next_child(0);
|
||||
|
||||
match child_or_value {
|
||||
None => panic!("could not find only child in node"),
|
||||
Some((k, child_ptr)) => (
|
||||
k,
|
||||
NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
self.ptr.lockword().write_unlock();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct NewNodeRef<'a, V, A>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
ptr: NodePtr<V>,
|
||||
allocator: &'a A,
|
||||
|
||||
extra_nodes: Vec<NodePtr<V>>,
|
||||
}
|
||||
|
||||
impl<'a, V, A> NewNodeRef<'a, V, A>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
pub(crate) fn insert_old_child(&mut self, key_byte: u8, child: &WriteLockedNodeRef<V>) {
|
||||
self.ptr.insert_child(key_byte, child.as_ptr())
|
||||
}
|
||||
|
||||
pub(crate) fn into_ptr(mut self) -> NodePtr<V> {
|
||||
let ptr = self.ptr;
|
||||
self.ptr = NodePtr::null();
|
||||
ptr
|
||||
}
|
||||
|
||||
pub(crate) fn insert_new_child(&mut self, key_byte: u8, child: NewNodeRef<'a, V, A>) {
|
||||
let child_ptr = child.into_ptr();
|
||||
self.ptr.insert_child(key_byte, child_ptr);
|
||||
self.extra_nodes.push(child_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, V, A> Drop for NewNodeRef<'a, V, A>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
/// This drop implementation deallocates the newly allocated node, if into_ptr() was not called.
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
self.ptr.deallocate(self.allocator);
|
||||
for p in self.extra_nodes.iter() {
|
||||
p.deallocate(self.allocator);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new_internal<'a, V, A>(
|
||||
prefix: &[u8],
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
Ok(NewNodeRef {
|
||||
ptr: node_ptr::new_internal(prefix, allocator)?,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn new_leaf<'a, V, A>(
|
||||
prefix: &[u8],
|
||||
value: V,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
Ok(NewNodeRef {
|
||||
ptr: node_ptr::new_leaf(prefix, value, allocator)?,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
@@ -1,156 +0,0 @@
|
||||
pub mod block;
|
||||
mod multislab;
|
||||
mod slab;
|
||||
pub mod r#static;
|
||||
|
||||
use std::alloc::Layout;
|
||||
use std::marker::PhantomData;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use crate::allocator::multislab::MultiSlabAllocator;
|
||||
use crate::allocator::r#static::alloc_from_slice;
|
||||
|
||||
use spin;
|
||||
|
||||
use crate::Tree;
|
||||
pub use crate::algorithm::node_ptr::{
|
||||
NodeInternal4, NodeInternal16, NodeInternal48, NodeInternal256, NodeLeaf,
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct OutOfMemoryError();
|
||||
|
||||
pub trait ArtAllocator<V: crate::Value> {
|
||||
fn alloc_tree(&self) -> *mut Tree<V>;
|
||||
|
||||
fn alloc_node_internal4(&self) -> *mut NodeInternal4<V>;
|
||||
fn alloc_node_internal16(&self) -> *mut NodeInternal16<V>;
|
||||
fn alloc_node_internal48(&self) -> *mut NodeInternal48<V>;
|
||||
fn alloc_node_internal256(&self) -> *mut NodeInternal256<V>;
|
||||
fn alloc_node_leaf(&self) -> *mut NodeLeaf<V>;
|
||||
|
||||
fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>);
|
||||
fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>);
|
||||
fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>);
|
||||
fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>);
|
||||
fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>);
|
||||
}
|
||||
|
||||
pub struct ArtMultiSlabAllocator<'t, V>
|
||||
where
|
||||
V: crate::Value,
|
||||
{
|
||||
tree_area: spin::Mutex<Option<&'t mut MaybeUninit<Tree<V>>>>,
|
||||
|
||||
pub(crate) inner: MultiSlabAllocator<'t, 5>,
|
||||
|
||||
phantom_val: PhantomData<V>,
|
||||
}
|
||||
|
||||
impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
|
||||
const LAYOUTS: [Layout; 5] = [
|
||||
Layout::new::<NodeInternal4<V>>(),
|
||||
Layout::new::<NodeInternal16<V>>(),
|
||||
Layout::new::<NodeInternal48<V>>(),
|
||||
Layout::new::<NodeInternal256<V>>(),
|
||||
Layout::new::<NodeLeaf<V>>(),
|
||||
];
|
||||
|
||||
pub fn new(area: &'t mut [MaybeUninit<u8>]) -> &'t mut ArtMultiSlabAllocator<'t, V> {
|
||||
let (allocator_area, remain) = alloc_from_slice::<ArtMultiSlabAllocator<V>>(area);
|
||||
let (tree_area, remain) = alloc_from_slice::<Tree<V>>(remain);
|
||||
|
||||
allocator_area.write(ArtMultiSlabAllocator {
|
||||
tree_area: spin::Mutex::new(Some(tree_area)),
|
||||
inner: MultiSlabAllocator::new(remain, &Self::LAYOUTS),
|
||||
phantom_val: PhantomData,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, V: crate::Value> ArtAllocator<V> for ArtMultiSlabAllocator<'t, V> {
|
||||
fn alloc_tree(&self) -> *mut Tree<V> {
|
||||
let mut t = self.tree_area.lock();
|
||||
if let Some(tree_area) = t.take() {
|
||||
return tree_area.as_mut_ptr().cast();
|
||||
}
|
||||
panic!("cannot allocate more than one tree");
|
||||
}
|
||||
|
||||
fn alloc_node_internal4(&self) -> *mut NodeInternal4<V> {
|
||||
self.inner.alloc_slab(0).cast()
|
||||
}
|
||||
fn alloc_node_internal16(&self) -> *mut NodeInternal16<V> {
|
||||
self.inner.alloc_slab(1).cast()
|
||||
}
|
||||
fn alloc_node_internal48(&self) -> *mut NodeInternal48<V> {
|
||||
self.inner.alloc_slab(2).cast()
|
||||
}
|
||||
fn alloc_node_internal256(&self) -> *mut NodeInternal256<V> {
|
||||
self.inner.alloc_slab(3).cast()
|
||||
}
|
||||
fn alloc_node_leaf(&self) -> *mut NodeLeaf<V> {
|
||||
self.inner.alloc_slab(4).cast()
|
||||
}
|
||||
|
||||
fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>) {
|
||||
self.inner.dealloc_slab(0, ptr.cast())
|
||||
}
|
||||
|
||||
fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>) {
|
||||
self.inner.dealloc_slab(1, ptr.cast())
|
||||
}
|
||||
fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>) {
|
||||
self.inner.dealloc_slab(2, ptr.cast())
|
||||
}
|
||||
fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>) {
|
||||
self.inner.dealloc_slab(3, ptr.cast())
|
||||
}
|
||||
fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>) {
|
||||
self.inner.dealloc_slab(4, ptr.cast())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
|
||||
pub(crate) fn get_statistics(&self) -> ArtMultiSlabStats {
|
||||
ArtMultiSlabStats {
|
||||
num_internal4: self.inner.slab_descs[0]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_internal16: self.inner.slab_descs[1]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_internal48: self.inner.slab_descs[2]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_internal256: self.inner.slab_descs[3]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_leaf: self.inner.slab_descs[4]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
|
||||
num_blocks_internal4: self.inner.slab_descs[0].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_internal16: self.inner.slab_descs[1].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_internal48: self.inner.slab_descs[2].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_internal256: self.inner.slab_descs[3].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_leaf: self.inner.slab_descs[4].num_blocks.load(Ordering::Relaxed),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ArtMultiSlabStats {
|
||||
pub num_internal4: u64,
|
||||
pub num_internal16: u64,
|
||||
pub num_internal48: u64,
|
||||
pub num_internal256: u64,
|
||||
pub num_leaf: u64,
|
||||
|
||||
pub num_blocks_internal4: u64,
|
||||
pub num_blocks_internal16: u64,
|
||||
pub num_blocks_internal48: u64,
|
||||
pub num_blocks_internal256: u64,
|
||||
pub num_blocks_leaf: u64,
|
||||
}
|
||||
@@ -1,191 +0,0 @@
|
||||
//! Simple allocator of fixed-size blocks
|
||||
|
||||
use std::mem::MaybeUninit;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use spin;
|
||||
|
||||
pub const BLOCK_SIZE: usize = 16 * 1024;
|
||||
|
||||
const INVALID_BLOCK: u64 = u64::MAX;
|
||||
|
||||
pub(crate) struct BlockAllocator<'t> {
|
||||
blocks_ptr: &'t [MaybeUninit<u8>],
|
||||
num_blocks: u64,
|
||||
num_initialized: AtomicU64,
|
||||
|
||||
freelist_head: spin::Mutex<u64>,
|
||||
}
|
||||
|
||||
struct FreeListBlock {
|
||||
inner: spin::Mutex<FreeListBlockInner>,
|
||||
}
|
||||
|
||||
struct FreeListBlockInner {
|
||||
next: u64,
|
||||
|
||||
num_free_blocks: u64,
|
||||
free_blocks: [u64; 100], // FIXME: fill the rest of the block
|
||||
}
|
||||
|
||||
impl<'t> BlockAllocator<'t> {
|
||||
pub(crate) fn new(area: &'t mut [MaybeUninit<u8>]) -> Self {
|
||||
// Use all the space for the blocks
|
||||
let padding = area.as_ptr().align_offset(BLOCK_SIZE);
|
||||
let remain = &mut area[padding..];
|
||||
|
||||
let num_blocks = (remain.len() / BLOCK_SIZE) as u64;
|
||||
|
||||
BlockAllocator {
|
||||
blocks_ptr: remain,
|
||||
num_blocks,
|
||||
num_initialized: AtomicU64::new(0),
|
||||
freelist_head: spin::Mutex::new(INVALID_BLOCK),
|
||||
}
|
||||
}
|
||||
|
||||
/// safety: you must hold a lock on the pointer to this block, otherwise it might get
|
||||
/// reused for another kind of block
|
||||
fn read_freelist_block(&self, blkno: u64) -> &FreeListBlock {
|
||||
let ptr: *const FreeListBlock = self.get_block_ptr(blkno).cast();
|
||||
unsafe { ptr.as_ref().unwrap() }
|
||||
}
|
||||
|
||||
fn get_block_ptr(&self, blkno: u64) -> *mut u8 {
|
||||
assert!(blkno < self.num_blocks);
|
||||
unsafe {
|
||||
self.blocks_ptr
|
||||
.as_ptr()
|
||||
.byte_offset(blkno as isize * BLOCK_SIZE as isize)
|
||||
}
|
||||
.cast_mut()
|
||||
.cast()
|
||||
}
|
||||
|
||||
#[allow(clippy::mut_from_ref)]
|
||||
pub(crate) fn alloc_block(&self) -> &mut [MaybeUninit<u8>] {
|
||||
// FIXME: handle OOM
|
||||
let blkno = self.alloc_block_internal();
|
||||
if blkno == INVALID_BLOCK {
|
||||
panic!("out of memory");
|
||||
}
|
||||
|
||||
let ptr: *mut MaybeUninit<u8> = self.get_block_ptr(blkno).cast();
|
||||
unsafe { std::slice::from_raw_parts_mut(ptr, BLOCK_SIZE) }
|
||||
}
|
||||
|
||||
fn alloc_block_internal(&self) -> u64 {
|
||||
// check the free list.
|
||||
{
|
||||
let mut freelist_head = self.freelist_head.lock();
|
||||
if *freelist_head != INVALID_BLOCK {
|
||||
let freelist_block = self.read_freelist_block(*freelist_head);
|
||||
|
||||
// acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
|
||||
let mut g = freelist_block.inner.lock();
|
||||
|
||||
if g.num_free_blocks > 0 {
|
||||
g.num_free_blocks -= 1;
|
||||
let result = g.free_blocks[g.num_free_blocks as usize];
|
||||
return result;
|
||||
} else {
|
||||
// consume the freelist block itself
|
||||
let result = *freelist_head;
|
||||
*freelist_head = g.next;
|
||||
// This freelist block is now unlinked and can be repurposed
|
||||
drop(g);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If there are some blocks left that we've never used, pick next such block
|
||||
let mut next_uninitialized = self.num_initialized.load(Ordering::Relaxed);
|
||||
while next_uninitialized < self.num_blocks {
|
||||
match self.num_initialized.compare_exchange(
|
||||
next_uninitialized,
|
||||
next_uninitialized + 1,
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => {
|
||||
return next_uninitialized;
|
||||
}
|
||||
Err(old) => {
|
||||
next_uninitialized = old;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// out of blocks
|
||||
INVALID_BLOCK
|
||||
}
|
||||
|
||||
// TODO: this is currently unused. The slab allocator never releases blocks
|
||||
#[allow(dead_code)]
|
||||
pub(crate) fn release_block(&self, block_ptr: *mut u8) {
|
||||
let blockno = unsafe { block_ptr.byte_offset_from(self.blocks_ptr) / BLOCK_SIZE as isize };
|
||||
self.release_block_internal(blockno as u64);
|
||||
}
|
||||
|
||||
fn release_block_internal(&self, blockno: u64) {
|
||||
let mut freelist_head = self.freelist_head.lock();
|
||||
if *freelist_head != INVALID_BLOCK {
|
||||
let freelist_block = self.read_freelist_block(*freelist_head);
|
||||
|
||||
// acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
|
||||
let mut g = freelist_block.inner.lock();
|
||||
|
||||
let num_free_blocks = g.num_free_blocks;
|
||||
if num_free_blocks < g.free_blocks.len() as u64 {
|
||||
g.free_blocks[num_free_blocks as usize] = blockno;
|
||||
g.num_free_blocks += 1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Convert the block into a new freelist block
|
||||
let block_ptr: *mut FreeListBlock = self.get_block_ptr(blockno).cast();
|
||||
let init = FreeListBlock {
|
||||
inner: spin::Mutex::new(FreeListBlockInner {
|
||||
next: *freelist_head,
|
||||
num_free_blocks: 0,
|
||||
free_blocks: [INVALID_BLOCK; 100],
|
||||
}),
|
||||
};
|
||||
unsafe { (*block_ptr) = init };
|
||||
*freelist_head = blockno;
|
||||
}
|
||||
|
||||
// for debugging
|
||||
pub(crate) fn get_statistics(&self) -> BlockAllocatorStats {
|
||||
let mut num_free_blocks = 0;
|
||||
|
||||
let mut _prev_lock = None;
|
||||
let head_lock = self.freelist_head.lock();
|
||||
let mut next_blk = *head_lock;
|
||||
let mut _head_lock = Some(head_lock);
|
||||
while next_blk != INVALID_BLOCK {
|
||||
let freelist_block = self.read_freelist_block(next_blk);
|
||||
let lock = freelist_block.inner.lock();
|
||||
num_free_blocks += lock.num_free_blocks;
|
||||
next_blk = lock.next;
|
||||
_prev_lock = Some(lock); // hold the lock until we've read the next block
|
||||
_head_lock = None;
|
||||
}
|
||||
|
||||
BlockAllocatorStats {
|
||||
num_blocks: self.num_blocks,
|
||||
num_initialized: self.num_initialized.load(Ordering::Relaxed),
|
||||
num_free_blocks,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct BlockAllocatorStats {
|
||||
pub num_blocks: u64,
|
||||
pub num_initialized: u64,
|
||||
pub num_free_blocks: u64,
|
||||
}
|
||||
@@ -1,33 +0,0 @@
|
||||
use std::alloc::Layout;
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
use crate::allocator::block::BlockAllocator;
|
||||
use crate::allocator::slab::SlabDesc;
|
||||
|
||||
pub struct MultiSlabAllocator<'t, const N: usize> {
|
||||
pub(crate) block_allocator: BlockAllocator<'t>,
|
||||
|
||||
pub(crate) slab_descs: [SlabDesc; N],
|
||||
}
|
||||
|
||||
impl<'t, const N: usize> MultiSlabAllocator<'t, N> {
|
||||
pub(crate) fn new(
|
||||
area: &'t mut [MaybeUninit<u8>],
|
||||
layouts: &[Layout; N],
|
||||
) -> MultiSlabAllocator<'t, N> {
|
||||
let block_allocator = BlockAllocator::new(area);
|
||||
MultiSlabAllocator {
|
||||
block_allocator,
|
||||
|
||||
slab_descs: std::array::from_fn(|i| SlabDesc::new(&layouts[i])),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn alloc_slab(&self, slab_idx: usize) -> *mut u8 {
|
||||
self.slab_descs[slab_idx].alloc_chunk(&self.block_allocator)
|
||||
}
|
||||
|
||||
pub(crate) fn dealloc_slab(&self, slab_idx: usize, ptr: *mut u8) {
|
||||
self.slab_descs[slab_idx].dealloc_chunk(ptr, &self.block_allocator)
|
||||
}
|
||||
}
|
||||
@@ -1,433 +0,0 @@
|
||||
//! A slab allocator that carves out fixed-size chunks from larger blocks.
|
||||
//!
|
||||
//!
|
||||
|
||||
use std::alloc::Layout;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::ops::Deref;
|
||||
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
|
||||
|
||||
use spin;
|
||||
|
||||
use super::alloc_from_slice;
|
||||
use super::block::BlockAllocator;
|
||||
|
||||
use crate::allocator::block::BLOCK_SIZE;
|
||||
|
||||
pub(crate) struct SlabDesc {
|
||||
pub(crate) layout: Layout,
|
||||
|
||||
block_lists: spin::RwLock<BlockLists>,
|
||||
|
||||
pub(crate) num_blocks: AtomicU64,
|
||||
pub(crate) num_allocated: AtomicU64,
|
||||
}
|
||||
|
||||
// FIXME: Not sure if SlabDesc is really Sync or Send. It probably is when it's empty, but
|
||||
// 'block_lists' contains pointers when it's not empty. In the current use as part of the
|
||||
// the art tree, SlabDescs are only moved during initialization.
|
||||
unsafe impl Sync for SlabDesc {}
|
||||
unsafe impl Send for SlabDesc {}
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
struct BlockLists {
|
||||
full_blocks: BlockList,
|
||||
nonfull_blocks: BlockList,
|
||||
}
|
||||
|
||||
impl BlockLists {
|
||||
// Unlink a node. It must be in either one of the two lists.
|
||||
unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
|
||||
let list = unsafe {
|
||||
if (*elem).next.is_null() {
|
||||
if self.full_blocks.tail == elem {
|
||||
Some(&mut self.full_blocks)
|
||||
} else {
|
||||
Some(&mut self.nonfull_blocks)
|
||||
}
|
||||
} else if (*elem).prev.is_null() {
|
||||
if self.full_blocks.head == elem {
|
||||
Some(&mut self.full_blocks)
|
||||
} else {
|
||||
Some(&mut self.nonfull_blocks)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
unsafe { unlink_slab_block(list, elem) };
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn unlink_slab_block(mut list: Option<&mut BlockList>, elem: *mut SlabBlockHeader) {
|
||||
unsafe {
|
||||
if (*elem).next.is_null() {
|
||||
assert_eq!(list.as_ref().unwrap().tail, elem);
|
||||
list.as_mut().unwrap().tail = (*elem).prev;
|
||||
} else {
|
||||
assert_eq!((*(*elem).next).prev, elem);
|
||||
(*(*elem).next).prev = (*elem).prev;
|
||||
}
|
||||
if (*elem).prev.is_null() {
|
||||
assert_eq!(list.as_ref().unwrap().head, elem);
|
||||
list.as_mut().unwrap().head = (*elem).next;
|
||||
} else {
|
||||
assert_eq!((*(*elem).prev).next, elem);
|
||||
(*(*elem).prev).next = (*elem).next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct BlockList {
|
||||
head: *mut SlabBlockHeader,
|
||||
tail: *mut SlabBlockHeader,
|
||||
}
|
||||
|
||||
impl Default for BlockList {
|
||||
fn default() -> Self {
|
||||
BlockList {
|
||||
head: std::ptr::null_mut(),
|
||||
tail: std::ptr::null_mut(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockList {
|
||||
unsafe fn push_head(&mut self, elem: *mut SlabBlockHeader) {
|
||||
unsafe {
|
||||
if self.is_empty() {
|
||||
self.tail = elem;
|
||||
(*elem).next = std::ptr::null_mut();
|
||||
} else {
|
||||
(*elem).next = self.head;
|
||||
(*self.head).prev = elem;
|
||||
}
|
||||
(*elem).prev = std::ptr::null_mut();
|
||||
self.head = elem;
|
||||
}
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.head.is_null()
|
||||
}
|
||||
|
||||
unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
|
||||
unsafe { unlink_slab_block(Some(self), elem) }
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn dump(&self) {
|
||||
let mut next = self.head;
|
||||
|
||||
while !next.is_null() {
|
||||
let n = unsafe { next.as_ref() }.unwrap();
|
||||
eprintln!(
|
||||
" blk {:?} (free {}/{})",
|
||||
next,
|
||||
n.num_free_chunks.load(Ordering::Relaxed),
|
||||
n.num_chunks
|
||||
);
|
||||
next = n.next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SlabDesc {
|
||||
pub(crate) fn new(layout: &Layout) -> SlabDesc {
|
||||
SlabDesc {
|
||||
layout: *layout,
|
||||
block_lists: spin::RwLock::new(BlockLists::default()),
|
||||
num_allocated: AtomicU64::new(0),
|
||||
num_blocks: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct SlabBlockHeader {
|
||||
free_chunks_head: spin::Mutex<*mut FreeChunk>,
|
||||
num_free_chunks: AtomicU32,
|
||||
num_chunks: u32, // this is really a constant for a given Layout
|
||||
|
||||
// these fields are protected by the lock on the BlockLists
|
||||
prev: *mut SlabBlockHeader,
|
||||
next: *mut SlabBlockHeader,
|
||||
}
|
||||
|
||||
struct FreeChunk {
|
||||
next: *mut FreeChunk,
|
||||
}
|
||||
|
||||
enum ReadOrWriteGuard<'a, T> {
|
||||
Read(spin::RwLockReadGuard<'a, T>),
|
||||
Write(spin::RwLockWriteGuard<'a, T>),
|
||||
}
|
||||
|
||||
impl<'a, T> Deref for ReadOrWriteGuard<'a, T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &<Self as Deref>::Target {
|
||||
match self {
|
||||
ReadOrWriteGuard::Read(g) => g.deref(),
|
||||
ReadOrWriteGuard::Write(g) => g.deref(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SlabDesc {
|
||||
pub fn alloc_chunk(&self, block_allocator: &BlockAllocator) -> *mut u8 {
|
||||
// Are there any free chunks?
|
||||
let mut acquire_write = false;
|
||||
'outer: loop {
|
||||
let mut block_lists_guard = if acquire_write {
|
||||
ReadOrWriteGuard::Write(self.block_lists.write())
|
||||
} else {
|
||||
ReadOrWriteGuard::Read(self.block_lists.read())
|
||||
};
|
||||
'inner: loop {
|
||||
let block_ptr = block_lists_guard.nonfull_blocks.head;
|
||||
if block_ptr.is_null() {
|
||||
break 'outer;
|
||||
}
|
||||
unsafe {
|
||||
let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
|
||||
if !(*free_chunks_head).is_null() {
|
||||
let result = *free_chunks_head;
|
||||
(*free_chunks_head) = (*result).next;
|
||||
let _old = (*block_ptr).num_free_chunks.fetch_sub(1, Ordering::Relaxed);
|
||||
|
||||
self.num_allocated.fetch_add(1, Ordering::Relaxed);
|
||||
return result.cast();
|
||||
}
|
||||
}
|
||||
|
||||
// The block at the head of the list was full. Grab write lock and retry
|
||||
match block_lists_guard {
|
||||
ReadOrWriteGuard::Read(_) => {
|
||||
acquire_write = true;
|
||||
continue 'outer;
|
||||
}
|
||||
ReadOrWriteGuard::Write(ref mut g) => {
|
||||
// move the node to the list of full blocks
|
||||
unsafe {
|
||||
g.nonfull_blocks.unlink(block_ptr);
|
||||
g.full_blocks.push_head(block_ptr);
|
||||
};
|
||||
continue 'inner;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// no free chunks. Allocate a new block (and the chunk from that)
|
||||
let (new_block, new_chunk) = self.alloc_block_and_chunk(block_allocator);
|
||||
self.num_blocks.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// Add the block to the list in the SlabDesc
|
||||
unsafe {
|
||||
let mut block_lists_guard = self.block_lists.write();
|
||||
block_lists_guard.nonfull_blocks.push_head(new_block);
|
||||
}
|
||||
self.num_allocated.fetch_add(1, Ordering::Relaxed);
|
||||
new_chunk
|
||||
}
|
||||
|
||||
pub fn dealloc_chunk(&self, chunk_ptr: *mut u8, _block_allocator: &BlockAllocator) {
|
||||
// Find the block it belongs to. You can find the block from the address. (And knowing the
|
||||
// layout, you could calculate the chunk number too.)
|
||||
let block_ptr: *mut SlabBlockHeader = {
|
||||
let block_addr = (chunk_ptr.addr() / BLOCK_SIZE) * BLOCK_SIZE;
|
||||
chunk_ptr.with_addr(block_addr).cast()
|
||||
};
|
||||
let chunk_ptr: *mut FreeChunk = chunk_ptr.cast();
|
||||
|
||||
// Mark the chunk as free in 'freechunks' list
|
||||
let num_chunks;
|
||||
let num_free_chunks;
|
||||
unsafe {
|
||||
let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
|
||||
(*chunk_ptr).next = *free_chunks_head;
|
||||
*free_chunks_head = chunk_ptr;
|
||||
|
||||
num_free_chunks = (*block_ptr).num_free_chunks.fetch_add(1, Ordering::Relaxed) + 1;
|
||||
num_chunks = (*block_ptr).num_chunks;
|
||||
}
|
||||
|
||||
if num_free_chunks == 1 {
|
||||
// If the block was full previously, add it to the nonfull blocks list. Note that
|
||||
// we're not holding the lock anymore, so it can immediately become full again.
|
||||
// That's harmless, it will be moved back to the full list again when a call
|
||||
// to alloc_chunk() sees it.
|
||||
let mut block_lists = self.block_lists.write();
|
||||
unsafe {
|
||||
block_lists.unlink(block_ptr);
|
||||
block_lists.nonfull_blocks.push_head(block_ptr);
|
||||
};
|
||||
} else if num_free_chunks == num_chunks {
|
||||
// If the block became completely empty, move it to the free list
|
||||
// TODO
|
||||
// FIXME: we're still holding the spinlock. It's not exactly safe to return it to
|
||||
// the free blocks list, is it? Defer it as garbage to wait out concurrent updates?
|
||||
//block_allocator.release_block()
|
||||
}
|
||||
|
||||
// update stats
|
||||
self.num_allocated.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn alloc_block_and_chunk(
|
||||
&self,
|
||||
block_allocator: &BlockAllocator,
|
||||
) -> (*mut SlabBlockHeader, *mut u8) {
|
||||
// fixme: handle OOM
|
||||
let block_slice: &mut [MaybeUninit<u8>] = block_allocator.alloc_block();
|
||||
let (block_header, remain) = alloc_from_slice::<SlabBlockHeader>(block_slice);
|
||||
|
||||
let padding = remain.as_ptr().align_offset(self.layout.align());
|
||||
|
||||
let num_chunks = (remain.len() - padding) / self.layout.size();
|
||||
|
||||
let first_chunk_ptr: *mut FreeChunk = remain[padding..].as_mut_ptr().cast();
|
||||
|
||||
unsafe {
|
||||
let mut chunk_ptr = first_chunk_ptr;
|
||||
for _ in 0..num_chunks - 1 {
|
||||
let next_chunk_ptr = chunk_ptr.byte_add(self.layout.size());
|
||||
(*chunk_ptr).next = next_chunk_ptr;
|
||||
chunk_ptr = next_chunk_ptr;
|
||||
}
|
||||
(*chunk_ptr).next = std::ptr::null_mut();
|
||||
|
||||
let result_chunk = first_chunk_ptr;
|
||||
|
||||
let block_header = block_header.write(SlabBlockHeader {
|
||||
free_chunks_head: spin::Mutex::new((*first_chunk_ptr).next),
|
||||
prev: std::ptr::null_mut(),
|
||||
next: std::ptr::null_mut(),
|
||||
num_chunks: num_chunks as u32,
|
||||
num_free_chunks: AtomicU32::new(num_chunks as u32 - 1),
|
||||
});
|
||||
|
||||
(block_header, result_chunk.cast())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn dump(&self) {
|
||||
eprintln!(
|
||||
"slab dump ({} blocks, {} allocated chunks)",
|
||||
self.num_blocks.load(Ordering::Relaxed),
|
||||
self.num_allocated.load(Ordering::Relaxed)
|
||||
);
|
||||
let lists = self.block_lists.read();
|
||||
|
||||
eprintln!("nonfull blocks:");
|
||||
lists.nonfull_blocks.dump();
|
||||
eprintln!("full blocks:");
|
||||
lists.full_blocks.dump();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use rand::Rng;
|
||||
use rand_distr::Zipf;
|
||||
|
||||
struct TestObject {
|
||||
val: usize,
|
||||
_dummy: [u8; BLOCK_SIZE / 4],
|
||||
}
|
||||
|
||||
struct TestObjectSlab<'a>(SlabDesc, BlockAllocator<'a>);
|
||||
impl<'a> TestObjectSlab<'a> {
|
||||
fn new(block_allocator: BlockAllocator) -> TestObjectSlab {
|
||||
TestObjectSlab(SlabDesc::new(&Layout::new::<TestObject>()), block_allocator)
|
||||
}
|
||||
|
||||
fn alloc(&self, val: usize) -> *mut TestObject {
|
||||
let obj: *mut TestObject = self.0.alloc_chunk(&self.1).cast();
|
||||
unsafe { (*obj).val = val };
|
||||
obj
|
||||
}
|
||||
|
||||
fn dealloc(&self, obj: *mut TestObject) {
|
||||
self.0.dealloc_chunk(obj.cast(), &self.1)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_slab_alloc() {
|
||||
const MEM_SIZE: usize = 100000000;
|
||||
let mut area = Box::new_uninit_slice(MEM_SIZE);
|
||||
let block_allocator = BlockAllocator::new(&mut area);
|
||||
|
||||
let slab = TestObjectSlab::new(block_allocator);
|
||||
|
||||
let mut all: Vec<*mut TestObject> = Vec::new();
|
||||
for i in 0..11 {
|
||||
all.push(slab.alloc(i));
|
||||
}
|
||||
#[allow(clippy::needless_range_loop)]
|
||||
for i in 0..11 {
|
||||
assert!(unsafe { (*all[i]).val == i });
|
||||
}
|
||||
|
||||
let distribution = Zipf::new(10.0, 1.1).unwrap();
|
||||
let mut rng = rand::rng();
|
||||
for _ in 0..100000 {
|
||||
slab.0.dump();
|
||||
let idx = rng.sample(distribution) as usize;
|
||||
let ptr: *mut TestObject = all[idx];
|
||||
if !ptr.is_null() {
|
||||
assert_eq!(unsafe { (*ptr).val }, idx);
|
||||
slab.dealloc(ptr);
|
||||
all[idx] = std::ptr::null_mut();
|
||||
} else {
|
||||
all[idx] = slab.alloc(idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn new_test_blk(i: u32) -> *mut SlabBlockHeader {
|
||||
Box::into_raw(Box::new(SlabBlockHeader {
|
||||
free_chunks_head: spin::Mutex::new(std::ptr::null_mut()),
|
||||
num_free_chunks: AtomicU32::new(0),
|
||||
num_chunks: i,
|
||||
prev: std::ptr::null_mut(),
|
||||
next: std::ptr::null_mut(),
|
||||
}))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_linked_list() {
|
||||
// note: these are leaked, but that's OK for tests
|
||||
let a = new_test_blk(0);
|
||||
let b = new_test_blk(1);
|
||||
|
||||
let mut list = BlockList::default();
|
||||
assert!(list.is_empty());
|
||||
|
||||
unsafe {
|
||||
list.push_head(a);
|
||||
assert!(!list.is_empty());
|
||||
list.unlink(a);
|
||||
}
|
||||
assert!(list.is_empty());
|
||||
|
||||
unsafe {
|
||||
list.push_head(b);
|
||||
list.push_head(a);
|
||||
assert_eq!(list.head, a);
|
||||
assert_eq!((*a).next, b);
|
||||
assert_eq!((*b).prev, a);
|
||||
assert_eq!(list.tail, b);
|
||||
|
||||
list.unlink(a);
|
||||
list.unlink(b);
|
||||
assert!(list.is_empty());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,44 +0,0 @@
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
pub fn alloc_from_slice<T>(
|
||||
area: &mut [MaybeUninit<u8>],
|
||||
) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
|
||||
let layout = std::alloc::Layout::new::<T>();
|
||||
|
||||
let area_start = area.as_mut_ptr();
|
||||
|
||||
// pad to satisfy alignment requirements
|
||||
let padding = area_start.align_offset(layout.align());
|
||||
if padding + layout.size() > area.len() {
|
||||
panic!("out of memory");
|
||||
}
|
||||
let area = &mut area[padding..];
|
||||
let (result_area, remain) = area.split_at_mut(layout.size());
|
||||
|
||||
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
|
||||
let result = unsafe { result_ptr.as_mut().unwrap() };
|
||||
|
||||
(result, remain)
|
||||
}
|
||||
|
||||
pub fn alloc_array_from_slice<T>(
|
||||
area: &mut [MaybeUninit<u8>],
|
||||
len: usize,
|
||||
) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
|
||||
let layout = std::alloc::Layout::new::<T>();
|
||||
|
||||
let area_start = area.as_mut_ptr();
|
||||
|
||||
// pad to satisfy alignment requirements
|
||||
let padding = area_start.align_offset(layout.align());
|
||||
if padding + layout.size() * len > area.len() {
|
||||
panic!("out of memory");
|
||||
}
|
||||
let area = &mut area[padding..];
|
||||
let (result_area, remain) = area.split_at_mut(layout.size() * len);
|
||||
|
||||
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
|
||||
let result = unsafe { std::slice::from_raw_parts_mut(result_ptr.as_mut().unwrap(), len) };
|
||||
|
||||
(result, remain)
|
||||
}
|
||||
@@ -1,142 +0,0 @@
|
||||
//! This is similar to crossbeam_epoch crate, but works in shared memory
|
||||
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
|
||||
|
||||
use crossbeam_utils::CachePadded;
|
||||
|
||||
const NUM_SLOTS: usize = 1000;
|
||||
|
||||
/// This is the struct that is stored in shmem
|
||||
///
|
||||
/// bit 0: is it pinned or not?
|
||||
/// rest of the bits are the epoch counter.
|
||||
pub struct EpochShared {
|
||||
global_epoch: AtomicU64,
|
||||
participants: [CachePadded<AtomicU64>; NUM_SLOTS],
|
||||
|
||||
broadcast_lock: spin::Mutex<()>,
|
||||
}
|
||||
|
||||
impl EpochShared {
|
||||
pub fn new() -> EpochShared {
|
||||
EpochShared {
|
||||
global_epoch: AtomicU64::new(2),
|
||||
participants: [const { CachePadded::new(AtomicU64::new(2)) }; NUM_SLOTS],
|
||||
broadcast_lock: spin::Mutex::new(()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn register(&self) -> LocalHandle {
|
||||
LocalHandle {
|
||||
global: self,
|
||||
last_slot: AtomicUsize::new(0), // todo: choose more intelligently
|
||||
}
|
||||
}
|
||||
|
||||
fn release_pin(&self, slot: usize, _epoch: u64) {
|
||||
let global_epoch = self.global_epoch.load(Ordering::Relaxed);
|
||||
self.participants[slot].store(global_epoch, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn pin_internal(&self, slot_hint: usize) -> (usize, u64) {
|
||||
// pick a slot
|
||||
let mut slot = slot_hint;
|
||||
let epoch = loop {
|
||||
let old = self.participants[slot].fetch_or(1, Ordering::Relaxed);
|
||||
if old & 1 == 0 {
|
||||
// Got this slot
|
||||
break old;
|
||||
}
|
||||
|
||||
// the slot was busy by another thread / process. try a different slot
|
||||
slot += 1;
|
||||
if slot == NUM_SLOTS {
|
||||
slot = 0;
|
||||
}
|
||||
continue;
|
||||
};
|
||||
(slot, epoch)
|
||||
}
|
||||
|
||||
pub(crate) fn advance(&self) -> u64 {
|
||||
// Advance the global epoch
|
||||
let old_epoch = self.global_epoch.fetch_add(2, Ordering::Relaxed);
|
||||
// Anyone that release their pin after this will update their slot.
|
||||
old_epoch + 2
|
||||
}
|
||||
|
||||
pub(crate) fn broadcast(&self) {
|
||||
let Some(_guard) = self.broadcast_lock.try_lock() else {
|
||||
return;
|
||||
};
|
||||
|
||||
let epoch = self.global_epoch.load(Ordering::Relaxed);
|
||||
let old_epoch = epoch.wrapping_sub(2);
|
||||
|
||||
// Update all free slots.
|
||||
for i in 0..NUM_SLOTS {
|
||||
// TODO: check result, as a sanity check. It should either be the old epoch, or pinned
|
||||
let _ = self.participants[i].compare_exchange(
|
||||
old_epoch,
|
||||
epoch,
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
}
|
||||
|
||||
// FIXME: memory fence here, since we used Relaxed?
|
||||
}
|
||||
|
||||
pub(crate) fn get_oldest(&self) -> u64 {
|
||||
// Read all slots.
|
||||
let now = self.global_epoch.load(Ordering::Relaxed);
|
||||
let mut oldest = now;
|
||||
for i in 0..NUM_SLOTS {
|
||||
let this_epoch = self.participants[i].load(Ordering::Relaxed);
|
||||
let delta = now.wrapping_sub(this_epoch);
|
||||
if delta > u64::MAX / 2 {
|
||||
// this is very recent
|
||||
} else if delta > now.wrapping_sub(oldest) {
|
||||
oldest = this_epoch;
|
||||
}
|
||||
}
|
||||
oldest
|
||||
}
|
||||
|
||||
pub(crate) fn get_current(&self) -> u64 {
|
||||
self.global_epoch.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct EpochPin<'e> {
|
||||
slot: usize,
|
||||
pub(crate) epoch: u64,
|
||||
|
||||
handle: &'e LocalHandle<'e>,
|
||||
}
|
||||
|
||||
impl<'e> Drop for EpochPin<'e> {
|
||||
fn drop(&mut self) {
|
||||
self.handle.global.release_pin(self.slot, self.epoch);
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LocalHandle<'g> {
|
||||
global: &'g EpochShared,
|
||||
|
||||
last_slot: AtomicUsize,
|
||||
}
|
||||
|
||||
impl<'g> LocalHandle<'g> {
|
||||
pub fn pin(&self) -> EpochPin {
|
||||
let (slot, epoch) = self
|
||||
.global
|
||||
.pin_internal(self.last_slot.load(Ordering::Relaxed));
|
||||
self.last_slot.store(slot, Ordering::Relaxed);
|
||||
EpochPin {
|
||||
handle: self,
|
||||
epoch,
|
||||
slot,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,583 +0,0 @@
|
||||
//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
|
||||
//!
|
||||
//! The data structure is described in these two papers:
|
||||
//!
|
||||
//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
|
||||
//! The adaptive radix tree: ARTful indexing for main-memory databases.
|
||||
//! Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
|
||||
//! https://db.in.tum.de/~leis/papers/ART.pdf
|
||||
//!
|
||||
//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
|
||||
//! The ART of practical synchronization.
|
||||
//! 1-8. 10.1145/2933349.2933352.
|
||||
//! https://db.in.tum.de/~leis/papers/artsync.pdf
|
||||
//!
|
||||
//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
|
||||
//! use.
|
||||
//!
|
||||
//! The papers mention a few different variants. We have made the following choices in this
|
||||
//! implementation:
|
||||
//!
|
||||
//! - All keys have the same length
|
||||
//!
|
||||
//! - Single-value leaves.
|
||||
//!
|
||||
//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
|
||||
//! variable length "prefix", which stores the keys of all the one-way nodes which have been
|
||||
//! removed. However, similar to the "hybrid" approach described in the paper, each node only has
|
||||
//! space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
|
||||
//! create create one-way nodes to store them. (There was no particular reason for this choice,
|
||||
//! the "hybrid" approach described in the paper might be better.)
|
||||
//!
|
||||
//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
|
||||
//! ROWEX, which generally performs better when there is contention, but that is not important
|
||||
//! for use and Optimisic Lock Coupling is simpler to implement.
|
||||
//!
|
||||
//! ## Requirements
|
||||
//!
|
||||
//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
|
||||
//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
|
||||
//! requirements, which is why we had to write our own. Namely:
|
||||
//!
|
||||
//! - The data structure has to live in fixed-sized shared memory segment. That rules out any
|
||||
//! built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust
|
||||
//! feature, which still nightly-only experimental as of this writing).
|
||||
//!
|
||||
//! - The data structure is accessed from multiple processes. Only one process updates the data
|
||||
//! structure, but other processes perform reads. That rules out using built-in Rust locking
|
||||
//! primitives like Mutex and RwLock, and most crates too.
|
||||
//!
|
||||
//! - Within the one process with write-access, multiple threads can perform updates concurrently.
|
||||
//! That rules out using PostgreSQL LWLocks for the locking.
|
||||
//!
|
||||
//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
|
||||
//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
|
||||
//!
|
||||
//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
|
||||
//! locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
|
||||
//! read / write the same page at the same time. (Prefetching can conflict with actual reads,
|
||||
//! however.)
|
||||
//!
|
||||
//! - The keys in the integrated cache are 17 bytes long.
|
||||
//!
|
||||
//! ## Usage
|
||||
//!
|
||||
//! Because this is designed to be used as a Postgres shared memory data structure, initialization
|
||||
//! happens in three stages:
|
||||
//!
|
||||
//! 0. A fixed area of shared memory is allocated at postmaster startup.
|
||||
//!
|
||||
//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
|
||||
//! other process or thread is running. It returns a TreeInitStruct, which is inherited by all
|
||||
//! the processes through fork().
|
||||
//!
|
||||
//! 2. One process may have write-access to the struct, by calling
|
||||
//! [TreeInitStruct::attach_writer]. (That process is the communicator process.)
|
||||
//!
|
||||
//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
|
||||
//!
|
||||
//! "Write access" means that you can insert / update / delete values in the tree.
|
||||
//!
|
||||
//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
|
||||
//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
|
||||
//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
|
||||
//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
|
||||
//! problem, the version check could be passed up to the caller, so that the caller could detect the
|
||||
//! lost updates and retry the operation.
|
||||
//!
|
||||
//! ## Implementation
|
||||
//!
|
||||
//! node_ptr: Provides low-level implementations of the four different node types (eight actually,
|
||||
//! since there is an Internal and Leaf variant of each)
|
||||
//!
|
||||
//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
|
||||
//! node.
|
||||
//!
|
||||
//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
|
||||
//! abstractions on top.
|
||||
//!
|
||||
//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
|
||||
//!
|
||||
//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
|
||||
//! own abstraction for that because we need the data structure to live in a pre-allocated shared
|
||||
//! memory segment).
|
||||
//!
|
||||
//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
|
||||
//! immediately deallocated, but stays around for as long as concurrent readers might still have
|
||||
//! pointers to them. This is enforced by an epoch system. This is similar to
|
||||
//! e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
|
||||
//! communicating over the shared memory segment.
|
||||
//!
|
||||
//! ## See also
|
||||
//!
|
||||
//! There are some existing Rust ART implementations out there, but none of them filled all
|
||||
//! the requirements:
|
||||
//!
|
||||
//! - https://github.com/XiangpengHao/congee
|
||||
//! - https://github.com/declanvk/blart
|
||||
//!
|
||||
//! ## TODO
|
||||
//!
|
||||
//! - Removing values has not been implemented
|
||||
|
||||
mod algorithm;
|
||||
pub mod allocator;
|
||||
mod epoch;
|
||||
|
||||
use algorithm::RootPtr;
|
||||
use algorithm::node_ptr::NodePtr;
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
|
||||
use crate::epoch::EpochPin;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use allocator::ArtAllocator;
|
||||
pub use allocator::ArtMultiSlabAllocator;
|
||||
pub use allocator::OutOfMemoryError;
|
||||
|
||||
/// Fixed-length key type.
|
||||
///
|
||||
pub trait Key: Debug {
|
||||
const KEY_LEN: usize;
|
||||
|
||||
fn as_bytes(&self) -> &[u8];
|
||||
}
|
||||
|
||||
/// Values stored in the tree
|
||||
///
|
||||
/// Values need to be Cloneable, because when a node "grows", the value is copied to a new node and
|
||||
/// the old sticks around until all readers that might see the old value are gone.
|
||||
// fixme obsolete, no longer needs Clone
|
||||
pub trait Value {}
|
||||
|
||||
const MAX_GARBAGE: usize = 1024;
|
||||
|
||||
/// The root of the tree, plus other tree-wide data. This is stored in the shared memory.
|
||||
pub struct Tree<V: Value> {
|
||||
/// For simplicity, so that we never need to grow or shrink the root, the root node is always an
|
||||
/// Internal256 node. Also, it never has a prefix (that's actually a bit wasteful, incurring one
|
||||
/// indirection to every lookup)
|
||||
root: RootPtr<V>,
|
||||
|
||||
writer_attached: AtomicBool,
|
||||
|
||||
epoch: epoch::EpochShared,
|
||||
}
|
||||
|
||||
unsafe impl<V: Value + Sync> Sync for Tree<V> {}
|
||||
unsafe impl<V: Value + Send> Send for Tree<V> {}
|
||||
|
||||
struct GarbageQueue<V>(VecDeque<(NodePtr<V>, u64)>);
|
||||
|
||||
unsafe impl<V: Value + Sync> Sync for GarbageQueue<V> {}
|
||||
unsafe impl<V: Value + Send> Send for GarbageQueue<V> {}
|
||||
|
||||
impl<V> GarbageQueue<V> {
|
||||
fn new() -> GarbageQueue<V> {
|
||||
GarbageQueue(VecDeque::with_capacity(MAX_GARBAGE))
|
||||
}
|
||||
|
||||
fn remember_obsolete_node(&mut self, ptr: NodePtr<V>, epoch: u64) {
|
||||
self.0.push_front((ptr, epoch));
|
||||
}
|
||||
|
||||
fn next_obsolete(&mut self, cutoff_epoch: u64) -> Option<NodePtr<V>> {
|
||||
if let Some(back) = self.0.back() {
|
||||
if back.1 < cutoff_epoch {
|
||||
return Some(self.0.pop_back().unwrap().0);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Struct created at postmaster startup
|
||||
pub struct TreeInitStruct<'t, K: Key, V: Value, A: ArtAllocator<V>> {
|
||||
tree: &'t Tree<V>,
|
||||
|
||||
allocator: &'t A,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
/// The worker process has a reference to this. The write operations are only safe
|
||||
/// from the worker process
|
||||
pub struct TreeWriteAccess<'t, K: Key, V: Value, A: ArtAllocator<V>>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'t Tree<V>,
|
||||
|
||||
pub allocator: &'t A,
|
||||
|
||||
epoch_handle: epoch::LocalHandle<'t>,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
|
||||
/// Obsolete nodes that cannot be recycled until their epoch expires.
|
||||
garbage: spin::Mutex<GarbageQueue<V>>,
|
||||
}
|
||||
|
||||
/// The backends have a reference to this. It cannot be used to modify the tree
|
||||
pub struct TreeReadAccess<'t, K: Key, V: Value>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'t Tree<V>,
|
||||
|
||||
epoch_handle: epoch::LocalHandle<'t>,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeInitStruct<'t, K, V, A> {
|
||||
pub fn new(allocator: &'t A) -> TreeInitStruct<'t, K, V, A> {
|
||||
let tree_ptr = allocator.alloc_tree();
|
||||
let tree_ptr = NonNull::new(tree_ptr).expect("out of memory");
|
||||
let init = Tree {
|
||||
root: algorithm::new_root(allocator).expect("out of memory"),
|
||||
writer_attached: AtomicBool::new(false),
|
||||
epoch: epoch::EpochShared::new(),
|
||||
};
|
||||
unsafe { tree_ptr.write(init) };
|
||||
|
||||
TreeInitStruct {
|
||||
tree: unsafe { tree_ptr.as_ref() },
|
||||
allocator,
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V, A> {
|
||||
let previously_attached = self.tree.writer_attached.swap(true, Ordering::Relaxed);
|
||||
if previously_attached {
|
||||
panic!("writer already attached");
|
||||
}
|
||||
TreeWriteAccess {
|
||||
tree: self.tree,
|
||||
allocator: self.allocator,
|
||||
phantom_key: PhantomData,
|
||||
epoch_handle: self.tree.epoch.register(),
|
||||
garbage: spin::Mutex::new(GarbageQueue::new()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
|
||||
TreeReadAccess {
|
||||
tree: self.tree,
|
||||
phantom_key: PhantomData,
|
||||
epoch_handle: self.tree.epoch.register(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteAccess<'t, K, V, A> {
|
||||
pub fn start_write<'g>(&'t self) -> TreeWriteGuard<'g, K, V, A>
|
||||
where
|
||||
't: 'g,
|
||||
{
|
||||
TreeWriteGuard {
|
||||
tree_writer: self,
|
||||
epoch_pin: self.epoch_handle.pin(),
|
||||
phantom_key: PhantomData,
|
||||
created_garbage: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
|
||||
TreeReadGuard {
|
||||
tree: self.tree,
|
||||
epoch_pin: self.epoch_handle.pin(),
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, K: Key, V: Value> TreeReadAccess<'t, K, V> {
|
||||
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
|
||||
TreeReadGuard {
|
||||
tree: self.tree,
|
||||
epoch_pin: self.epoch_handle.pin(),
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeReadGuard<'e, K, V>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'e Tree<V>,
|
||||
|
||||
epoch_pin: EpochPin<'e>,
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
impl<'e, K: Key, V: Value> TreeReadGuard<'e, K, V> {
|
||||
pub fn get(&'e self, key: &K) -> Option<&'e V> {
|
||||
algorithm::search(key, self.tree.root, &self.epoch_pin)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeWriteGuard<'e, K, V, A>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
tree_writer: &'e TreeWriteAccess<'e, K, V, A>,
|
||||
|
||||
epoch_pin: EpochPin<'e>,
|
||||
phantom_key: PhantomData<K>,
|
||||
|
||||
created_garbage: bool,
|
||||
}
|
||||
|
||||
pub enum UpdateAction<V> {
|
||||
Nothing,
|
||||
Insert(V),
|
||||
Remove,
|
||||
}
|
||||
|
||||
impl<'e, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
|
||||
/// Get a value
|
||||
pub fn get(&'e mut self, key: &K) -> Option<&'e V> {
|
||||
algorithm::search(key, self.tree_writer.tree.root, &self.epoch_pin)
|
||||
}
|
||||
|
||||
/// Insert a value
|
||||
pub fn insert(self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
|
||||
let mut success = None;
|
||||
|
||||
self.update_with_fn(key, |existing| {
|
||||
if existing.is_some() {
|
||||
success = Some(false);
|
||||
UpdateAction::Nothing
|
||||
} else {
|
||||
success = Some(true);
|
||||
UpdateAction::Insert(value)
|
||||
}
|
||||
})?;
|
||||
Ok(success.expect("value_fn not called"))
|
||||
}
|
||||
|
||||
/// Remove value. Returns true if it existed
|
||||
pub fn remove(self, key: &K) -> bool {
|
||||
let mut result = false;
|
||||
// FIXME: It's not clear if OOM is expected while removing. It seems
|
||||
// not nice, but shrinking a node can OOM. Then again, we could opt
|
||||
// to not shrink a node if we cannot allocate, to live a little longer.
|
||||
self.update_with_fn(key, |existing| match existing {
|
||||
Some(_) => {
|
||||
result = true;
|
||||
UpdateAction::Remove
|
||||
}
|
||||
None => UpdateAction::Nothing,
|
||||
})
|
||||
.expect("out of memory while removing");
|
||||
result
|
||||
}
|
||||
|
||||
/// Try to remove value and return the old value.
|
||||
pub fn remove_and_return(self, key: &K) -> Option<V>
|
||||
where
|
||||
V: Clone,
|
||||
{
|
||||
let mut old = None;
|
||||
self.update_with_fn(key, |existing| {
|
||||
old = existing.cloned();
|
||||
UpdateAction::Remove
|
||||
})
|
||||
.expect("out of memory while removing");
|
||||
old
|
||||
}
|
||||
|
||||
/// Update key using the given function. All the other modifying operations are based on this.
|
||||
///
|
||||
/// The function is passed a reference to the existing value, if any. If the function
|
||||
/// returns None, the value is removed from the tree (or if there was no existing value,
|
||||
/// does nothing). If the function returns Some, the existing value is replaced, of if there
|
||||
/// was no existing value, it is inserted. FIXME: update comment
|
||||
pub fn update_with_fn<F>(mut self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
algorithm::update_fn(key, value_fn, self.tree_writer.tree.root, &mut self)?;
|
||||
|
||||
if self.created_garbage {
|
||||
let _ = self.collect_garbage();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn remember_obsolete_node(&mut self, ptr: NodePtr<V>) {
|
||||
self.tree_writer
|
||||
.garbage
|
||||
.lock()
|
||||
.remember_obsolete_node(ptr, self.epoch_pin.epoch);
|
||||
self.created_garbage = true;
|
||||
}
|
||||
|
||||
// returns number of nodes recycled
|
||||
fn collect_garbage(&self) -> usize {
|
||||
self.tree_writer.tree.epoch.advance();
|
||||
self.tree_writer.tree.epoch.broadcast();
|
||||
|
||||
let cutoff_epoch = self.tree_writer.tree.epoch.get_oldest();
|
||||
|
||||
let mut result = 0;
|
||||
let mut garbage_queue = self.tree_writer.garbage.lock();
|
||||
while let Some(ptr) = garbage_queue.next_obsolete(cutoff_epoch) {
|
||||
ptr.deallocate(self.tree_writer.allocator);
|
||||
result += 1;
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeIterator<K>
|
||||
where
|
||||
K: Key + for<'a> From<&'a [u8]>,
|
||||
{
|
||||
done: bool,
|
||||
pub next_key: Vec<u8>,
|
||||
max_key: Option<Vec<u8>>,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
impl<K> TreeIterator<K>
|
||||
where
|
||||
K: Key + for<'a> From<&'a [u8]>,
|
||||
{
|
||||
pub fn new_wrapping() -> TreeIterator<K> {
|
||||
TreeIterator {
|
||||
done: false,
|
||||
next_key: vec![0; K::KEY_LEN],
|
||||
max_key: None,
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(range: &std::ops::Range<K>) -> TreeIterator<K> {
|
||||
let result = TreeIterator {
|
||||
done: false,
|
||||
next_key: Vec::from(range.start.as_bytes()),
|
||||
max_key: Some(Vec::from(range.end.as_bytes())),
|
||||
phantom_key: PhantomData,
|
||||
};
|
||||
assert_eq!(result.next_key.len(), K::KEY_LEN);
|
||||
assert_eq!(result.max_key.as_ref().unwrap().len(), K::KEY_LEN);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
pub fn next<'g, V>(&mut self, read_guard: &'g TreeReadGuard<'g, K, V>) -> Option<(K, &'g V)>
|
||||
where
|
||||
V: Value,
|
||||
{
|
||||
if self.done {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut wrapped_around = false;
|
||||
loop {
|
||||
assert_eq!(self.next_key.len(), K::KEY_LEN);
|
||||
if let Some((k, v)) =
|
||||
algorithm::iter_next(&self.next_key, read_guard.tree.root, &read_guard.epoch_pin)
|
||||
{
|
||||
assert_eq!(k.len(), K::KEY_LEN);
|
||||
assert_eq!(self.next_key.len(), K::KEY_LEN);
|
||||
|
||||
// Check if we reached the end of the range
|
||||
if let Some(max_key) = &self.max_key {
|
||||
if k.as_slice() >= max_key.as_slice() {
|
||||
self.done = true;
|
||||
break None;
|
||||
}
|
||||
}
|
||||
|
||||
// increment the key
|
||||
self.next_key = k.clone();
|
||||
increment_key(self.next_key.as_mut_slice());
|
||||
let k = k.as_slice().into();
|
||||
|
||||
break Some((k, v));
|
||||
} else {
|
||||
if self.max_key.is_some() {
|
||||
self.done = true;
|
||||
} else {
|
||||
// Start from beginning
|
||||
if !wrapped_around {
|
||||
for i in 0..K::KEY_LEN {
|
||||
self.next_key[i] = 0;
|
||||
}
|
||||
wrapped_around = true;
|
||||
continue;
|
||||
} else {
|
||||
// The tree is completely empty
|
||||
// FIXME: perhaps we should remember the starting point instead.
|
||||
// Currently this will scan some ranges twice.
|
||||
break None;
|
||||
}
|
||||
}
|
||||
break None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn increment_key(key: &mut [u8]) -> bool {
|
||||
for i in (0..key.len()).rev() {
|
||||
let (byte, overflow) = key[i].overflowing_add(1);
|
||||
key[i] = byte;
|
||||
if !overflow {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
// Debugging functions
|
||||
impl<'e, K: Key, V: Value + Debug, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
|
||||
pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
|
||||
algorithm::dump_tree(self.tree_writer.tree.root, &self.epoch_pin, dst)
|
||||
}
|
||||
}
|
||||
impl<'e, K: Key, V: Value + Debug> TreeReadGuard<'e, K, V> {
|
||||
pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
|
||||
algorithm::dump_tree(self.tree.root, &self.epoch_pin, dst)
|
||||
}
|
||||
}
|
||||
impl<'e, K: Key, V: Value> TreeWriteAccess<'e, K, V, ArtMultiSlabAllocator<'e, V>> {
|
||||
pub fn get_statistics(&self) -> ArtTreeStatistics {
|
||||
self.allocator.get_statistics();
|
||||
ArtTreeStatistics {
|
||||
blocks: self.allocator.inner.block_allocator.get_statistics(),
|
||||
slabs: self.allocator.get_statistics(),
|
||||
epoch: self.tree.epoch.get_current(),
|
||||
oldest_epoch: self.tree.epoch.get_oldest(),
|
||||
num_garbage: self.garbage.lock().0.len() as u64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ArtTreeStatistics {
|
||||
pub blocks: allocator::block::BlockAllocatorStats,
|
||||
pub slabs: allocator::ArtMultiSlabStats,
|
||||
|
||||
pub epoch: u64,
|
||||
pub oldest_epoch: u64,
|
||||
pub num_garbage: u64,
|
||||
}
|
||||
@@ -1,236 +0,0 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use crate::ArtAllocator;
|
||||
use crate::ArtMultiSlabAllocator;
|
||||
use crate::TreeInitStruct;
|
||||
use crate::TreeIterator;
|
||||
use crate::TreeWriteAccess;
|
||||
use crate::UpdateAction;
|
||||
|
||||
use crate::{Key, Value};
|
||||
|
||||
use rand::Rng;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand_distr::Zipf;
|
||||
|
||||
const TEST_KEY_LEN: usize = 16;
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
struct TestKey([u8; TEST_KEY_LEN]);
|
||||
|
||||
impl TestKey {
|
||||
const MIN: TestKey = TestKey([0; TEST_KEY_LEN]);
|
||||
const MAX: TestKey = TestKey([u8::MAX; TEST_KEY_LEN]);
|
||||
}
|
||||
|
||||
impl Key for TestKey {
|
||||
const KEY_LEN: usize = TEST_KEY_LEN;
|
||||
fn as_bytes(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&TestKey> for u128 {
|
||||
fn from(val: &TestKey) -> u128 {
|
||||
u128::from_be_bytes(val.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u128> for TestKey {
|
||||
fn from(val: u128) -> TestKey {
|
||||
TestKey(val.to_be_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u8]> for TestKey {
|
||||
fn from(bytes: &'a [u8]) -> TestKey {
|
||||
TestKey(bytes.try_into().unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
impl Value for usize {}
|
||||
|
||||
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
|
||||
const MEM_SIZE: usize = 10000000;
|
||||
let mut area = Box::new_uninit_slice(MEM_SIZE);
|
||||
|
||||
let allocator = ArtMultiSlabAllocator::new(&mut area);
|
||||
|
||||
let init_struct = TreeInitStruct::<TestKey, usize, _>::new(allocator);
|
||||
let tree_writer = init_struct.attach_writer();
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let w = tree_writer.start_write();
|
||||
let res = w.insert(&(*k).into(), idx);
|
||||
assert!(res.is_ok());
|
||||
}
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let r = tree_writer.start_read();
|
||||
let value = r.get(&(*k).into());
|
||||
assert_eq!(value, Some(idx).as_ref());
|
||||
}
|
||||
|
||||
eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dense() {
|
||||
// This exercises splitting a node with prefix
|
||||
let keys: &[u128] = &[0, 1, 2, 3, 256];
|
||||
test_inserts(keys);
|
||||
|
||||
// Dense keys
|
||||
let mut keys: Vec<u128> = (0..10000).collect();
|
||||
test_inserts(&keys);
|
||||
|
||||
// Do the same in random orders
|
||||
for _ in 1..10 {
|
||||
keys.shuffle(&mut rand::rng());
|
||||
test_inserts(&keys);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sparse() {
|
||||
// sparse keys
|
||||
let mut keys: Vec<TestKey> = Vec::new();
|
||||
let mut used_keys = HashSet::new();
|
||||
for _ in 0..10000 {
|
||||
loop {
|
||||
let key = rand::random::<u128>();
|
||||
if used_keys.contains(&key) {
|
||||
continue;
|
||||
}
|
||||
used_keys.insert(key);
|
||||
keys.push(key.into());
|
||||
break;
|
||||
}
|
||||
}
|
||||
test_inserts(&keys);
|
||||
}
|
||||
|
||||
struct TestValue(AtomicUsize);
|
||||
|
||||
impl TestValue {
|
||||
fn new(val: usize) -> TestValue {
|
||||
TestValue(AtomicUsize::new(val))
|
||||
}
|
||||
|
||||
fn load(&self) -> usize {
|
||||
self.0.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
impl Value for TestValue {}
|
||||
|
||||
impl Clone for TestValue {
|
||||
fn clone(&self) -> TestValue {
|
||||
TestValue::new(self.load())
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for TestValue {
|
||||
fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
write!(fmt, "{:?}", self.load())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct TestOp(TestKey, Option<usize>);
|
||||
|
||||
fn apply_op<A: ArtAllocator<TestValue>>(
|
||||
op: &TestOp,
|
||||
tree: &TreeWriteAccess<TestKey, TestValue, A>,
|
||||
shadow: &mut BTreeMap<TestKey, usize>,
|
||||
) {
|
||||
eprintln!("applying op: {op:?}");
|
||||
|
||||
// apply the change to the shadow tree first
|
||||
let shadow_existing = if let Some(v) = op.1 {
|
||||
shadow.insert(op.0, v)
|
||||
} else {
|
||||
shadow.remove(&op.0)
|
||||
};
|
||||
|
||||
// apply to Art tree
|
||||
let w = tree.start_write();
|
||||
w.update_with_fn(&op.0, |existing| {
|
||||
assert_eq!(existing.map(TestValue::load), shadow_existing);
|
||||
|
||||
match (existing, op.1) {
|
||||
(None, None) => UpdateAction::Nothing,
|
||||
(None, Some(new_val)) => UpdateAction::Insert(TestValue::new(new_val)),
|
||||
(Some(_old_val), None) => UpdateAction::Remove,
|
||||
(Some(old_val), Some(new_val)) => {
|
||||
old_val.0.store(new_val, Ordering::Relaxed);
|
||||
UpdateAction::Nothing
|
||||
}
|
||||
}
|
||||
})
|
||||
.expect("out of memory");
|
||||
}
|
||||
|
||||
fn test_iter<A: ArtAllocator<TestValue>>(
|
||||
tree: &TreeWriteAccess<TestKey, TestValue, A>,
|
||||
shadow: &BTreeMap<TestKey, usize>,
|
||||
) {
|
||||
let mut shadow_iter = shadow.iter();
|
||||
let mut iter = TreeIterator::new(&(TestKey::MIN..TestKey::MAX));
|
||||
|
||||
loop {
|
||||
let shadow_item = shadow_iter.next().map(|(k, v)| (*k, *v));
|
||||
let r = tree.start_read();
|
||||
let item = iter.next(&r);
|
||||
|
||||
if shadow_item != item.map(|(k, v)| (k, v.load())) {
|
||||
eprintln!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
|
||||
tree.start_read().dump(&mut std::io::stderr());
|
||||
|
||||
eprintln!("SHADOW:");
|
||||
for si in shadow {
|
||||
eprintln!("key: {:?}, val: {}", si.0, si.1);
|
||||
}
|
||||
panic!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
|
||||
}
|
||||
if item.is_none() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn random_ops() {
|
||||
const MEM_SIZE: usize = 10000000;
|
||||
let mut area = Box::new_uninit_slice(MEM_SIZE);
|
||||
|
||||
let allocator = ArtMultiSlabAllocator::new(&mut area);
|
||||
|
||||
let init_struct = TreeInitStruct::<TestKey, TestValue, _>::new(allocator);
|
||||
let tree_writer = init_struct.attach_writer();
|
||||
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
|
||||
let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
|
||||
let mut rng = rand::rng();
|
||||
for i in 0..100000 {
|
||||
let mut key: TestKey = (rng.sample(distribution) as u128).into();
|
||||
|
||||
if rng.random_bool(0.10) {
|
||||
key = TestKey::from(u128::from(&key) | 0xffffffff);
|
||||
}
|
||||
|
||||
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
|
||||
|
||||
apply_op(&op, &tree_writer, &mut shadow);
|
||||
|
||||
if i % 1000 == 0 {
|
||||
eprintln!("{i} ops processed");
|
||||
eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
test_iter(&tree_writer, &shadow);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fmt::Display;
|
||||
use std::net::IpAddr;
|
||||
use std::str::FromStr;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
@@ -10,7 +11,7 @@ use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
|
||||
use crate::models::{PageserverUtilization, ShardParameters, TenantConfig, TimelineInfo};
|
||||
use crate::shard::{ShardStripeSize, TenantShardId};
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -60,6 +61,11 @@ pub struct NodeRegisterRequest {
|
||||
pub listen_https_port: Option<u16>,
|
||||
|
||||
pub availability_zone_id: AvailabilityZone,
|
||||
|
||||
// Reachable IP address of the PS/SK registering, if known.
|
||||
// Hadron Cluster Coordiantor will update the DNS record of the registering node
|
||||
// with this IP address.
|
||||
pub node_ip_addr: Option<IpAddr>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -126,6 +132,13 @@ pub struct TenantDescribeResponse {
|
||||
pub config: TenantConfig,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantTimelineDescribeResponse {
|
||||
pub shards: Vec<TimelineInfo>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_consistent_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct NodeShardResponse {
|
||||
pub node_id: NodeId,
|
||||
@@ -538,6 +551,39 @@ pub struct SafekeeperDescribeResponse {
|
||||
pub scheduling_policy: SkSchedulingPolicy,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct TimelineSafekeeperPeer {
|
||||
pub node_id: NodeId,
|
||||
pub listen_http_addr: String,
|
||||
pub http_port: i32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SCSafekeeperTimeline {
|
||||
// SC does not know the tenant id.
|
||||
pub timeline_id: TimelineId,
|
||||
pub peers: Vec<NodeId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SCSafekeeperTimelinesResponse {
|
||||
pub timelines: Vec<SCSafekeeperTimeline>,
|
||||
pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SafekeeperTimeline {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub peers: Vec<NodeId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SafekeeperTimelinesResponse {
|
||||
pub timelines: Vec<SafekeeperTimeline>,
|
||||
pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct SafekeeperSchedulingPolicyRequest {
|
||||
pub scheduling_policy: SkSchedulingPolicy,
|
||||
|
||||
@@ -1622,6 +1622,9 @@ pub struct TimelineInfo {
|
||||
|
||||
/// Whether the timeline is invisible in synthetic size calculations.
|
||||
pub is_invisible: Option<bool>,
|
||||
// HADRON: the largest LSN below which all page updates have been included in the image layers.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_consistent_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
|
||||
@@ -110,7 +110,6 @@ fn main() -> anyhow::Result<()> {
|
||||
.allowlist_type("XLogRecPtr")
|
||||
.allowlist_type("XLogSegNo")
|
||||
.allowlist_type("TimeLineID")
|
||||
.allowlist_type("TimestampTz")
|
||||
.allowlist_type("MultiXactId")
|
||||
.allowlist_type("MultiXactOffset")
|
||||
.allowlist_type("MultiXactStatus")
|
||||
|
||||
@@ -227,8 +227,7 @@ pub mod walrecord;
|
||||
// Export some widely used datatypes that are unlikely to change across Postgres versions
|
||||
pub use v14::bindings::{
|
||||
BlockNumber, CheckPoint, ControlFileData, MultiXactId, OffsetNumber, Oid, PageHeaderData,
|
||||
RepOriginId, TimeLineID, TimestampTz, TransactionId, XLogRecPtr, XLogRecord, XLogSegNo, uint32,
|
||||
uint64,
|
||||
RepOriginId, TimeLineID, TransactionId, XLogRecPtr, XLogRecord, XLogSegNo, uint32, uint64,
|
||||
};
|
||||
// Likewise for these, although the assumption that these don't change is a little more iffy.
|
||||
pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
|
||||
|
||||
@@ -4,13 +4,14 @@
|
||||
//! TODO: Generate separate types for each supported PG version
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
use postgres_ffi_types::TimestampTz;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, PgMajorVersion,
|
||||
RepOriginId, TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants,
|
||||
RepOriginId, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants,
|
||||
};
|
||||
|
||||
#[repr(C)]
|
||||
@@ -863,7 +864,8 @@ pub mod v17 {
|
||||
XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange,
|
||||
rm_neon,
|
||||
};
|
||||
pub use crate::{TimeLineID, TimestampTz};
|
||||
pub use crate::TimeLineID;
|
||||
pub use postgres_ffi_types::TimestampTz;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
|
||||
@@ -9,10 +9,11 @@
|
||||
|
||||
use super::super::waldecoder::WalStreamDecoder;
|
||||
use super::bindings::{
|
||||
CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz,
|
||||
CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID,
|
||||
XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
|
||||
MY_PGVERSION
|
||||
};
|
||||
use postgres_ffi_types::TimestampTz;
|
||||
use super::wal_generator::LogicalMessageGenerator;
|
||||
use crate::pg_constants;
|
||||
use crate::PG_TLI;
|
||||
|
||||
@@ -11,3 +11,4 @@ pub mod forknum;
|
||||
|
||||
pub type Oid = u32;
|
||||
pub type RepOriginId = u16;
|
||||
pub type TimestampTz = i64;
|
||||
|
||||
@@ -31,6 +31,7 @@ pub struct UnreliableWrapper {
|
||||
/* BEGIN_HADRON */
|
||||
// This the probability of failure for each operation, ranged from [0, 100].
|
||||
// The probability is default to 100, which means that all operations will fail.
|
||||
// Storage will fail by probability up to attempts_to_fail times.
|
||||
attempt_failure_probability: u64,
|
||||
/* END_HADRON */
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ anyhow.workspace = true
|
||||
const_format.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
postgres_ffi_types.workspace = true
|
||||
postgres_versioninfo.workspace = true
|
||||
pq_proto.workspace = true
|
||||
tokio.workspace = true
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
use std::net::SocketAddr;
|
||||
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::TimestampTz;
|
||||
use postgres_ffi_types::TimestampTz;
|
||||
use postgres_versioninfo::PgVersionId;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::time::Instant;
|
||||
@@ -11,7 +11,7 @@ use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
|
||||
use crate::membership::Configuration;
|
||||
use crate::membership::{Configuration, SafekeeperGeneration};
|
||||
use crate::{ServerInfo, Term};
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
@@ -311,3 +311,12 @@ pub struct PullTimelineResponse {
|
||||
pub safekeeper_host: Option<String>,
|
||||
// TODO: add more fields?
|
||||
}
|
||||
|
||||
/// Response to a timeline locate request.
|
||||
/// Storcon-only API.
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct TimelineLocateResponse {
|
||||
pub generation: SafekeeperGeneration,
|
||||
pub sk_set: Vec<NodeId>,
|
||||
pub new_sk_set: Option<Vec<NodeId>>,
|
||||
}
|
||||
|
||||
@@ -47,6 +47,7 @@ where
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
pub enum DeploymentMode {
|
||||
Local,
|
||||
Dev,
|
||||
Staging,
|
||||
Prod,
|
||||
@@ -64,7 +65,7 @@ pub fn get_deployment_mode() -> Option<DeploymentMode> {
|
||||
}
|
||||
},
|
||||
Err(_) => {
|
||||
tracing::error!("DEPLOYMENT_MODE not set");
|
||||
// tracing::error!("DEPLOYMENT_MODE not set");
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
73
libs/utils/src/ip_address.rs
Normal file
73
libs/utils/src/ip_address.rs
Normal file
@@ -0,0 +1,73 @@
|
||||
use std::env::{VarError, var};
|
||||
use std::error::Error;
|
||||
use std::net::IpAddr;
|
||||
use std::str::FromStr;
|
||||
|
||||
/// Name of the environment variable containing the reachable IP address of the node. If set, the IP address contained in this
|
||||
/// environment variable is used as the reachable IP address of the pageserver or safekeeper node during node registration.
|
||||
/// In a Kubernetes environment, this environment variable should be set by Kubernetes to the Pod IP (specified in the Pod
|
||||
/// template).
|
||||
pub const HADRON_NODE_IP_ADDRESS: &str = "HADRON_NODE_IP_ADDRESS";
|
||||
|
||||
/// Read the reachable IP address of this page server from env var HADRON_NODE_IP_ADDRESS.
|
||||
/// In Kubernetes this environment variable is set to the Pod IP (specified in the Pod template).
|
||||
pub fn read_node_ip_addr_from_env() -> Result<Option<IpAddr>, Box<dyn Error>> {
|
||||
match var(HADRON_NODE_IP_ADDRESS) {
|
||||
Ok(v) => {
|
||||
if let Ok(addr) = IpAddr::from_str(&v) {
|
||||
Ok(Some(addr))
|
||||
} else {
|
||||
Err(format!("Invalid IP address string: {v}. Cannot be parsed as either an IPv4 or an IPv6 address.").into())
|
||||
}
|
||||
}
|
||||
Err(VarError::NotPresent) => Ok(None),
|
||||
Err(e) => Err(e.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::env;
|
||||
use std::net::{Ipv4Addr, Ipv6Addr};
|
||||
|
||||
#[test]
|
||||
fn test_read_node_ip_addr_from_env() {
|
||||
// SAFETY: test code
|
||||
unsafe {
|
||||
// Test with a valid IPv4 address
|
||||
env::set_var(HADRON_NODE_IP_ADDRESS, "192.168.1.1");
|
||||
let result = read_node_ip_addr_from_env().unwrap();
|
||||
assert_eq!(result, Some(IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1))));
|
||||
|
||||
// Test with a valid IPv6 address
|
||||
env::set_var(
|
||||
HADRON_NODE_IP_ADDRESS,
|
||||
"2001:0db8:85a3:0000:0000:8a2e:0370:7334",
|
||||
);
|
||||
}
|
||||
let result = read_node_ip_addr_from_env().unwrap();
|
||||
assert_eq!(
|
||||
result,
|
||||
Some(IpAddr::V6(
|
||||
Ipv6Addr::from_str("2001:0db8:85a3:0000:0000:8a2e:0370:7334").unwrap()
|
||||
))
|
||||
);
|
||||
|
||||
// Test with an invalid IP address
|
||||
// SAFETY: test code
|
||||
unsafe {
|
||||
env::set_var(HADRON_NODE_IP_ADDRESS, "invalid_ip");
|
||||
}
|
||||
let result = read_node_ip_addr_from_env();
|
||||
assert!(result.is_err());
|
||||
|
||||
// Test with no environment variable set
|
||||
// SAFETY: test code
|
||||
unsafe {
|
||||
env::remove_var(HADRON_NODE_IP_ADDRESS);
|
||||
}
|
||||
let result = read_node_ip_addr_from_env().unwrap();
|
||||
assert_eq!(result, None);
|
||||
}
|
||||
}
|
||||
@@ -26,6 +26,9 @@ pub mod auth;
|
||||
// utility functions and helper traits for unified unique id generation/serialization etc.
|
||||
pub mod id;
|
||||
|
||||
// utility functions to obtain reachable IP addresses in PS/SK nodes.
|
||||
pub mod ip_address;
|
||||
|
||||
pub mod shard;
|
||||
|
||||
mod hex;
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use std::future::Future;
|
||||
use std::pin::Pin;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
@@ -7,7 +8,7 @@ use metrics::{IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
use strum_macros::{EnumString, VariantNames};
|
||||
use tokio::time::Instant;
|
||||
use tracing::info;
|
||||
use tracing::{info, warn};
|
||||
|
||||
/// Logs a critical error, similarly to `tracing::error!`. This will:
|
||||
///
|
||||
@@ -377,10 +378,11 @@ impl std::fmt::Debug for SecretString {
|
||||
///
|
||||
/// TODO: consider upgrading this to a warning, but currently it fires too often.
|
||||
#[inline]
|
||||
pub async fn log_slow<F, O>(name: &str, threshold: Duration, f: std::pin::Pin<&mut F>) -> O
|
||||
where
|
||||
F: Future<Output = O>,
|
||||
{
|
||||
pub async fn log_slow<O>(
|
||||
name: &str,
|
||||
threshold: Duration,
|
||||
f: Pin<&mut impl Future<Output = O>>,
|
||||
) -> O {
|
||||
monitor_slow_future(
|
||||
threshold,
|
||||
threshold, // period = threshold
|
||||
@@ -394,16 +396,42 @@ where
|
||||
if !is_slow {
|
||||
return;
|
||||
}
|
||||
let elapsed = elapsed_total.as_secs_f64();
|
||||
if ready {
|
||||
info!(
|
||||
"slow {name} completed after {:.3}s",
|
||||
elapsed_total.as_secs_f64()
|
||||
);
|
||||
info!("slow {name} completed after {elapsed:.3}s");
|
||||
} else {
|
||||
info!(
|
||||
"slow {name} still running after {:.3}s",
|
||||
elapsed_total.as_secs_f64()
|
||||
);
|
||||
info!("slow {name} still running after {elapsed:.3}s");
|
||||
}
|
||||
},
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Logs a periodic warning if a future is slow to complete.
|
||||
#[inline]
|
||||
pub async fn warn_slow<O>(
|
||||
name: &str,
|
||||
threshold: Duration,
|
||||
f: Pin<&mut impl Future<Output = O>>,
|
||||
) -> O {
|
||||
monitor_slow_future(
|
||||
threshold,
|
||||
threshold, // period = threshold
|
||||
f,
|
||||
|MonitorSlowFutureCallback {
|
||||
ready,
|
||||
is_slow,
|
||||
elapsed_total,
|
||||
elapsed_since_last_callback: _,
|
||||
}| {
|
||||
if !is_slow {
|
||||
return;
|
||||
}
|
||||
let elapsed = elapsed_total.as_secs_f64();
|
||||
if ready {
|
||||
warn!("slow {name} completed after {elapsed:.3}s");
|
||||
} else {
|
||||
warn!("slow {name} still running after {elapsed:.3}s");
|
||||
}
|
||||
},
|
||||
)
|
||||
@@ -416,7 +444,7 @@ where
|
||||
pub async fn monitor_slow_future<F, O>(
|
||||
threshold: Duration,
|
||||
period: Duration,
|
||||
mut fut: std::pin::Pin<&mut F>,
|
||||
mut fut: Pin<&mut F>,
|
||||
mut cb: impl FnMut(MonitorSlowFutureCallback),
|
||||
) -> O
|
||||
where
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
|
||||
use bytes::Bytes;
|
||||
use postgres_ffi::walrecord::{MultiXactMember, describe_postgres_wal_record};
|
||||
use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId};
|
||||
use postgres_ffi::{MultiXactId, MultiXactOffset, TransactionId};
|
||||
use postgres_ffi_types::TimestampTz;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
|
||||
|
||||
@@ -54,7 +54,6 @@ pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
|
||||
pageserver_compaction.workspace = true
|
||||
pageserver_page_api.workspace = true
|
||||
peekable.workspace = true
|
||||
pem.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
@@ -67,7 +66,6 @@ postgres-types.workspace = true
|
||||
posthog_client_lite.workspace = true
|
||||
pprof.workspace = true
|
||||
pq_proto.workspace = true
|
||||
prost.workspace = true
|
||||
rand.workspace = true
|
||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||
regex.workspace = true
|
||||
|
||||
@@ -1,13 +1,16 @@
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZero;
|
||||
use std::pin::pin;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::anyhow;
|
||||
use arc_swap::ArcSwap;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::{FutureExt as _, StreamExt as _};
|
||||
use tonic::codec::CompressionEncoding;
|
||||
use tracing::instrument;
|
||||
use tracing::{debug, instrument};
|
||||
use utils::logging::warn_slow;
|
||||
|
||||
use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
|
||||
use crate::retry::Retry;
|
||||
@@ -21,28 +24,40 @@ use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
/// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up
|
||||
/// when full.
|
||||
///
|
||||
/// Normal requests are small, and we don't pipeline them, so we can afford a large number of
|
||||
/// streams per connection.
|
||||
///
|
||||
/// TODO: tune all of these constants, and consider making them configurable.
|
||||
/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels
|
||||
/// with only streams.
|
||||
const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
|
||||
const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(64).unwrap();
|
||||
|
||||
/// Max number of concurrent unary request clients per shard.
|
||||
const MAX_UNARY_CLIENTS: NonZero<usize> = NonZero::new(64).unwrap();
|
||||
/// Max number of concurrent bulk GetPage streams per channel (i.e. TCP connection). These use a
|
||||
/// dedicated channel pool with a lower client limit, to avoid TCP-level head-of-line blocking and
|
||||
/// transmission delays. This also concentrates large window sizes on a smaller set of
|
||||
/// streams/connections, presumably reducing memory use.
|
||||
const MAX_BULK_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
|
||||
|
||||
/// Max number of concurrent GetPage streams per shard. The max number of concurrent GetPage
|
||||
/// requests is given by `MAX_STREAMS * MAX_STREAM_QUEUE_DEPTH`.
|
||||
const MAX_STREAMS: NonZero<usize> = NonZero::new(64).unwrap();
|
||||
/// The batch size threshold at which a GetPage request will use the bulk stream pool.
|
||||
///
|
||||
/// The gRPC initial window size is 64 KB. Each page is 8 KB, so let's avoid increasing the window
|
||||
/// size for the normal stream pool, and route requests for >= 5 pages (>32 KB) to the bulk pool.
|
||||
const BULK_THRESHOLD_BATCH_SIZE: usize = 5;
|
||||
|
||||
/// Max number of pipelined requests per stream.
|
||||
const MAX_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(2).unwrap();
|
||||
/// The overall request call timeout, including retries and pool acquisition.
|
||||
/// TODO: should we retry forever? Should the caller decide?
|
||||
const CALL_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
/// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these
|
||||
/// are more throughput-oriented, we have a smaller limit but higher queue depth.
|
||||
const MAX_BULK_STREAMS: NonZero<usize> = NonZero::new(16).unwrap();
|
||||
/// The per-request (retry attempt) timeout, including any lazy connection establishment.
|
||||
const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
/// Max number of pipelined requests per bulk stream. These are more throughput-oriented and thus
|
||||
/// get a larger queue depth.
|
||||
const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
|
||||
/// The initial request retry backoff duration. The first retry does not back off.
|
||||
/// TODO: use a different backoff for ResourceExhausted (rate limiting)? Needs server support.
|
||||
const BASE_BACKOFF: Duration = Duration::from_millis(5);
|
||||
|
||||
/// The maximum request retry backoff duration.
|
||||
const MAX_BACKOFF: Duration = Duration::from_secs(5);
|
||||
|
||||
/// Threshold and interval for warning about slow operation.
|
||||
const SLOW_THRESHOLD: Duration = Duration::from_secs(3);
|
||||
|
||||
/// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the
|
||||
/// basic `page_api::Client` gRPC client, and supports:
|
||||
@@ -50,10 +65,19 @@ const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
|
||||
/// * Sharded tenants across multiple Pageservers.
|
||||
/// * Pooling of connections, clients, and streams for efficient resource use.
|
||||
/// * Concurrent use by many callers.
|
||||
/// * Internal handling of GetPage bidirectional streams, with pipelining and error handling.
|
||||
/// * Internal handling of GetPage bidirectional streams.
|
||||
/// * Automatic retries.
|
||||
/// * Observability.
|
||||
///
|
||||
/// The client has dedicated connection/client/stream pools per shard, for resource reuse. These
|
||||
/// pools are unbounded: we allow scaling out as many concurrent streams as needed to serve all
|
||||
/// concurrent callers, which mostly eliminates head-of-line blocking. Idle streams are fairly
|
||||
/// cheap: the server task currently uses 26 KB of memory, so we can comfortably fit 100,000
|
||||
/// concurrent idle streams (2.5 GB memory). The worst case degenerates to the old libpq case with
|
||||
/// one stream per backend, but without the TCP connection overhead. In the common case we expect
|
||||
/// significantly lower stream counts due to stream sharing, driven e.g. by idle backends, LFC hits,
|
||||
/// read coalescing, sharding (backends typically only talk to one shard at a time), etc.
|
||||
///
|
||||
/// TODO: this client does not support base backups or LSN leases, as these are only used by
|
||||
/// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards.
|
||||
pub struct PageserverClient {
|
||||
@@ -67,8 +91,6 @@ pub struct PageserverClient {
|
||||
compression: Option<CompressionEncoding>,
|
||||
/// The shards for this tenant.
|
||||
shards: ArcSwap<Shards>,
|
||||
/// The retry configuration.
|
||||
retry: Retry,
|
||||
}
|
||||
|
||||
impl PageserverClient {
|
||||
@@ -94,7 +116,6 @@ impl PageserverClient {
|
||||
auth_token,
|
||||
compression,
|
||||
shards: ArcSwap::new(Arc::new(shards)),
|
||||
retry: Retry,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -142,13 +163,15 @@ impl PageserverClient {
|
||||
&self,
|
||||
req: page_api::CheckRelExistsRequest,
|
||||
) -> tonic::Result<page_api::CheckRelExistsResponse> {
|
||||
self.retry
|
||||
.with(async |_| {
|
||||
// Relation metadata is only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
client.check_rel_exists(req).await
|
||||
})
|
||||
.await
|
||||
debug!("sending request: {req:?}");
|
||||
let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
|
||||
// Relation metadata is only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
Self::with_timeout(REQUEST_TIMEOUT, client.check_rel_exists(req)).await
|
||||
})
|
||||
.await?;
|
||||
debug!("received response: {resp:?}");
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
/// Returns the total size of a database, as # of bytes.
|
||||
@@ -157,13 +180,15 @@ impl PageserverClient {
|
||||
&self,
|
||||
req: page_api::GetDbSizeRequest,
|
||||
) -> tonic::Result<page_api::GetDbSizeResponse> {
|
||||
self.retry
|
||||
.with(async |_| {
|
||||
// Relation metadata is only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
client.get_db_size(req).await
|
||||
})
|
||||
.await
|
||||
debug!("sending request: {req:?}");
|
||||
let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
|
||||
// Relation metadata is only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
Self::with_timeout(REQUEST_TIMEOUT, client.get_db_size(req)).await
|
||||
})
|
||||
.await?;
|
||||
debug!("received response: {resp:?}");
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
/// Fetches pages. The `request_id` must be unique across all in-flight requests, and the
|
||||
@@ -193,6 +218,8 @@ impl PageserverClient {
|
||||
return Err(tonic::Status::invalid_argument("request attempt must be 0"));
|
||||
}
|
||||
|
||||
debug!("sending request: {req:?}");
|
||||
|
||||
// The shards may change while we're fetching pages. We execute the request using a stable
|
||||
// view of the shards (especially important for requests that span shards), but retry the
|
||||
// top-level (pre-split) request to pick up shard changes. This can lead to unnecessary
|
||||
@@ -201,13 +228,16 @@ impl PageserverClient {
|
||||
//
|
||||
// TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this
|
||||
// once we figure out how to handle these.
|
||||
self.retry
|
||||
.with(async |attempt| {
|
||||
let mut req = req.clone();
|
||||
req.request_id.attempt = attempt as u32;
|
||||
Self::get_page_with_shards(req, &self.shards.load_full()).await
|
||||
})
|
||||
.await
|
||||
let resp = Self::with_retries(CALL_TIMEOUT, async |attempt| {
|
||||
let mut req = req.clone();
|
||||
req.request_id.attempt = attempt as u32;
|
||||
let shards = self.shards.load_full();
|
||||
Self::with_timeout(REQUEST_TIMEOUT, Self::get_page_with_shards(req, &shards)).await
|
||||
})
|
||||
.await?;
|
||||
|
||||
debug!("received response: {resp:?}");
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
/// Fetches pages using the given shards. This uses a stable view of the shards, regardless of
|
||||
@@ -246,7 +276,7 @@ impl PageserverClient {
|
||||
req: page_api::GetPageRequest,
|
||||
shard: &Shard,
|
||||
) -> tonic::Result<page_api::GetPageResponse> {
|
||||
let stream = shard.stream(req.request_class.is_bulk()).await;
|
||||
let mut stream = shard.stream(Self::is_bulk(&req)).await?;
|
||||
let resp = stream.send(req.clone()).await?;
|
||||
|
||||
// Convert per-request errors into a tonic::Status.
|
||||
@@ -290,13 +320,15 @@ impl PageserverClient {
|
||||
&self,
|
||||
req: page_api::GetRelSizeRequest,
|
||||
) -> tonic::Result<page_api::GetRelSizeResponse> {
|
||||
self.retry
|
||||
.with(async |_| {
|
||||
// Relation metadata is only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
client.get_rel_size(req).await
|
||||
})
|
||||
.await
|
||||
debug!("sending request: {req:?}");
|
||||
let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
|
||||
// Relation metadata is only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
Self::with_timeout(REQUEST_TIMEOUT, client.get_rel_size(req)).await
|
||||
})
|
||||
.await?;
|
||||
debug!("received response: {resp:?}");
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
/// Fetches an SLRU segment.
|
||||
@@ -305,13 +337,50 @@ impl PageserverClient {
|
||||
&self,
|
||||
req: page_api::GetSlruSegmentRequest,
|
||||
) -> tonic::Result<page_api::GetSlruSegmentResponse> {
|
||||
self.retry
|
||||
.with(async |_| {
|
||||
// SLRU segments are only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
client.get_slru_segment(req).await
|
||||
})
|
||||
.await
|
||||
debug!("sending request: {req:?}");
|
||||
let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
|
||||
// SLRU segments are only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
Self::with_timeout(REQUEST_TIMEOUT, client.get_slru_segment(req)).await
|
||||
})
|
||||
.await?;
|
||||
debug!("received response: {resp:?}");
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
/// Runs the given async closure with retries up to the given timeout. Only certain gRPC status
|
||||
/// codes are retried, see [`Retry::should_retry`]. Returns `DeadlineExceeded` on timeout.
|
||||
async fn with_retries<T, F, O>(timeout: Duration, f: F) -> tonic::Result<T>
|
||||
where
|
||||
F: FnMut(usize) -> O, // pass attempt number, starting at 0
|
||||
O: Future<Output = tonic::Result<T>>,
|
||||
{
|
||||
Retry {
|
||||
timeout: Some(timeout),
|
||||
base_backoff: BASE_BACKOFF,
|
||||
max_backoff: MAX_BACKOFF,
|
||||
}
|
||||
.with(f)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Runs the given future with a timeout. Returns `DeadlineExceeded` on timeout.
|
||||
async fn with_timeout<T>(
|
||||
timeout: Duration,
|
||||
f: impl Future<Output = tonic::Result<T>>,
|
||||
) -> tonic::Result<T> {
|
||||
let started = Instant::now();
|
||||
tokio::time::timeout(timeout, f).await.map_err(|_| {
|
||||
tonic::Status::deadline_exceeded(format!(
|
||||
"request timed out after {:.3}s",
|
||||
started.elapsed().as_secs_f64()
|
||||
))
|
||||
})?
|
||||
}
|
||||
|
||||
/// Returns true if the request is considered a bulk request and should use the bulk pool.
|
||||
fn is_bulk(req: &page_api::GetPageRequest) -> bool {
|
||||
req.block_numbers.len() >= BULK_THRESHOLD_BATCH_SIZE
|
||||
}
|
||||
}
|
||||
|
||||
@@ -440,15 +509,23 @@ impl Shards {
|
||||
}
|
||||
}
|
||||
|
||||
/// A single shard. Uses dedicated resource pools with the following structure:
|
||||
/// A single shard. Has dedicated resource pools with the following structure:
|
||||
///
|
||||
/// * Channel pool: unbounded.
|
||||
/// * Unary client pool: MAX_UNARY_CLIENTS.
|
||||
/// * Stream client pool: unbounded.
|
||||
/// * Stream pool: MAX_STREAMS and MAX_STREAM_QUEUE_DEPTH.
|
||||
/// * Bulk channel pool: unbounded.
|
||||
/// * Channel pool: MAX_CLIENTS_PER_CHANNEL.
|
||||
/// * Client pool: unbounded.
|
||||
/// * Stream pool: unbounded.
|
||||
/// * Bulk channel pool: MAX_BULK_CLIENTS_PER_CHANNEL.
|
||||
/// * Bulk client pool: unbounded.
|
||||
/// * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH.
|
||||
/// * Bulk stream pool: unbounded.
|
||||
///
|
||||
/// We use a separate bulk channel pool with a lower concurrency limit for large batch requests.
|
||||
/// This avoids TCP-level head-of-line blocking, and also concentrates large window sizes on a
|
||||
/// smaller set of streams/connections, which presumably reduces memory use. Neither of these pools
|
||||
/// are bounded, nor do they pipeline requests, so the latency characteristics should be mostly
|
||||
/// similar (except for TCP transmission time).
|
||||
///
|
||||
/// TODO: since we never use bounded pools, we could consider removing the pool limiters. However,
|
||||
/// the code is fairly trivial, so we may as well keep them around for now in case we need them.
|
||||
struct Shard {
|
||||
/// The shard ID.
|
||||
id: ShardIndex,
|
||||
@@ -456,7 +533,7 @@ struct Shard {
|
||||
client_pool: Arc<ClientPool>,
|
||||
/// GetPage stream pool.
|
||||
stream_pool: Arc<StreamPool>,
|
||||
/// GetPage stream pool for bulk requests, e.g. prefetches.
|
||||
/// GetPage stream pool for bulk requests.
|
||||
bulk_stream_pool: Arc<StreamPool>,
|
||||
}
|
||||
|
||||
@@ -470,50 +547,30 @@ impl Shard {
|
||||
auth_token: Option<String>,
|
||||
compression: Option<CompressionEncoding>,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Common channel pool for unary and stream requests. Bounded by client/stream pools.
|
||||
let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?;
|
||||
|
||||
// Client pool for unary requests.
|
||||
// Shard pools for unary requests and non-bulk GetPage requests.
|
||||
let client_pool = ClientPool::new(
|
||||
channel_pool.clone(),
|
||||
ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token.clone(),
|
||||
compression,
|
||||
Some(MAX_UNARY_CLIENTS),
|
||||
None, // unbounded
|
||||
);
|
||||
let stream_pool = StreamPool::new(client_pool.clone(), None); // unbounded
|
||||
|
||||
// GetPage stream pool. Uses a dedicated client pool to avoid starving out unary clients,
|
||||
// but shares a channel pool with it (as it's unbounded).
|
||||
let stream_pool = StreamPool::new(
|
||||
ClientPool::new(
|
||||
channel_pool.clone(),
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token.clone(),
|
||||
compression,
|
||||
None, // unbounded, limited by stream pool
|
||||
),
|
||||
Some(MAX_STREAMS),
|
||||
MAX_STREAM_QUEUE_DEPTH,
|
||||
);
|
||||
|
||||
// Bulk GetPage stream pool, e.g. for prefetches. Uses dedicated channel/client/stream pools
|
||||
// to avoid head-of-line blocking of latency-sensitive requests.
|
||||
// Bulk GetPage stream pool for large batches (prefetches, sequential scans, vacuum, etc.).
|
||||
let bulk_stream_pool = StreamPool::new(
|
||||
ClientPool::new(
|
||||
ChannelPool::new(url, MAX_CLIENTS_PER_CHANNEL)?,
|
||||
ChannelPool::new(url, MAX_BULK_CLIENTS_PER_CHANNEL)?,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token,
|
||||
compression,
|
||||
None, // unbounded, limited by stream pool
|
||||
None, // unbounded,
|
||||
),
|
||||
Some(MAX_BULK_STREAMS),
|
||||
MAX_BULK_STREAM_QUEUE_DEPTH,
|
||||
None, // unbounded
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
@@ -525,19 +582,23 @@ impl Shard {
|
||||
}
|
||||
|
||||
/// Returns a pooled client for this shard.
|
||||
#[instrument(skip_all)]
|
||||
async fn client(&self) -> tonic::Result<ClientGuard> {
|
||||
self.client_pool
|
||||
.get()
|
||||
.await
|
||||
.map_err(|err| tonic::Status::internal(format!("failed to get client: {err}")))
|
||||
warn_slow(
|
||||
"client pool acquisition",
|
||||
SLOW_THRESHOLD,
|
||||
pin!(self.client_pool.get()),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream
|
||||
/// pool (e.g. for prefetches).
|
||||
async fn stream(&self, bulk: bool) -> StreamGuard {
|
||||
match bulk {
|
||||
false => self.stream_pool.get().await,
|
||||
true => self.bulk_stream_pool.get().await,
|
||||
}
|
||||
/// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk pool.
|
||||
#[instrument(skip_all, fields(bulk))]
|
||||
async fn stream(&self, bulk: bool) -> tonic::Result<StreamGuard> {
|
||||
let pool = match bulk {
|
||||
false => &self.stream_pool,
|
||||
true => &self.bulk_stream_pool,
|
||||
};
|
||||
warn_slow("stream pool acquisition", SLOW_THRESHOLD, pin!(pool.get())).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,4 +4,3 @@ mod retry;
|
||||
mod split;
|
||||
|
||||
pub use client::{PageserverClient, ShardSpec};
|
||||
pub use pageserver_api::shard::ShardStripeSize; // used in ShardSpec
|
||||
|
||||
@@ -9,19 +9,36 @@
|
||||
//!
|
||||
//! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients
|
||||
//! can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a
|
||||
//! per-channel client limit. Channels may be closed when they are no longer used by any clients.
|
||||
//! per-channel client limit. Channels are closed immediately when empty, and indirectly rely on
|
||||
//! client/stream idle timeouts.
|
||||
//!
|
||||
//! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared)
|
||||
//! channel from the ChannelPool for the client's lifetime. A client can only be acquired by a
|
||||
//! single caller at a time, and is returned to the pool when dropped. Idle clients may be removed
|
||||
//! from the pool after some time, to free up the channel.
|
||||
//! single caller at a time, and is returned to the pool when dropped. Idle clients are removed
|
||||
//! from the pool after a while to free up resources.
|
||||
//!
|
||||
//! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the
|
||||
//! ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it
|
||||
//! returns a guard that can be used to send a single request, to properly enforce queue depth and
|
||||
//! route responses. Internally, the pool will reuse or spin up a suitable stream for the request,
|
||||
//! possibly pipelining multiple requests from multiple callers on the same stream (up to some
|
||||
//! queue depth). Idle streams may be removed from the pool after a while to free up the client.
|
||||
//! ClientPool for the stream's lifetime. A stream can only be acquired by a single caller at a
|
||||
//! time, and is returned to the pool when dropped. Idle streams are removed from the pool after
|
||||
//! a while to free up resources.
|
||||
//!
|
||||
//! The stream only supports sending a single, synchronous request at a time, and does not support
|
||||
//! pipelining multiple requests from different callers onto the same stream -- instead, we scale
|
||||
//! out concurrent streams to improve throughput. There are many reasons for this design choice:
|
||||
//!
|
||||
//! * It (mostly) eliminates head-of-line blocking. A single stream is processed sequentially by
|
||||
//! a single server task, which may block e.g. on layer downloads, LSN waits, etc.
|
||||
//!
|
||||
//! * Cancellation becomes trivial, by closing the stream. Otherwise, if a caller goes away
|
||||
//! (e.g. because of a timeout), the request would still be processed by the server and block
|
||||
//! requests behind it in the stream. It might even block its own timeout retry.
|
||||
//!
|
||||
//! * Stream scheduling becomes significantly simpler and cheaper.
|
||||
//!
|
||||
//! * Individual callers can still use client-side batching for pipelining.
|
||||
//!
|
||||
//! * Idle streams are cheap. Benchmarks show that an idle GetPage stream takes up about 26 KB
|
||||
//! per stream (2.5 GB for 100,000 streams), so we can afford to scale out.
|
||||
//!
|
||||
//! Each channel corresponds to one TCP connection. Each client unary request and each stream
|
||||
//! corresponds to one HTTP/2 stream and server task.
|
||||
@@ -29,33 +46,31 @@
|
||||
//! TODO: error handling (including custom error types).
|
||||
//! TODO: observability.
|
||||
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::collections::BTreeMap;
|
||||
use std::num::NonZero;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::pin::Pin;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Mutex, Weak};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use futures::StreamExt as _;
|
||||
use tokio::sync::mpsc::{Receiver, Sender};
|
||||
use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
|
||||
use futures::{Stream, StreamExt as _};
|
||||
use tokio::sync::{OwnedSemaphorePermit, Semaphore, watch};
|
||||
use tokio_stream::wrappers::WatchStream;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tonic::codec::CompressionEncoding;
|
||||
use tonic::transport::{Channel, Endpoint};
|
||||
use tracing::{error, warn};
|
||||
|
||||
use pageserver_page_api as page_api;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
/// Reap channels/clients/streams that have been idle for this long.
|
||||
/// Reap clients/streams that have been idle for this long. Channels are reaped immediately when
|
||||
/// empty, and indirectly rely on the client/stream idle timeouts.
|
||||
///
|
||||
/// TODO: this is per-pool. For nested pools, it can take up to 3x as long for a TCP connection to
|
||||
/// be reaped. First, we must wait for an idle stream to be reaped, which marks its client as idle.
|
||||
/// Then, we must wait for the idle client to be reaped, which marks its channel as idle. Then, we
|
||||
/// must wait for the idle channel to be reaped. Is that a problem? Maybe not, we just have to
|
||||
/// account for it when setting the reap threshold. Alternatively, we can immediately reap empty
|
||||
/// channels, and/or stream pool clients.
|
||||
/// A stream's client will be reaped after 2x the idle threshold (first stream the client), but
|
||||
/// that's okay -- if the stream closes abruptly (e.g. due to timeout or cancellation), we want to
|
||||
/// keep its client around in the pool for a while.
|
||||
const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) {
|
||||
false => Duration::from_secs(180),
|
||||
true => Duration::from_secs(1), // exercise reaping in tests
|
||||
@@ -83,8 +98,6 @@ pub struct ChannelPool {
|
||||
max_clients_per_channel: NonZero<usize>,
|
||||
/// Open channels.
|
||||
channels: Mutex<BTreeMap<ChannelID, ChannelEntry>>,
|
||||
/// Reaps idle channels.
|
||||
idle_reaper: Reaper,
|
||||
/// Channel ID generator.
|
||||
next_channel_id: AtomicUsize,
|
||||
}
|
||||
@@ -96,9 +109,6 @@ struct ChannelEntry {
|
||||
channel: Channel,
|
||||
/// Number of clients using this channel.
|
||||
clients: usize,
|
||||
/// The channel has been idle (no clients) since this time. None if channel is in use.
|
||||
/// INVARIANT: Some if clients == 0, otherwise None.
|
||||
idle_since: Option<Instant>,
|
||||
}
|
||||
|
||||
impl ChannelPool {
|
||||
@@ -108,15 +118,12 @@ impl ChannelPool {
|
||||
E: TryInto<Endpoint> + Send + Sync + 'static,
|
||||
<E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
|
||||
{
|
||||
let pool = Arc::new(Self {
|
||||
Ok(Arc::new(Self {
|
||||
endpoint: endpoint.try_into()?,
|
||||
max_clients_per_channel,
|
||||
channels: Mutex::default(),
|
||||
idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
|
||||
next_channel_id: AtomicUsize::default(),
|
||||
});
|
||||
pool.idle_reaper.spawn(&pool);
|
||||
Ok(pool)
|
||||
}))
|
||||
}
|
||||
|
||||
/// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel.
|
||||
@@ -137,22 +144,17 @@ impl ChannelPool {
|
||||
let mut channels = self.channels.lock().unwrap();
|
||||
|
||||
// Try to find an existing channel with available capacity. We check entries in BTreeMap
|
||||
// order, to fill up the lower-ordered channels first. The ClientPool also prefers clients
|
||||
// with lower-ordered channel IDs first. This will cluster clients in lower-ordered
|
||||
// order, to fill up the lower-ordered channels first. The client/stream pools also prefer
|
||||
// clients with lower-ordered channel IDs first. This will cluster clients in lower-ordered
|
||||
// channels, and free up higher-ordered channels such that they can be reaped.
|
||||
for (&id, entry) in channels.iter_mut() {
|
||||
assert!(
|
||||
entry.clients <= self.max_clients_per_channel.get(),
|
||||
"channel overflow"
|
||||
);
|
||||
assert_eq!(
|
||||
entry.idle_since.is_some(),
|
||||
entry.clients == 0,
|
||||
"incorrect channel idle state"
|
||||
);
|
||||
assert_ne!(entry.clients, 0, "empty channel not reaped");
|
||||
if entry.clients < self.max_clients_per_channel.get() {
|
||||
entry.clients += 1;
|
||||
entry.idle_since = None;
|
||||
return ChannelGuard {
|
||||
pool: Arc::downgrade(self),
|
||||
id,
|
||||
@@ -169,7 +171,6 @@ impl ChannelPool {
|
||||
let entry = ChannelEntry {
|
||||
channel: channel.clone(),
|
||||
clients: 1, // account for the guard below
|
||||
idle_since: None,
|
||||
};
|
||||
channels.insert(id, entry);
|
||||
|
||||
@@ -181,20 +182,6 @@ impl ChannelPool {
|
||||
}
|
||||
}
|
||||
|
||||
impl Reapable for ChannelPool {
|
||||
/// Reaps channels that have been idle since before the cutoff.
|
||||
fn reap_idle(&self, cutoff: Instant) {
|
||||
self.channels.lock().unwrap().retain(|_, entry| {
|
||||
let Some(idle_since) = entry.idle_since else {
|
||||
assert_ne!(entry.clients, 0, "empty channel not marked idle");
|
||||
return true;
|
||||
};
|
||||
assert_eq!(entry.clients, 0, "idle channel has clients");
|
||||
idle_since >= cutoff
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`,
|
||||
/// since the gRPC client requires an owned `Channel`.
|
||||
pub struct ChannelGuard {
|
||||
@@ -211,7 +198,7 @@ impl ChannelGuard {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the channel to the pool.
|
||||
/// Returns the channel to the pool. The channel is closed when empty.
|
||||
impl Drop for ChannelGuard {
|
||||
fn drop(&mut self) {
|
||||
let Some(pool) = self.pool.upgrade() else {
|
||||
@@ -220,11 +207,12 @@ impl Drop for ChannelGuard {
|
||||
|
||||
let mut channels = pool.channels.lock().unwrap();
|
||||
let entry = channels.get_mut(&self.id).expect("unknown channel");
|
||||
assert!(entry.idle_since.is_none(), "active channel marked idle");
|
||||
assert!(entry.clients > 0, "channel underflow");
|
||||
entry.clients -= 1;
|
||||
|
||||
// Reap empty channels immediately.
|
||||
if entry.clients == 0 {
|
||||
entry.idle_since = Some(Instant::now()); // mark channel as idle
|
||||
channels.remove(&self.id);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -253,8 +241,7 @@ pub struct ClientPool {
|
||||
///
|
||||
/// The first client in the map will be acquired next. The map is sorted by client ID, which in
|
||||
/// turn is sorted by its channel ID, such that we prefer acquiring idle clients from
|
||||
/// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle
|
||||
/// clients are reaped.
|
||||
/// lower-ordered channels. This allows us to free up and reap higher-ordered channels.
|
||||
idle: Mutex<BTreeMap<ClientID, ClientEntry>>,
|
||||
/// Reaps idle clients.
|
||||
idle_reaper: Reaper,
|
||||
@@ -310,7 +297,7 @@ impl ClientPool {
|
||||
/// This is moderately performance-sensitive. It is called for every unary request, but these
|
||||
/// establish a new gRPC stream per request so they're already expensive. GetPage requests use
|
||||
/// the `StreamPool` instead.
|
||||
pub async fn get(self: &Arc<Self>) -> anyhow::Result<ClientGuard> {
|
||||
pub async fn get(self: &Arc<Self>) -> tonic::Result<ClientGuard> {
|
||||
// Acquire a permit if the pool is bounded.
|
||||
let mut permit = None;
|
||||
if let Some(limiter) = self.limiter.clone() {
|
||||
@@ -328,7 +315,7 @@ impl ClientPool {
|
||||
});
|
||||
}
|
||||
|
||||
// Slow path: construct a new client.
|
||||
// Construct a new client.
|
||||
let mut channel_guard = self.channel_pool.get();
|
||||
let client = page_api::Client::new(
|
||||
channel_guard.take(),
|
||||
@@ -337,7 +324,8 @@ impl ClientPool {
|
||||
self.shard_id,
|
||||
self.auth_token.clone(),
|
||||
self.compression,
|
||||
)?;
|
||||
)
|
||||
.map_err(|err| tonic::Status::internal(format!("failed to create client: {err}")))?;
|
||||
|
||||
Ok(ClientGuard {
|
||||
pool: Arc::downgrade(self),
|
||||
@@ -407,287 +395,187 @@ impl Drop for ClientGuard {
|
||||
/// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream
|
||||
/// acquires a client from the inner `ClientPool` for the stream's lifetime.
|
||||
///
|
||||
/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send
|
||||
/// a single request and await the response. Internally, requests are multiplexed across streams and
|
||||
/// channels. This allows proper queue depth enforcement and response routing.
|
||||
/// Individual streams only send a single request at a time, and do not pipeline multiple callers
|
||||
/// onto the same stream. Instead, we scale out the number of concurrent streams. This is primarily
|
||||
/// to eliminate head-of-line blocking. See the module documentation for more details.
|
||||
///
|
||||
/// TODO: consider making this generic over request and response types; not currently needed.
|
||||
pub struct StreamPool {
|
||||
/// The client pool to acquire clients from. Must be unbounded.
|
||||
client_pool: Arc<ClientPool>,
|
||||
/// All pooled streams.
|
||||
/// Idle pooled streams. Acquired streams are removed from here and returned on drop.
|
||||
///
|
||||
/// Incoming requests will be sent over an existing stream with available capacity. If all
|
||||
/// streams are full, a new one is spun up and added to the pool (up to `max_streams`). Each
|
||||
/// stream has an associated Tokio task that processes requests and responses.
|
||||
streams: Mutex<HashMap<StreamID, StreamEntry>>,
|
||||
/// The max number of concurrent streams, or None if unbounded.
|
||||
max_streams: Option<NonZero<usize>>,
|
||||
/// The max number of concurrent requests per stream.
|
||||
max_queue_depth: NonZero<usize>,
|
||||
/// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`.
|
||||
/// None if the pool is unbounded.
|
||||
/// The first stream in the map will be acquired next. The map is sorted by stream ID, which is
|
||||
/// equivalent to the client ID and in turn sorted by its channel ID. This way we prefer
|
||||
/// acquiring idle streams from lower-ordered channels, which allows us to free up and reap
|
||||
/// higher-ordered channels.
|
||||
idle: Mutex<BTreeMap<StreamID, StreamEntry>>,
|
||||
/// Limits the max number of concurrent streams. None if the pool is unbounded.
|
||||
limiter: Option<Arc<Semaphore>>,
|
||||
/// Reaps idle streams.
|
||||
idle_reaper: Reaper,
|
||||
/// Stream ID generator.
|
||||
next_stream_id: AtomicUsize,
|
||||
}
|
||||
|
||||
type StreamID = usize;
|
||||
type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>;
|
||||
type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>;
|
||||
type ResponseSender = oneshot::Sender<tonic::Result<page_api::GetPageResponse>>;
|
||||
/// The stream ID. Reuses the inner client ID.
|
||||
type StreamID = ClientID;
|
||||
|
||||
/// A pooled stream.
|
||||
struct StreamEntry {
|
||||
/// Sends caller requests to the stream task. The stream task exits when this is dropped.
|
||||
sender: RequestSender,
|
||||
/// Number of in-flight requests on this stream.
|
||||
queue_depth: usize,
|
||||
/// The time when this stream went idle (queue_depth == 0).
|
||||
/// INVARIANT: Some if queue_depth == 0, otherwise None.
|
||||
idle_since: Option<Instant>,
|
||||
/// The bidirectional stream.
|
||||
stream: BiStream,
|
||||
/// The time when this stream was last used, i.e. when it was put back into `StreamPool::idle`.
|
||||
idle_since: Instant,
|
||||
}
|
||||
|
||||
/// A bidirectional GetPage stream and its client. Can send requests and receive responses.
|
||||
struct BiStream {
|
||||
/// The owning client. Holds onto the channel slot while the stream is alive.
|
||||
client: ClientGuard,
|
||||
/// Stream for sending requests. Uses a watch channel, so it can only send a single request at a
|
||||
/// time, and the caller must await the response before sending another request. This is
|
||||
/// enforced by `StreamGuard::send`.
|
||||
sender: watch::Sender<page_api::GetPageRequest>,
|
||||
/// Stream for receiving responses.
|
||||
receiver: Pin<Box<dyn Stream<Item = tonic::Result<page_api::GetPageResponse>> + Send>>,
|
||||
}
|
||||
|
||||
impl StreamPool {
|
||||
/// Creates a new stream pool, using the given client pool. It will send up to `max_queue_depth`
|
||||
/// concurrent requests on each stream, and use up to `max_streams` concurrent streams.
|
||||
/// Creates a new stream pool, using the given client pool. It will use up to `max_streams`
|
||||
/// concurrent streams.
|
||||
///
|
||||
/// The client pool must be unbounded. The stream pool will enforce its own limits, and because
|
||||
/// streams are long-lived they can cause persistent starvation if they exhaust the client pool.
|
||||
/// The stream pool should generally have its own dedicated client pool (but it can share a
|
||||
/// channel pool with others since these are always unbounded).
|
||||
pub fn new(
|
||||
client_pool: Arc<ClientPool>,
|
||||
max_streams: Option<NonZero<usize>>,
|
||||
max_queue_depth: NonZero<usize>,
|
||||
) -> Arc<Self> {
|
||||
pub fn new(client_pool: Arc<ClientPool>, max_streams: Option<NonZero<usize>>) -> Arc<Self> {
|
||||
assert!(client_pool.limiter.is_none(), "bounded client pool");
|
||||
let pool = Arc::new(Self {
|
||||
client_pool,
|
||||
streams: Mutex::default(),
|
||||
limiter: max_streams.map(|max_streams| {
|
||||
Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get()))
|
||||
}),
|
||||
max_streams,
|
||||
max_queue_depth,
|
||||
idle: Mutex::default(),
|
||||
limiter: max_streams.map(|max_streams| Arc::new(Semaphore::new(max_streams.get()))),
|
||||
idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
|
||||
next_stream_id: AtomicUsize::default(),
|
||||
});
|
||||
pool.idle_reaper.spawn(&pool);
|
||||
pool
|
||||
}
|
||||
|
||||
/// Acquires an available stream from the pool, or spins up a new stream async if all streams
|
||||
/// are full. Returns a guard that can be used to send a single request on the stream and await
|
||||
/// the response, with queue depth quota already acquired. Blocks if the pool is at capacity
|
||||
/// (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight).
|
||||
/// Acquires an available stream from the pool, or spins up a new stream if all streams are
|
||||
/// full. Returns a guard that can be used to send requests and await the responses. Blocks if
|
||||
/// the pool is full.
|
||||
///
|
||||
/// This is very performance-sensitive, as it is on the GetPage hot path.
|
||||
///
|
||||
/// TODO: this must do something more sophisticated for performance. We want:
|
||||
///
|
||||
/// * Cheap, concurrent access in the common case where we can use a pooled stream.
|
||||
/// * Quick acquisition of pooled streams with available capacity.
|
||||
/// * Prefer streams that belong to lower-numbered channels, to reap idle channels.
|
||||
/// * Prefer filling up existing streams' queue depth before spinning up new streams.
|
||||
/// * Don't hold a lock while spinning up new streams.
|
||||
/// * Allow concurrent clients to join onto streams while they're spun up.
|
||||
/// * Allow spinning up multiple streams concurrently, but don't overshoot limits.
|
||||
///
|
||||
/// For now, we just do something simple but inefficient (linear scan under mutex).
|
||||
pub async fn get(self: &Arc<Self>) -> StreamGuard {
|
||||
/// TODO: is a `Mutex<BTreeMap>` performant enough? Will it become too contended? We can't
|
||||
/// trivially use e.g. DashMap or sharding, because we want to pop lower-ordered streams first
|
||||
/// to free up higher-ordered channels.
|
||||
pub async fn get(self: &Arc<Self>) -> tonic::Result<StreamGuard> {
|
||||
// Acquire a permit if the pool is bounded.
|
||||
let mut permit = None;
|
||||
if let Some(limiter) = self.limiter.clone() {
|
||||
permit = Some(limiter.acquire_owned().await.expect("never closed"));
|
||||
}
|
||||
let mut streams = self.streams.lock().unwrap();
|
||||
|
||||
// Look for a pooled stream with available capacity.
|
||||
for (&id, entry) in streams.iter_mut() {
|
||||
assert!(
|
||||
entry.queue_depth <= self.max_queue_depth.get(),
|
||||
"stream queue overflow"
|
||||
);
|
||||
assert_eq!(
|
||||
entry.idle_since.is_some(),
|
||||
entry.queue_depth == 0,
|
||||
"incorrect stream idle state"
|
||||
);
|
||||
if entry.queue_depth < self.max_queue_depth.get() {
|
||||
entry.queue_depth += 1;
|
||||
entry.idle_since = None;
|
||||
return StreamGuard {
|
||||
pool: Arc::downgrade(self),
|
||||
id,
|
||||
sender: entry.sender.clone(),
|
||||
permit,
|
||||
};
|
||||
}
|
||||
// Fast path: acquire an idle stream from the pool.
|
||||
if let Some((_, entry)) = self.idle.lock().unwrap().pop_first() {
|
||||
return Ok(StreamGuard {
|
||||
pool: Arc::downgrade(self),
|
||||
stream: Some(entry.stream),
|
||||
can_reuse: true,
|
||||
permit,
|
||||
});
|
||||
}
|
||||
|
||||
// No available stream, spin up a new one. We install the stream entry in the pool first and
|
||||
// return the guard, while spinning up the stream task async. This allows other callers to
|
||||
// join onto this stream and also create additional streams concurrently if this fills up.
|
||||
let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
|
||||
let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get());
|
||||
let entry = StreamEntry {
|
||||
sender: req_tx.clone(),
|
||||
queue_depth: 1, // reserve quota for this caller
|
||||
idle_since: None,
|
||||
};
|
||||
streams.insert(id, entry);
|
||||
// Spin up a new stream. Uses a watch channel to send a single request at a time, since
|
||||
// `StreamGuard::send` enforces this anyway and it avoids unnecessary channel overhead.
|
||||
let mut client = self.client_pool.get().await?;
|
||||
|
||||
if let Some(max_streams) = self.max_streams {
|
||||
assert!(streams.len() <= max_streams.get(), "stream overflow");
|
||||
};
|
||||
let (req_tx, req_rx) = watch::channel(page_api::GetPageRequest::default());
|
||||
let req_stream = WatchStream::from_changes(req_rx);
|
||||
let resp_stream = client.get_pages(req_stream).await?;
|
||||
|
||||
let client_pool = self.client_pool.clone();
|
||||
let pool = Arc::downgrade(self);
|
||||
|
||||
tokio::spawn(async move {
|
||||
if let Err(err) = Self::run_stream(client_pool, req_rx).await {
|
||||
error!("stream failed: {err}");
|
||||
}
|
||||
// Remove stream from pool on exit. Weak reference to avoid holding the pool alive.
|
||||
if let Some(pool) = pool.upgrade() {
|
||||
let entry = pool.streams.lock().unwrap().remove(&id);
|
||||
assert!(entry.is_some(), "unknown stream ID: {id}");
|
||||
}
|
||||
});
|
||||
|
||||
StreamGuard {
|
||||
Ok(StreamGuard {
|
||||
pool: Arc::downgrade(self),
|
||||
id,
|
||||
sender: req_tx,
|
||||
stream: Some(BiStream {
|
||||
client,
|
||||
sender: req_tx,
|
||||
receiver: Box::pin(resp_stream),
|
||||
}),
|
||||
can_reuse: true,
|
||||
permit,
|
||||
}
|
||||
}
|
||||
|
||||
/// Runs a stream task. This acquires a client from the `ClientPool` and establishes a
|
||||
/// bidirectional GetPage stream, then forwards requests and responses between callers and the
|
||||
/// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be
|
||||
/// atomic with pool stream acquisition.
|
||||
///
|
||||
/// The task exits when the request channel is closed, or on a stream error. The caller is
|
||||
/// responsible for removing the stream from the pool on exit.
|
||||
async fn run_stream(
|
||||
client_pool: Arc<ClientPool>,
|
||||
mut caller_rx: RequestReceiver,
|
||||
) -> anyhow::Result<()> {
|
||||
// Acquire a client from the pool and create a stream.
|
||||
let mut client = client_pool.get().await?;
|
||||
|
||||
// NB: use an unbounded channel such that the stream send never blocks. Otherwise, we could
|
||||
// theoretically deadlock if both the client and server block on sends (since we're not
|
||||
// reading responses while sending). This is unlikely to happen due to gRPC/TCP buffers and
|
||||
// low queue depths, but it was seen to happen with the libpq protocol so better safe than
|
||||
// sorry. It should never buffer more than the queue depth anyway, but using an unbounded
|
||||
// channel guarantees that it will never block.
|
||||
let (req_tx, req_rx) = mpsc::unbounded_channel();
|
||||
let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx);
|
||||
let mut resp_stream = client.get_pages(req_stream).await?;
|
||||
|
||||
// Track caller response channels by request ID. If the task returns early, these response
|
||||
// channels will be dropped and the waiting callers will receive an error.
|
||||
//
|
||||
// NB: this will leak entries if the server doesn't respond to a request (by request ID).
|
||||
// It shouldn't happen, and if it does it will often hold onto queue depth quota anyway and
|
||||
// block further use. But we could consider reaping closed channels after some time.
|
||||
let mut callers = HashMap::new();
|
||||
|
||||
// Process requests and responses.
|
||||
loop {
|
||||
tokio::select! {
|
||||
// Receive requests from callers and send them to the stream.
|
||||
req = caller_rx.recv() => {
|
||||
// Shut down if request channel is closed.
|
||||
let Some((req, resp_tx)) = req else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
// Store the response channel by request ID.
|
||||
if callers.contains_key(&req.request_id) {
|
||||
// Error on request ID duplicates. Ignore callers that went away.
|
||||
_ = resp_tx.send(Err(tonic::Status::invalid_argument(
|
||||
format!("duplicate request ID: {}", req.request_id),
|
||||
)));
|
||||
continue;
|
||||
}
|
||||
callers.insert(req.request_id, resp_tx);
|
||||
|
||||
// Send the request on the stream. Bail out if the stream is closed.
|
||||
req_tx.send(req).map_err(|_| {
|
||||
tonic::Status::unavailable("stream closed")
|
||||
})?;
|
||||
}
|
||||
|
||||
// Receive responses from the stream and send them to callers.
|
||||
resp = resp_stream.next() => {
|
||||
// Shut down if the stream is closed, and bail out on stream errors.
|
||||
let Some(resp) = resp.transpose()? else {
|
||||
return Ok(())
|
||||
};
|
||||
|
||||
// Send the response to the caller. Ignore errors if the caller went away.
|
||||
let Some(resp_tx) = callers.remove(&resp.request_id) else {
|
||||
warn!("received response for unknown request ID: {}", resp.request_id);
|
||||
continue;
|
||||
};
|
||||
_ = resp_tx.send(Ok(resp));
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Reapable for StreamPool {
|
||||
/// Reaps streams that have been idle since before the cutoff.
|
||||
fn reap_idle(&self, cutoff: Instant) {
|
||||
self.streams.lock().unwrap().retain(|_, entry| {
|
||||
let Some(idle_since) = entry.idle_since else {
|
||||
assert_ne!(entry.queue_depth, 0, "empty stream not marked idle");
|
||||
return true;
|
||||
};
|
||||
assert_eq!(entry.queue_depth, 0, "idle stream has requests");
|
||||
idle_since >= cutoff
|
||||
});
|
||||
self.idle
|
||||
.lock()
|
||||
.unwrap()
|
||||
.retain(|_, entry| entry.idle_since >= cutoff);
|
||||
}
|
||||
}
|
||||
|
||||
/// A pooled stream reference. Can be used to send a single request, to properly enforce queue
|
||||
/// depth. Queue depth is already reserved and will be returned on drop.
|
||||
/// A stream acquired from the pool. Returned to the pool when dropped, unless there are still
|
||||
/// in-flight requests on the stream, or the stream failed.
|
||||
pub struct StreamGuard {
|
||||
pool: Weak<StreamPool>,
|
||||
id: StreamID,
|
||||
sender: RequestSender,
|
||||
stream: Option<BiStream>, // Some until dropped
|
||||
can_reuse: bool, // returned to pool if true
|
||||
permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
|
||||
}
|
||||
|
||||
impl StreamGuard {
|
||||
/// Sends a request on the stream and awaits the response. Consumes the guard, since it's only
|
||||
/// valid for a single request (to enforce queue depth). This also drops the guard on return and
|
||||
/// returns the queue depth quota to the pool.
|
||||
/// Sends a request on the stream and awaits the response. If the future is dropped before it
|
||||
/// resolves (e.g. due to a timeout or cancellation), the stream will be closed to cancel the
|
||||
/// request and is not returned to the pool. The same is true if the stream errors, in which
|
||||
/// case the caller can't send further requests on the stream.
|
||||
///
|
||||
/// The `GetPageRequest::request_id` must be unique across in-flight requests.
|
||||
/// We only support sending a single request at a time, to eliminate head-of-line blocking. See
|
||||
/// module documentation for details.
|
||||
///
|
||||
/// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status`
|
||||
/// to avoid tearing down the stream for per-request errors. Callers must check this.
|
||||
pub async fn send(
|
||||
self,
|
||||
&mut self,
|
||||
req: page_api::GetPageRequest,
|
||||
) -> tonic::Result<page_api::GetPageResponse> {
|
||||
let (resp_tx, resp_rx) = oneshot::channel();
|
||||
let req_id = req.request_id;
|
||||
let stream = self.stream.as_mut().expect("not dropped");
|
||||
|
||||
self.sender
|
||||
.send((req, resp_tx))
|
||||
.await
|
||||
// Mark the stream as not reusable while the request is in flight. We can't return the
|
||||
// stream to the pool until we receive the response, to avoid head-of-line blocking and
|
||||
// stale responses. Failed streams can't be reused either.
|
||||
if !self.can_reuse {
|
||||
return Err(tonic::Status::internal("stream can't be reused"));
|
||||
}
|
||||
self.can_reuse = false;
|
||||
|
||||
// Send the request and receive the response.
|
||||
//
|
||||
// NB: this uses a watch channel, so it's unsafe to change this code to pipeline requests.
|
||||
stream
|
||||
.sender
|
||||
.send(req)
|
||||
.map_err(|_| tonic::Status::unavailable("stream closed"))?;
|
||||
|
||||
resp_rx
|
||||
let resp = stream
|
||||
.receiver
|
||||
.next()
|
||||
.await
|
||||
.map_err(|_| tonic::Status::unavailable("stream closed"))?
|
||||
.ok_or_else(|| tonic::Status::unavailable("stream closed"))??;
|
||||
|
||||
if resp.request_id != req_id {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"response ID {} does not match request ID {}",
|
||||
resp.request_id, req_id
|
||||
)));
|
||||
}
|
||||
|
||||
// Success, mark the stream as reusable.
|
||||
self.can_reuse = true;
|
||||
|
||||
Ok(resp)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -697,26 +585,21 @@ impl Drop for StreamGuard {
|
||||
return; // pool was dropped
|
||||
};
|
||||
|
||||
// Release the queue depth reservation on drop. This can prematurely decrement it if dropped
|
||||
// before the response is received, but that's okay.
|
||||
//
|
||||
// TODO: actually, it's probably not okay. Queue depth release should be moved into the
|
||||
// stream task, such that it continues to account for the queue depth slot until the server
|
||||
// responds. Otherwise, if a slow request times out and keeps blocking the stream, the
|
||||
// server will keep waiting on it and we can pile on subsequent requests (including the
|
||||
// timeout retry) in the same stream and get blocked. But we may also want to avoid blocking
|
||||
// requests on e.g. LSN waits and layer downloads, instead returning early to free up the
|
||||
// stream. Or just scale out streams with a queue depth of 1 to sidestep all head-of-line
|
||||
// blocking. TBD.
|
||||
let mut streams = pool.streams.lock().unwrap();
|
||||
let entry = streams.get_mut(&self.id).expect("unknown stream");
|
||||
assert!(entry.idle_since.is_none(), "active stream marked idle");
|
||||
assert!(entry.queue_depth > 0, "stream queue underflow");
|
||||
entry.queue_depth -= 1;
|
||||
if entry.queue_depth == 0 {
|
||||
entry.idle_since = Some(Instant::now()); // mark stream as idle
|
||||
// If the stream isn't reusable, it can't be returned to the pool.
|
||||
if !self.can_reuse {
|
||||
return;
|
||||
}
|
||||
|
||||
// Place the idle stream back into the pool.
|
||||
let entry = StreamEntry {
|
||||
stream: self.stream.take().expect("dropped once"),
|
||||
idle_since: Instant::now(),
|
||||
};
|
||||
pool.idle
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(entry.stream.client.id, entry);
|
||||
|
||||
_ = self.permit; // returned on drop, referenced for visibility
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use futures::future::pending;
|
||||
use tokio::time::Instant;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
@@ -8,60 +9,54 @@ use utils::backoff::exponential_backoff_duration;
|
||||
/// A retry handler for Pageserver gRPC requests.
|
||||
///
|
||||
/// This is used instead of backoff::retry for better control and observability.
|
||||
pub struct Retry;
|
||||
pub struct Retry {
|
||||
/// Timeout across all retry attempts. If None, retries forever.
|
||||
pub timeout: Option<Duration>,
|
||||
/// The initial backoff duration. The first retry does not use a backoff.
|
||||
pub base_backoff: Duration,
|
||||
/// The maximum backoff duration.
|
||||
pub max_backoff: Duration,
|
||||
}
|
||||
|
||||
impl Retry {
|
||||
/// The per-request timeout.
|
||||
// TODO: tune these, and/or make them configurable. Should we retry forever?
|
||||
const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
/// The total timeout across all attempts
|
||||
const TOTAL_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
/// The initial backoff duration.
|
||||
const BASE_BACKOFF: Duration = Duration::from_millis(10);
|
||||
/// The maximum backoff duration.
|
||||
const MAX_BACKOFF: Duration = Duration::from_secs(10);
|
||||
/// If true, log successful requests. For debugging.
|
||||
const LOG_SUCCESS: bool = false;
|
||||
|
||||
/// Runs the given async closure with timeouts and retries (exponential backoff), passing the
|
||||
/// attempt number starting at 0. Logs errors, using the current tracing span for context.
|
||||
/// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
|
||||
/// using the current tracing span for context.
|
||||
///
|
||||
/// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default
|
||||
/// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`].
|
||||
/// Only certain gRPC status codes are retried, see [`Self::should_retry`].
|
||||
pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
|
||||
where
|
||||
F: FnMut(usize) -> O, // takes attempt number, starting at 0
|
||||
F: FnMut(usize) -> O, // pass attempt number, starting at 0
|
||||
O: Future<Output = tonic::Result<T>>,
|
||||
{
|
||||
let started = Instant::now();
|
||||
let deadline = started + Self::TOTAL_TIMEOUT;
|
||||
let deadline = self.timeout.map(|timeout| started + timeout);
|
||||
let mut last_error = None;
|
||||
let mut retries = 0;
|
||||
loop {
|
||||
// Set up a future to wait for the backoff (if any) and run the request with a timeout.
|
||||
// Set up a future to wait for the backoff, if any, and run the closure.
|
||||
let backoff_and_try = async {
|
||||
// NB: sleep() always sleeps 1ms, even when given a 0 argument. See:
|
||||
// https://github.com/tokio-rs/tokio/issues/6866
|
||||
if let Some(backoff) = Self::backoff_duration(retries) {
|
||||
if let Some(backoff) = self.backoff_duration(retries) {
|
||||
tokio::time::sleep(backoff).await;
|
||||
}
|
||||
|
||||
let request_started = Instant::now();
|
||||
tokio::time::timeout(Self::REQUEST_TIMEOUT, f(retries))
|
||||
.await
|
||||
.map_err(|_| {
|
||||
tonic::Status::deadline_exceeded(format!(
|
||||
"request timed out after {:.3}s",
|
||||
request_started.elapsed().as_secs_f64()
|
||||
))
|
||||
})?
|
||||
f(retries).await
|
||||
};
|
||||
|
||||
// Wait for the backoff and request, or bail out if the total timeout is exceeded.
|
||||
// Set up a future for the timeout, if any.
|
||||
let timeout = async {
|
||||
match deadline {
|
||||
Some(deadline) => tokio::time::sleep_until(deadline).await,
|
||||
None => pending().await,
|
||||
}
|
||||
};
|
||||
|
||||
// Wait for the backoff and request, or bail out if the timeout is exceeded.
|
||||
let result = tokio::select! {
|
||||
result = backoff_and_try => result,
|
||||
|
||||
_ = tokio::time::sleep_until(deadline) => {
|
||||
_ = timeout => {
|
||||
let last_error = last_error.unwrap_or_else(|| {
|
||||
tonic::Status::deadline_exceeded(format!(
|
||||
"request timed out after {:.3}s",
|
||||
@@ -79,7 +74,7 @@ impl Retry {
|
||||
match result {
|
||||
// Success, return the result.
|
||||
Ok(result) => {
|
||||
if retries > 0 || Self::LOG_SUCCESS {
|
||||
if retries > 0 {
|
||||
info!(
|
||||
"request succeeded after {retries} retries in {:.3}s",
|
||||
started.elapsed().as_secs_f64(),
|
||||
@@ -112,12 +107,13 @@ impl Retry {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the backoff duration for the given retry attempt, or None for no backoff.
|
||||
fn backoff_duration(retry: usize) -> Option<Duration> {
|
||||
/// Returns the backoff duration for the given retry attempt, or None for no backoff. The first
|
||||
/// attempt and first retry never backs off, so this returns None for 0 and 1 retries.
|
||||
fn backoff_duration(&self, retries: usize) -> Option<Duration> {
|
||||
let backoff = exponential_backoff_duration(
|
||||
retry as u32,
|
||||
Self::BASE_BACKOFF.as_secs_f64(),
|
||||
Self::MAX_BACKOFF.as_secs_f64(),
|
||||
(retries as u32).saturating_sub(1), // first retry does not back off
|
||||
self.base_backoff.as_secs_f64(),
|
||||
self.max_backoff.as_secs_f64(),
|
||||
);
|
||||
(!backoff.is_zero()).then_some(backoff)
|
||||
}
|
||||
|
||||
@@ -33,8 +33,6 @@ pub enum ProtocolError {
|
||||
Invalid(&'static str, String),
|
||||
#[error("required field '{0}' is missing")]
|
||||
Missing(&'static str),
|
||||
#[error("invalid combination of not_modified_lsn '{0}' and request_lsn '{1}'")]
|
||||
InvalidLsns(Lsn, Lsn),
|
||||
}
|
||||
|
||||
impl ProtocolError {
|
||||
@@ -51,7 +49,7 @@ impl From<ProtocolError> for tonic::Status {
|
||||
}
|
||||
|
||||
/// The LSN a request should read at.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct ReadLsn {
|
||||
/// The request's read LSN.
|
||||
pub request_lsn: Lsn,
|
||||
@@ -87,9 +85,9 @@ impl TryFrom<proto::ReadLsn> for ReadLsn {
|
||||
return Err(ProtocolError::invalid("request_lsn", pb.request_lsn));
|
||||
}
|
||||
if pb.not_modified_since_lsn > pb.request_lsn {
|
||||
return Err(ProtocolError::InvalidLsns(
|
||||
Lsn(pb.not_modified_since_lsn),
|
||||
Lsn(pb.request_lsn),
|
||||
return Err(ProtocolError::invalid(
|
||||
"not_modified_since_lsn",
|
||||
pb.not_modified_since_lsn,
|
||||
));
|
||||
}
|
||||
Ok(Self {
|
||||
@@ -331,7 +329,7 @@ impl From<GetDbSizeResponse> for proto::GetDbSizeResponse {
|
||||
}
|
||||
|
||||
/// Requests one or more pages.
|
||||
#[derive(Clone, Debug)]
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct GetPageRequest {
|
||||
/// A request ID. Will be included in the response. Should be unique for in-flight requests on
|
||||
/// the stream.
|
||||
@@ -432,12 +430,13 @@ impl From<RequestID> for proto::RequestId {
|
||||
}
|
||||
|
||||
/// A GetPage request class.
|
||||
#[derive(Clone, Copy, Debug, strum_macros::Display)]
|
||||
#[derive(Clone, Copy, Debug, Default, strum_macros::Display)]
|
||||
pub enum GetPageClass {
|
||||
/// Unknown class. For backwards compatibility: used when an older client version sends a class
|
||||
/// that a newer server version has removed.
|
||||
Unknown,
|
||||
/// A normal request. This is the default.
|
||||
#[default]
|
||||
Normal,
|
||||
/// A prefetch request. NB: can only be classified on pg < 18.
|
||||
Prefetch,
|
||||
@@ -445,19 +444,6 @@ pub enum GetPageClass {
|
||||
Background,
|
||||
}
|
||||
|
||||
impl GetPageClass {
|
||||
/// Returns true if this is considered a bulk request (i.e. more throughput-oriented rather than
|
||||
/// latency-sensitive).
|
||||
pub fn is_bulk(&self) -> bool {
|
||||
match self {
|
||||
Self::Unknown => false,
|
||||
Self::Normal => false,
|
||||
Self::Prefetch => true,
|
||||
Self::Background => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<proto::GetPageClass> for GetPageClass {
|
||||
fn from(pb: proto::GetPageClass) -> Self {
|
||||
match pb {
|
||||
|
||||
@@ -16,6 +16,7 @@ futures.workspace = true
|
||||
hdrhistogram.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
pprof.workspace = true
|
||||
rand.workspace = true
|
||||
reqwest.workspace = true
|
||||
serde.workspace = true
|
||||
@@ -24,9 +25,6 @@ tracing.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio-util.workspace = true
|
||||
axum.workspace = true
|
||||
http.workspace = true
|
||||
metrics.workspace = true
|
||||
tonic.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
|
||||
@@ -34,10 +34,6 @@ use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "false")]
|
||||
grpc: bool,
|
||||
#[clap(long, default_value = "false")]
|
||||
grpc_stream: bool,
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
/// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
|
||||
@@ -82,9 +78,6 @@ pub(crate) struct Args {
|
||||
#[clap(long)]
|
||||
set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,
|
||||
|
||||
#[clap(long)]
|
||||
only_relnode: Option<u32>,
|
||||
|
||||
/// Queue depth generated in each client.
|
||||
#[clap(long, default_value = "1")]
|
||||
queue_depth: NonZeroUsize,
|
||||
@@ -99,31 +92,10 @@ pub(crate) struct Args {
|
||||
#[clap(long, default_value = "1")]
|
||||
batch_size: NonZeroUsize,
|
||||
|
||||
#[clap(long)]
|
||||
only_relnode: Option<u32>,
|
||||
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
|
||||
#[clap(long, default_value = "100")]
|
||||
pool_max_consumers: NonZeroUsize,
|
||||
|
||||
#[clap(long, default_value = "5")]
|
||||
pool_error_threshold: NonZeroUsize,
|
||||
|
||||
#[clap(long, default_value = "5000")]
|
||||
pool_connect_timeout: NonZeroUsize,
|
||||
|
||||
#[clap(long, default_value = "1000")]
|
||||
pool_connect_backoff: NonZeroUsize,
|
||||
|
||||
#[clap(long, default_value = "60000")]
|
||||
pool_max_idle_duration: NonZeroUsize,
|
||||
|
||||
#[clap(long, default_value = "0")]
|
||||
max_delay_ms: usize,
|
||||
|
||||
#[clap(long, default_value = "0")]
|
||||
percent_drops: usize,
|
||||
|
||||
#[clap(long, default_value = "0")]
|
||||
percent_hangs: usize,
|
||||
}
|
||||
|
||||
/// State shared by all clients
|
||||
@@ -180,6 +152,7 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
main_impl(args, thread_local_stats)
|
||||
})
|
||||
}
|
||||
|
||||
async fn main_impl(
|
||||
args: Args,
|
||||
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
|
||||
@@ -344,7 +317,6 @@ async fn main_impl(
|
||||
let rps_period = args
|
||||
.per_client_rate
|
||||
.map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
|
||||
|
||||
let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
|
||||
let ss = shared_state.clone();
|
||||
let cancel = cancel.clone();
|
||||
|
||||
127
pageserver/pagebench/src/cmd/idle_streams.rs
Normal file
127
pageserver/pagebench/src/cmd/idle_streams.rs
Normal file
@@ -0,0 +1,127 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::anyhow;
|
||||
use futures::StreamExt;
|
||||
use tonic::transport::Endpoint;
|
||||
use tracing::info;
|
||||
|
||||
use pageserver_page_api::{GetPageClass, GetPageRequest, GetPageStatusCode, ReadLsn, RelTag};
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
/// Starts a large number of idle gRPC GetPage streams.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
/// The Pageserver to connect to. Must use grpc://.
|
||||
#[clap(long, default_value = "grpc://localhost:51051")]
|
||||
server: String,
|
||||
/// The Pageserver HTTP API.
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
http_server: String,
|
||||
/// The number of streams to open.
|
||||
#[clap(long, default_value = "100000")]
|
||||
count: usize,
|
||||
/// Number of streams per connection.
|
||||
#[clap(long, default_value = "100")]
|
||||
per_connection: usize,
|
||||
/// Send a single GetPage request on each stream.
|
||||
#[clap(long, default_value_t = false)]
|
||||
send_request: bool,
|
||||
}
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
rt.block_on(main_impl(args))
|
||||
}
|
||||
|
||||
async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
// Discover a tenant and timeline to use.
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
reqwest::Client::new(),
|
||||
args.http_server.clone(),
|
||||
None,
|
||||
));
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: Some(1),
|
||||
targets: None,
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let ttid = timelines
|
||||
.first()
|
||||
.ok_or_else(|| anyhow!("no timelines found"))?;
|
||||
|
||||
// Set up the initial client.
|
||||
let endpoint = Endpoint::from_shared(args.server.clone())?;
|
||||
|
||||
let connect = async || {
|
||||
pageserver_page_api::Client::new(
|
||||
endpoint.connect().await?,
|
||||
ttid.tenant_id,
|
||||
ttid.timeline_id,
|
||||
ShardIndex::unsharded(),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
};
|
||||
|
||||
let mut client = connect().await?;
|
||||
let mut streams = Vec::with_capacity(args.count);
|
||||
|
||||
// Create streams.
|
||||
for i in 0..args.count {
|
||||
if i % 100 == 0 {
|
||||
info!("opened {}/{} streams", i, args.count);
|
||||
}
|
||||
if i % args.per_connection == 0 && i > 0 {
|
||||
client = connect().await?;
|
||||
}
|
||||
|
||||
let (req_tx, req_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx);
|
||||
let mut resp_stream = client.get_pages(req_stream).await?;
|
||||
|
||||
// Send request if specified.
|
||||
if args.send_request {
|
||||
req_tx.send(GetPageRequest {
|
||||
request_id: 1.into(),
|
||||
request_class: GetPageClass::Normal,
|
||||
read_lsn: ReadLsn {
|
||||
request_lsn: Lsn::MAX,
|
||||
not_modified_since_lsn: Some(Lsn(1)),
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: 1664, // pg_global
|
||||
dbnode: 0, // shared database
|
||||
relnode: 1262, // pg_authid
|
||||
forknum: 0, // init
|
||||
},
|
||||
block_numbers: vec![0],
|
||||
})?;
|
||||
let resp = resp_stream
|
||||
.next()
|
||||
.await
|
||||
.transpose()?
|
||||
.ok_or_else(|| anyhow!("no response"))?;
|
||||
if resp.status_code != GetPageStatusCode::Ok {
|
||||
return Err(anyhow!("{} response", resp.status_code));
|
||||
}
|
||||
}
|
||||
|
||||
// Hold onto streams to avoid closing them.
|
||||
streams.push((req_tx, resp_stream));
|
||||
}
|
||||
|
||||
info!("opened {} streams, sleeping", args.count);
|
||||
|
||||
// Block forever, to hold the idle streams open for inspection.
|
||||
futures::future::pending::<()>().await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,4 +1,7 @@
|
||||
use std::fs::File;
|
||||
|
||||
use clap::Parser;
|
||||
use tracing::info;
|
||||
use utils::logging;
|
||||
|
||||
/// Re-usable pieces of code that aren't CLI-specific.
|
||||
@@ -17,38 +20,73 @@ mod cmd {
|
||||
pub(super) mod aux_files;
|
||||
pub(super) mod basebackup;
|
||||
pub(super) mod getpage_latest_lsn;
|
||||
pub(super) mod idle_streams;
|
||||
pub(super) mod ondemand_download_churn;
|
||||
pub(super) mod trigger_initial_size_calculation;
|
||||
}
|
||||
|
||||
/// Component-level performance test for pageserver.
|
||||
#[derive(clap::Parser)]
|
||||
enum Args {
|
||||
struct Args {
|
||||
/// Takes a client CPU profile into profile.svg. The benchmark must exit cleanly before it's
|
||||
/// written, e.g. via --runtime.
|
||||
#[arg(long)]
|
||||
profile: bool,
|
||||
|
||||
#[command(subcommand)]
|
||||
subcommand: Subcommand,
|
||||
}
|
||||
|
||||
#[derive(clap::Subcommand)]
|
||||
enum Subcommand {
|
||||
Basebackup(cmd::basebackup::Args),
|
||||
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
|
||||
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
|
||||
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
|
||||
AuxFiles(cmd::aux_files::Args),
|
||||
IdleStreams(cmd::idle_streams::Args),
|
||||
}
|
||||
|
||||
fn main() {
|
||||
fn main() -> anyhow::Result<()> {
|
||||
logging::init(
|
||||
logging::LogFormat::Plain,
|
||||
logging::TracingErrorLayerEnablement::Disabled,
|
||||
logging::Output::Stderr,
|
||||
)
|
||||
.unwrap();
|
||||
)?;
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
|
||||
let args = Args::parse();
|
||||
match args {
|
||||
Args::Basebackup(args) => cmd::basebackup::main(args),
|
||||
Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
|
||||
Args::TriggerInitialSizeCalculation(args) => {
|
||||
|
||||
// Start a CPU profile if requested.
|
||||
let mut profiler = None;
|
||||
if args.profile {
|
||||
profiler = Some(
|
||||
pprof::ProfilerGuardBuilder::default()
|
||||
.frequency(1000)
|
||||
.blocklist(&["libc", "libgcc", "pthread", "vdso"])
|
||||
.build()?,
|
||||
);
|
||||
}
|
||||
|
||||
match args.subcommand {
|
||||
Subcommand::Basebackup(args) => cmd::basebackup::main(args),
|
||||
Subcommand::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
|
||||
Subcommand::TriggerInitialSizeCalculation(args) => {
|
||||
cmd::trigger_initial_size_calculation::main(args)
|
||||
}
|
||||
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
|
||||
Args::AuxFiles(args) => cmd::aux_files::main(args),
|
||||
Subcommand::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
|
||||
Subcommand::AuxFiles(args) => cmd::aux_files::main(args),
|
||||
Subcommand::IdleStreams(args) => cmd::idle_streams::main(args),
|
||||
}?;
|
||||
|
||||
// Generate a CPU flamegraph if requested.
|
||||
if let Some(profiler) = profiler {
|
||||
let report = profiler.report().build()?;
|
||||
drop(profiler); // stop profiling
|
||||
let file = File::create("profile.svg")?;
|
||||
report.flamegraph(file)?;
|
||||
info!("wrote CPU profile flamegraph to profile.svg")
|
||||
}
|
||||
.unwrap()
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -114,7 +114,7 @@ where
|
||||
// Compute postgres doesn't have any previous WAL files, but the first
|
||||
// record that it's going to write needs to include the LSN of the
|
||||
// previous record (xl_prev). We include prev_record_lsn in the
|
||||
// "zenith.signal" file, so that postgres can read it during startup.
|
||||
// "neon.signal" file, so that postgres can read it during startup.
|
||||
//
|
||||
// We don't keep full history of record boundaries in the page server,
|
||||
// however, only the predecessor of the latest record on each
|
||||
@@ -751,34 +751,39 @@ where
|
||||
|
||||
//
|
||||
// Add generated pg_control file and bootstrap WAL segment.
|
||||
// Also send zenith.signal file with extra bootstrap data.
|
||||
// Also send neon.signal and zenith.signal file with extra bootstrap data.
|
||||
//
|
||||
async fn add_pgcontrol_file(
|
||||
&mut self,
|
||||
pg_control_bytes: Bytes,
|
||||
system_identifier: u64,
|
||||
) -> Result<(), BasebackupError> {
|
||||
// add zenith.signal file
|
||||
let mut zenith_signal = String::new();
|
||||
// add neon.signal file
|
||||
let mut neon_signal = String::new();
|
||||
if self.prev_record_lsn == Lsn(0) {
|
||||
if self.timeline.is_ancestor_lsn(self.lsn) {
|
||||
write!(zenith_signal, "PREV LSN: none")
|
||||
write!(neon_signal, "PREV LSN: none")
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
} else {
|
||||
write!(zenith_signal, "PREV LSN: invalid")
|
||||
write!(neon_signal, "PREV LSN: invalid")
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
}
|
||||
} else {
|
||||
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
|
||||
write!(neon_signal, "PREV LSN: {}", self.prev_record_lsn)
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
}
|
||||
self.ar
|
||||
.append(
|
||||
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
|
||||
zenith_signal.as_bytes(),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?;
|
||||
|
||||
// TODO: Remove zenith.signal once all historical computes have been replaced
|
||||
// ... and thus support the neon.signal file.
|
||||
for signalfilename in ["neon.signal", "zenith.signal"] {
|
||||
self.ar
|
||||
.append(
|
||||
&new_tar_header(signalfilename, neon_signal.len() as u64)?,
|
||||
neon_signal.as_bytes(),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,neon.signal"))?;
|
||||
}
|
||||
|
||||
//send pg_control
|
||||
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
|
||||
|
||||
@@ -917,11 +917,6 @@ async fn create_remote_storage_client(
|
||||
// If `test_remote_failures` is non-zero, wrap the client with a
|
||||
// wrapper that simulates failures.
|
||||
if conf.test_remote_failures > 0 {
|
||||
if !cfg!(feature = "testing") {
|
||||
anyhow::bail!(
|
||||
"test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"
|
||||
);
|
||||
}
|
||||
info!(
|
||||
"Simulating remote failures for first {} attempts of each op",
|
||||
conf.test_remote_failures
|
||||
|
||||
@@ -194,6 +194,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
|
||||
listen_http_port: m.http_port,
|
||||
listen_https_port: m.https_port,
|
||||
availability_zone_id: az_id.expect("Checked above"),
|
||||
node_ip_addr: None,
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
//! The validator is responsible for validating DeletionLists for execution,
|
||||
//! based on whethe the generation in the DeletionList is still the latest
|
||||
//! based on whether the generation in the DeletionList is still the latest
|
||||
//! generation for a tenant.
|
||||
//!
|
||||
//! The purpose of validation is to ensure split-brain safety in the cluster
|
||||
|
||||
@@ -10,6 +10,7 @@ use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result, anyhow};
|
||||
use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
use futures::future::join_all;
|
||||
use futures::{StreamExt, TryFutureExt};
|
||||
@@ -46,6 +47,7 @@ use pageserver_api::shard::{ShardCount, TenantShardId};
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
|
||||
use scopeguard::defer;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::json;
|
||||
use tenant_size_model::svg::SvgBranchKind;
|
||||
use tenant_size_model::{SizeResult, StorageModel};
|
||||
@@ -57,6 +59,7 @@ use utils::auth::SwappableJwtAuth;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context;
|
||||
@@ -77,6 +80,7 @@ use crate::tenant::remote_timeline_client::{
|
||||
};
|
||||
use crate::tenant::secondary::SecondaryController;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::ValuesReconstructState;
|
||||
use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
|
||||
@@ -397,6 +401,7 @@ async fn build_timeline_info(
|
||||
timeline: &Arc<Timeline>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
force_await_initial_logical_size: bool,
|
||||
include_image_consistent_lsn: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
@@ -421,6 +426,10 @@ async fn build_timeline_info(
|
||||
.await?,
|
||||
);
|
||||
}
|
||||
// HADRON
|
||||
if include_image_consistent_lsn {
|
||||
info.image_consistent_lsn = Some(timeline.compute_image_consistent_lsn().await?);
|
||||
}
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
@@ -510,6 +519,8 @@ async fn build_timeline_info_common(
|
||||
is_invisible: Some(is_invisible),
|
||||
|
||||
walreceiver_status,
|
||||
// HADRON
|
||||
image_consistent_lsn: None,
|
||||
};
|
||||
Ok(info)
|
||||
}
|
||||
@@ -712,6 +723,8 @@ async fn timeline_list_handler(
|
||||
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
||||
let force_await_initial_logical_size: Option<bool> =
|
||||
parse_query_param(&request, "force-await-initial-logical-size")?;
|
||||
let include_image_consistent_lsn: Option<bool> =
|
||||
parse_query_param(&request, "include-image-consistent-lsn")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
@@ -732,6 +745,7 @@ async fn timeline_list_handler(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size.unwrap_or(false),
|
||||
force_await_initial_logical_size.unwrap_or(false),
|
||||
include_image_consistent_lsn.unwrap_or(false),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
|
||||
@@ -760,6 +774,9 @@ async fn timeline_and_offloaded_list_handler(
|
||||
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
||||
let force_await_initial_logical_size: Option<bool> =
|
||||
parse_query_param(&request, "force-await-initial-logical-size")?;
|
||||
let include_image_consistent_lsn: Option<bool> =
|
||||
parse_query_param(&request, "include-image-consistent-lsn")?;
|
||||
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
@@ -780,6 +797,7 @@ async fn timeline_and_offloaded_list_handler(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size.unwrap_or(false),
|
||||
force_await_initial_logical_size.unwrap_or(false),
|
||||
include_image_consistent_lsn.unwrap_or(false),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
|
||||
@@ -964,6 +982,9 @@ async fn timeline_detail_handler(
|
||||
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
||||
let force_await_initial_logical_size: Option<bool> =
|
||||
parse_query_param(&request, "force-await-initial-logical-size")?;
|
||||
// HADRON
|
||||
let include_image_consistent_lsn: Option<bool> =
|
||||
parse_query_param(&request, "include-image-consistent-lsn")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
// Logical size calculation needs downloading.
|
||||
@@ -984,6 +1005,7 @@ async fn timeline_detail_handler(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size.unwrap_or(false),
|
||||
force_await_initial_logical_size.unwrap_or(false),
|
||||
include_image_consistent_lsn.unwrap_or(false),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -2690,6 +2712,16 @@ async fn deletion_queue_flush(
|
||||
}
|
||||
}
|
||||
|
||||
/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
struct GetPageResponse {
|
||||
pub page: Bytes,
|
||||
pub layers_visited: u32,
|
||||
pub delta_layers_visited: u32,
|
||||
pub records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pub img: Option<(Lsn, Bytes)>,
|
||||
}
|
||||
|
||||
async fn getpage_at_lsn_handler(
|
||||
request: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
@@ -2740,21 +2772,24 @@ async fn getpage_at_lsn_handler_inner(
|
||||
|
||||
// Use last_record_lsn if no lsn is provided
|
||||
let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
|
||||
let page = timeline.get(key.0, lsn, &ctx).await?;
|
||||
|
||||
if touch {
|
||||
json_response(StatusCode::OK, ())
|
||||
} else {
|
||||
Result::<_, ApiError>::Ok(
|
||||
Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(header::CONTENT_TYPE, "application/octet-stream")
|
||||
.body(hyper::Body::from(page))
|
||||
.unwrap(),
|
||||
)
|
||||
let mut reconstruct_state = ValuesReconstructState::new_with_debug(IoConcurrency::sequential());
|
||||
let page = timeline.debug_get(key.0, lsn, &ctx, &mut reconstruct_state).await?;
|
||||
let response = GetPageResponse {
|
||||
page,
|
||||
layers_visited: reconstruct_state.get_layers_visited(),
|
||||
delta_layers_visited: reconstruct_state.get_delta_layers_visited(),
|
||||
records: reconstruct_state.debug_state.records.clone(),
|
||||
img: reconstruct_state.debug_state.img.clone(),
|
||||
};
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
.instrument(info_span!("timeline_debug_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -3643,6 +3678,7 @@ async fn activate_post_import_handler(
|
||||
let timeline_info = build_timeline_info(
|
||||
&timeline, false, // include_non_incremental_logical_size,
|
||||
false, // force_await_initial_logical_size
|
||||
false, // include_image_consistent_lsn
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -4164,7 +4200,7 @@ pub fn make_router(
|
||||
})
|
||||
.get(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
|
||||
|r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
|
||||
|r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage",
|
||||
|
||||
@@ -610,13 +610,13 @@ async fn import_file(
|
||||
debug!("imported twophase file");
|
||||
} else if file_path.starts_with("pg_wal") {
|
||||
debug!("found wal file in base section. ignore it");
|
||||
} else if file_path.starts_with("zenith.signal") {
|
||||
} else if file_path.starts_with("zenith.signal") || file_path.starts_with("neon.signal") {
|
||||
// Parse zenith signal file to set correct previous LSN
|
||||
let bytes = read_all_bytes(reader).await?;
|
||||
// zenith.signal format is "PREV LSN: prev_lsn"
|
||||
// neon.signal format is "PREV LSN: prev_lsn"
|
||||
// TODO write serialization and deserialization in the same place.
|
||||
let zenith_signal = std::str::from_utf8(&bytes)?.trim();
|
||||
let prev_lsn = match zenith_signal {
|
||||
let neon_signal = std::str::from_utf8(&bytes)?.trim();
|
||||
let prev_lsn = match neon_signal {
|
||||
"PREV LSN: none" => Lsn(0),
|
||||
"PREV LSN: invalid" => Lsn(0),
|
||||
other => {
|
||||
@@ -624,17 +624,17 @@ async fn import_file(
|
||||
split[1]
|
||||
.trim()
|
||||
.parse::<Lsn>()
|
||||
.context("can't parse zenith.signal")?
|
||||
.context("can't parse neon.signal")?
|
||||
}
|
||||
};
|
||||
|
||||
// zenith.signal is not necessarily the last file, that we handle
|
||||
// neon.signal is not necessarily the last file, that we handle
|
||||
// but it is ok to call `finish_write()`, because final `modification.commit()`
|
||||
// will update lsn once more to the final one.
|
||||
let writer = modification.tline.writer().await;
|
||||
writer.finish_write(prev_lsn);
|
||||
|
||||
debug!("imported zenith signal {}", prev_lsn);
|
||||
debug!("imported neon signal {}", prev_lsn);
|
||||
} else if file_path.starts_with("pg_tblspc") {
|
||||
// TODO Backups exported from neon won't have pg_tblspc, but we will need
|
||||
// this to import arbitrary postgres databases.
|
||||
|
||||
@@ -3218,7 +3218,6 @@ where
|
||||
pub struct GrpcPageServiceHandler {
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
gate_guard: GateGuard,
|
||||
get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
}
|
||||
@@ -3271,7 +3270,6 @@ impl GrpcPageServiceHandler {
|
||||
let page_service_handler = GrpcPageServiceHandler {
|
||||
tenant_manager,
|
||||
ctx,
|
||||
cancel: cancel.clone(),
|
||||
gate_guard: gate.enter().expect("gate was just created"),
|
||||
get_vectored_concurrent_io,
|
||||
};
|
||||
@@ -3408,8 +3406,6 @@ impl GrpcPageServiceHandler {
|
||||
/// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
|
||||
/// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
|
||||
/// split them up in the client or server.
|
||||
///
|
||||
/// TODO: verify that the given keys belong to this shard.
|
||||
#[instrument(skip_all, fields(req_id, rel, blkno, blks, req_lsn, mod_lsn))]
|
||||
async fn get_page(
|
||||
ctx: &RequestContext,
|
||||
@@ -3731,7 +3727,6 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
// Spawn a task to handle the GetPageRequest stream.
|
||||
let span = Span::current();
|
||||
let ctx = self.ctx.attached_child();
|
||||
let cancel = self.cancel.clone();
|
||||
let mut reqs = req.into_inner();
|
||||
|
||||
let resps = async_stream::try_stream! {
|
||||
@@ -3739,18 +3734,8 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
.get(ttid.tenant_id, ttid.timeline_id, shard_selector)
|
||||
.await?
|
||||
.downgrade();
|
||||
|
||||
loop {
|
||||
let req = tokio::select! {
|
||||
req = reqs.message() => req,
|
||||
_ = cancel.cancelled() => {
|
||||
tracing::info!("closing getpages stream due to shutdown");
|
||||
break;
|
||||
},
|
||||
};
|
||||
let Some(req) = req? else { break };
|
||||
while let Some(req) = reqs.message().await? {
|
||||
let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default();
|
||||
|
||||
let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
|
||||
.instrument(span.clone()) // propagate request span
|
||||
.await;
|
||||
|
||||
@@ -25,9 +25,9 @@ use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
|
||||
use pageserver_api::models::RelSizeMigration;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::{BLCKSZ, PgMajorVersion, TimestampTz, TransactionId};
|
||||
use postgres_ffi::{BLCKSZ, PgMajorVersion, TransactionId};
|
||||
use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi_types::{Oid, RepOriginId};
|
||||
use postgres_ffi_types::{Oid, RepOriginId, TimestampTz};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
@@ -3393,7 +3393,13 @@ impl TenantShard {
|
||||
.collect_vec();
|
||||
|
||||
for timeline in timelines {
|
||||
timeline.maybe_freeze_ephemeral_layer().await;
|
||||
// Include a span with the timeline ID. The parent span already has the tenant ID.
|
||||
let span =
|
||||
info_span!("maybe_freeze_ephemeral_layer", timeline_id = %timeline.timeline_id);
|
||||
timeline
|
||||
.maybe_freeze_ephemeral_layer()
|
||||
.instrument(span)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12816,6 +12822,40 @@ mod tests {
|
||||
},
|
||||
]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_force_image_creation_lsn() -> anyhow::Result<()> {
|
||||
let tenant_conf = pageserver_api::models::TenantConfig {
|
||||
pitr_interval: Some(Duration::from_secs(7 * 3600)),
|
||||
image_layer_force_creation_period: Some(Duration::from_secs(3600)),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let tenant_id = TenantId::generate();
|
||||
|
||||
let harness = TenantHarness::create_custom(
|
||||
"test_get_force_image_creation_lsn",
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
ShardIdentity::unsharded(),
|
||||
Generation::new(1),
|
||||
)
|
||||
.await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
timeline.gc_info.write().unwrap().cutoffs.time = Some(Lsn(100));
|
||||
{
|
||||
let writer = timeline.writer().await;
|
||||
writer.finish_write(Lsn(5000));
|
||||
}
|
||||
|
||||
let image_creation_lsn = timeline.get_force_image_creation_lsn().unwrap();
|
||||
assert_eq!(image_creation_lsn, Lsn(4300));
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -46,10 +46,11 @@
|
||||
mod historic_layer_coverage;
|
||||
mod layer_coverage;
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::collections::{BTreeMap, HashMap, VecDeque};
|
||||
use std::iter::Peekable;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::Result;
|
||||
use historic_layer_coverage::BufferedHistoricLayerCoverage;
|
||||
@@ -904,6 +905,103 @@ impl LayerMap {
|
||||
max_stacked_deltas
|
||||
}
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
/**
|
||||
* Compute the image consistent LSN, the largest LSN below which all pages have been redone successfully.
|
||||
* It works by first finding the latest image layers and store them into a map. Then for each delta layer,
|
||||
* find all overlapping image layers in order to potentially increase the image LSN in case there are gaps
|
||||
* (e.g., if an image is created at LSN 100 but the delta layer spans LSN [150, 200], then we can increase
|
||||
* image LSN to 150 because there is no WAL record in between).
|
||||
* Finally, the image consistent LSN is computed by taking the minimum of all image layers.
|
||||
*/
|
||||
pub fn compute_image_consistent_lsn(&self, disk_consistent_lsn: Lsn) -> Lsn {
|
||||
struct ImageLayerInfo {
|
||||
// creation LSN of the image layer
|
||||
image_lsn: Lsn,
|
||||
// the current minimum LSN of newer delta layers with overlapping key ranges
|
||||
min_delta_lsn: Lsn,
|
||||
}
|
||||
let started_at = Instant::now();
|
||||
|
||||
let min_l0_deltas_lsn = {
|
||||
let l0_deltas = self.level0_deltas();
|
||||
l0_deltas
|
||||
.iter()
|
||||
.map(|layer| layer.get_lsn_range().start)
|
||||
.min()
|
||||
.unwrap_or(disk_consistent_lsn)
|
||||
};
|
||||
let global_key_range = Key::MIN..Key::MAX;
|
||||
|
||||
// step 1: collect all most recent image layers into a map
|
||||
// map: end key to image_layer_info
|
||||
let mut image_map: BTreeMap<Key, ImageLayerInfo> = BTreeMap::new();
|
||||
for (img_range, img) in self.image_coverage(&global_key_range, disk_consistent_lsn) {
|
||||
let img_lsn = img.map(|layer| layer.get_lsn_range().end).unwrap_or(Lsn(0));
|
||||
image_map.insert(
|
||||
img_range.end,
|
||||
ImageLayerInfo {
|
||||
image_lsn: img_lsn,
|
||||
min_delta_lsn: min_l0_deltas_lsn,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// step 2: go through all delta layers, and update the image layer info with overlapping
|
||||
// key ranges
|
||||
for layer in self.historic.iter() {
|
||||
if !layer.is_delta {
|
||||
continue;
|
||||
}
|
||||
let delta_key_range = layer.get_key_range();
|
||||
let delta_lsn_range = layer.get_lsn_range();
|
||||
for (img_end_key, img_info) in image_map.range_mut(delta_key_range.start..Key::MAX) {
|
||||
debug_assert!(img_end_key >= &delta_key_range.start);
|
||||
if delta_lsn_range.end > img_info.image_lsn {
|
||||
// the delta layer includes WAL records after the image
|
||||
// it's possibel that the delta layer's start LSN < image LSN, which will be simply ignored by step 3
|
||||
img_info.min_delta_lsn =
|
||||
std::cmp::min(img_info.min_delta_lsn, delta_lsn_range.start);
|
||||
}
|
||||
if img_end_key >= &delta_key_range.end {
|
||||
// we have fully processed all overlapping image layers
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// step 3, go through all image layers and find the image consistent LSN
|
||||
let mut img_consistent_lsn = min_l0_deltas_lsn.checked_sub(Lsn(1)).unwrap();
|
||||
let mut prev_key = Key::MIN;
|
||||
for (img_key, img_info) in image_map {
|
||||
tracing::debug!(
|
||||
"Image layer {:?}:{} has min delta lsn {}",
|
||||
Range {
|
||||
start: prev_key,
|
||||
end: img_key,
|
||||
},
|
||||
img_info.image_lsn,
|
||||
img_info.min_delta_lsn,
|
||||
);
|
||||
let image_lsn = std::cmp::max(
|
||||
img_info.image_lsn,
|
||||
img_info.min_delta_lsn.checked_sub(Lsn(1)).unwrap_or(Lsn(0)),
|
||||
);
|
||||
img_consistent_lsn = std::cmp::min(img_consistent_lsn, image_lsn);
|
||||
prev_key = img_key;
|
||||
}
|
||||
tracing::info!(
|
||||
"computed image_consistent_lsn {} for disk_consistent_lsn {} in {}ms. Processed {} layrs in total.",
|
||||
img_consistent_lsn,
|
||||
disk_consistent_lsn,
|
||||
started_at.elapsed().as_millis(),
|
||||
self.historic.len()
|
||||
);
|
||||
img_consistent_lsn
|
||||
}
|
||||
|
||||
/* END_HADRON */
|
||||
|
||||
/// Return all L0 delta layers
|
||||
pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
|
||||
&self.l0_delta_layers
|
||||
@@ -1579,6 +1677,138 @@ mod tests {
|
||||
LayerVisibilityHint::Visible
|
||||
));
|
||||
}
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
#[test]
|
||||
fn test_compute_image_consistent_lsn() {
|
||||
let mut layer_map = LayerMap::default();
|
||||
|
||||
let disk_consistent_lsn = Lsn(1000);
|
||||
// case 1: empty layer map
|
||||
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
|
||||
assert_eq!(
|
||||
disk_consistent_lsn.checked_sub(Lsn(1)).unwrap(),
|
||||
image_consistent_lsn
|
||||
);
|
||||
|
||||
// case 2: only L0 delta layer
|
||||
{
|
||||
let mut updates = layer_map.batch_update();
|
||||
updates.insert_historic(PersistentLayerDesc::new_test(
|
||||
Key::from_i128(0)..Key::from_i128(100),
|
||||
Lsn(900)..Lsn(990),
|
||||
true,
|
||||
));
|
||||
|
||||
updates.insert_historic(PersistentLayerDesc::new_test(
|
||||
Key::from_i128(0)..Key::from_i128(100),
|
||||
Lsn(850)..Lsn(899),
|
||||
true,
|
||||
));
|
||||
}
|
||||
|
||||
// should use min L0 delta LSN - 1 as image consistent LSN
|
||||
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
|
||||
assert_eq!(Lsn(849), image_consistent_lsn);
|
||||
|
||||
// case 3: 3 images, no L1 delta
|
||||
{
|
||||
let mut updates = layer_map.batch_update();
|
||||
updates.insert_historic(PersistentLayerDesc::new_test(
|
||||
Key::from_i128(0)..Key::from_i128(40),
|
||||
Lsn(100)..Lsn(100),
|
||||
false,
|
||||
));
|
||||
|
||||
updates.insert_historic(PersistentLayerDesc::new_test(
|
||||
Key::from_i128(40)..Key::from_i128(70),
|
||||
Lsn(200)..Lsn(200),
|
||||
false,
|
||||
));
|
||||
|
||||
updates.insert_historic(PersistentLayerDesc::new_test(
|
||||
Key::from_i128(70)..Key::from_i128(100),
|
||||
Lsn(150)..Lsn(150),
|
||||
false,
|
||||
));
|
||||
}
|
||||
// should use min L0 delta LSN - 1 as image consistent LSN
|
||||
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
|
||||
assert_eq!(Lsn(849), image_consistent_lsn);
|
||||
|
||||
// case 4: 3 images with 1 L1 delta
|
||||
{
|
||||
let mut updates = layer_map.batch_update();
|
||||
updates.insert_historic(PersistentLayerDesc::new_test(
|
||||
Key::from_i128(0)..Key::from_i128(50),
|
||||
Lsn(300)..Lsn(350),
|
||||
true,
|
||||
));
|
||||
}
|
||||
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
|
||||
assert_eq!(Lsn(299), image_consistent_lsn);
|
||||
|
||||
// case 5: 3 images with 1 more L1 delta with smaller LSN
|
||||
{
|
||||
let mut updates = layer_map.batch_update();
|
||||
updates.insert_historic(PersistentLayerDesc::new_test(
|
||||
Key::from_i128(50)..Key::from_i128(72),
|
||||
Lsn(200)..Lsn(300),
|
||||
true,
|
||||
));
|
||||
}
|
||||
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
|
||||
assert_eq!(Lsn(199), image_consistent_lsn);
|
||||
|
||||
// case 6: 3 images with more newer L1 deltas (no impact on final results)
|
||||
{
|
||||
let mut updates = layer_map.batch_update();
|
||||
updates.insert_historic(PersistentLayerDesc::new_test(
|
||||
Key::from_i128(0)..Key::from_i128(30),
|
||||
Lsn(400)..Lsn(500),
|
||||
true,
|
||||
));
|
||||
updates.insert_historic(PersistentLayerDesc::new_test(
|
||||
Key::from_i128(35)..Key::from_i128(100),
|
||||
Lsn(450)..Lsn(600),
|
||||
true,
|
||||
));
|
||||
}
|
||||
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
|
||||
assert_eq!(Lsn(199), image_consistent_lsn);
|
||||
|
||||
// case 7: 3 images with more older L1 deltas (no impact on final results)
|
||||
{
|
||||
let mut updates = layer_map.batch_update();
|
||||
updates.insert_historic(PersistentLayerDesc::new_test(
|
||||
Key::from_i128(0)..Key::from_i128(40),
|
||||
Lsn(0)..Lsn(50),
|
||||
true,
|
||||
));
|
||||
|
||||
updates.insert_historic(PersistentLayerDesc::new_test(
|
||||
Key::from_i128(50)..Key::from_i128(100),
|
||||
Lsn(10)..Lsn(60),
|
||||
true,
|
||||
));
|
||||
}
|
||||
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
|
||||
assert_eq!(Lsn(199), image_consistent_lsn);
|
||||
|
||||
// case 8: 3 images with one more L1 delta with overlapping LSN range
|
||||
{
|
||||
let mut updates = layer_map.batch_update();
|
||||
updates.insert_historic(PersistentLayerDesc::new_test(
|
||||
Key::from_i128(0)..Key::from_i128(50),
|
||||
Lsn(50)..Lsn(250),
|
||||
true,
|
||||
));
|
||||
}
|
||||
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
|
||||
assert_eq!(Lsn(100), image_consistent_lsn);
|
||||
}
|
||||
|
||||
/* END_HADRON */
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1678,6 +1678,8 @@ impl TenantManager {
|
||||
// Phase 6: Release the InProgress on the parent shard
|
||||
drop(parent_slot_guard);
|
||||
|
||||
utils::pausable_failpoint!("shard-split-post-finish-pause");
|
||||
|
||||
Ok(child_shards)
|
||||
}
|
||||
|
||||
|
||||
@@ -75,7 +75,7 @@ where
|
||||
/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
|
||||
/// call, to collect more records.
|
||||
///
|
||||
#[derive(Debug, Default)]
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub(crate) struct ValueReconstructState {
|
||||
pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pub(crate) img: Option<(Lsn, Bytes)>,
|
||||
@@ -308,6 +308,9 @@ pub struct ValuesReconstructState {
|
||||
layers_visited: u32,
|
||||
delta_layers_visited: u32,
|
||||
|
||||
pub(crate) enable_debug: bool,
|
||||
pub(crate) debug_state: ValueReconstructState,
|
||||
|
||||
pub(crate) io_concurrency: IoConcurrency,
|
||||
num_active_ios: Arc<AtomicUsize>,
|
||||
|
||||
@@ -657,6 +660,23 @@ impl ValuesReconstructState {
|
||||
layers_visited: 0,
|
||||
delta_layers_visited: 0,
|
||||
io_concurrency,
|
||||
enable_debug: false,
|
||||
debug_state: ValueReconstructState::default(),
|
||||
num_active_ios: Arc::new(AtomicUsize::new(0)),
|
||||
read_path: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new_with_debug(io_concurrency: IoConcurrency) -> Self {
|
||||
Self {
|
||||
keys: HashMap::new(),
|
||||
keys_done: KeySpaceRandomAccum::new(),
|
||||
keys_with_image_coverage: None,
|
||||
layers_visited: 0,
|
||||
delta_layers_visited: 0,
|
||||
io_concurrency,
|
||||
enable_debug: true,
|
||||
debug_state: ValueReconstructState::default(),
|
||||
num_active_ios: Arc::new(AtomicUsize::new(0)),
|
||||
read_path: None,
|
||||
}
|
||||
@@ -670,6 +690,12 @@ impl ValuesReconstructState {
|
||||
self.io_concurrency.spawn_io(fut).await;
|
||||
}
|
||||
|
||||
pub(crate) fn set_debug_state(&mut self, debug_state: &ValueReconstructState) {
|
||||
if self.enable_debug {
|
||||
self.debug_state = debug_state.clone();
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
|
||||
self.layers_visited += 1;
|
||||
if let ReadableLayer::PersistentLayer(layer) = layer {
|
||||
|
||||
@@ -351,13 +351,6 @@ pub struct Timeline {
|
||||
last_image_layer_creation_check_at: AtomicLsn,
|
||||
last_image_layer_creation_check_instant: std::sync::Mutex<Option<Instant>>,
|
||||
|
||||
// HADRON
|
||||
/// If a key range has writes with LSN > force_image_creation_lsn, then we should force image layer creation
|
||||
/// on this key range.
|
||||
force_image_creation_lsn: AtomicLsn,
|
||||
/// The last time instant when force_image_creation_lsn is computed.
|
||||
force_image_creation_lsn_computed_at: std::sync::Mutex<Option<Instant>>,
|
||||
|
||||
/// Current logical size of the "datadir", at the last LSN.
|
||||
current_logical_size: LogicalSize,
|
||||
|
||||
@@ -1260,6 +1253,57 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) async fn debug_get(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
if !lsn.is_valid() {
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
|
||||
}
|
||||
|
||||
// This check is debug-only because of the cost of hashing, and because it's a double-check: we
|
||||
// already checked the key against the shard_identity when looking up the Timeline from
|
||||
// page_service.
|
||||
debug_assert!(!self.shard_identity.is_key_disposable(&key));
|
||||
|
||||
let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
|
||||
let vectored_res = self
|
||||
.debug_get_vectored_impl(query, reconstruct_state, ctx)
|
||||
.await;
|
||||
|
||||
let key_value = vectored_res?.pop_first();
|
||||
match key_value {
|
||||
Some((got_key, value)) => {
|
||||
if got_key != key {
|
||||
error!(
|
||||
"Expected {}, but singular vectored get returned {}",
|
||||
key, got_key
|
||||
);
|
||||
Err(PageReconstructError::Other(anyhow!(
|
||||
"Singular vectored get returned wrong key"
|
||||
)))
|
||||
} else {
|
||||
value
|
||||
}
|
||||
}
|
||||
None => Err(PageReconstructError::MissingKey(Box::new(
|
||||
MissingKeyError {
|
||||
keyspace: KeySpace::single(key..key.next()),
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
original_hwm_lsn: lsn,
|
||||
ancestor_lsn: None,
|
||||
backtrace: None,
|
||||
read_path: None,
|
||||
query: None,
|
||||
},
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100;
|
||||
|
||||
/// Look up multiple page versions at a given LSN
|
||||
@@ -1554,6 +1598,98 @@ impl Timeline {
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
// A copy of the get_vectored_impl method except that we store the image and wal records into `reconstruct_state`.
|
||||
// This is only used in the http getpage call for debugging purpose.
|
||||
pub(super) async fn debug_get_vectored_impl(
|
||||
&self,
|
||||
query: VersionedKeySpaceQuery,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
if query.is_empty() {
|
||||
return Ok(BTreeMap::default());
|
||||
}
|
||||
|
||||
let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
|
||||
Some(ReadPath::new(
|
||||
query.total_keyspace(),
|
||||
query.high_watermark_lsn()?,
|
||||
))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
reconstruct_state.read_path = read_path;
|
||||
|
||||
let traversal_res: Result<(), _> = self
|
||||
.get_vectored_reconstruct_data(query.clone(), reconstruct_state, ctx)
|
||||
.await;
|
||||
|
||||
if let Err(err) = traversal_res {
|
||||
// Wait for all the spawned IOs to complete.
|
||||
// See comments on `spawn_io` inside `storage_layer` for more details.
|
||||
let mut collect_futs = std::mem::take(&mut reconstruct_state.keys)
|
||||
.into_values()
|
||||
.map(|state| state.collect_pending_ios())
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
while collect_futs.next().await.is_some() {}
|
||||
return Err(err);
|
||||
};
|
||||
|
||||
let reconstruct_state = Arc::new(Mutex::new(reconstruct_state));
|
||||
let futs = FuturesUnordered::new();
|
||||
|
||||
for (key, state) in std::mem::take(&mut reconstruct_state.lock().unwrap().keys) {
|
||||
let req_lsn_for_key = query.map_key_to_lsn(&key);
|
||||
futs.push({
|
||||
let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
|
||||
let rc_clone = Arc::clone(&reconstruct_state);
|
||||
|
||||
async move {
|
||||
assert_eq!(state.situation, ValueReconstructSituation::Complete);
|
||||
|
||||
let converted = match state.collect_pending_ios().await {
|
||||
Ok(ok) => ok,
|
||||
Err(err) => {
|
||||
return (key, Err(err));
|
||||
}
|
||||
};
|
||||
DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64);
|
||||
|
||||
// The walredo module expects the records to be descending in terms of Lsn.
|
||||
// And we submit the IOs in that order, so, there shuold be no need to sort here.
|
||||
debug_assert!(
|
||||
converted
|
||||
.records
|
||||
.is_sorted_by_key(|(lsn, _)| std::cmp::Reverse(*lsn)),
|
||||
"{converted:?}"
|
||||
);
|
||||
{
|
||||
let mut guard = rc_clone.lock().unwrap();
|
||||
guard.set_debug_state(&converted);
|
||||
}
|
||||
(
|
||||
key,
|
||||
walredo_self
|
||||
.reconstruct_value(
|
||||
key,
|
||||
req_lsn_for_key,
|
||||
converted,
|
||||
RedoAttemptType::ReadPage,
|
||||
)
|
||||
.await,
|
||||
)
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
let results = futs
|
||||
.collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>()
|
||||
.await;
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
|
||||
pub(crate) fn get_last_record_lsn(&self) -> Lsn {
|
||||
self.last_record_lsn.load().last
|
||||
@@ -1900,6 +2036,8 @@ impl Timeline {
|
||||
// an ephemeral layer open forever when idle. It also freezes layers if the global limit on
|
||||
// ephemeral layer bytes has been breached.
|
||||
pub(super) async fn maybe_freeze_ephemeral_layer(&self) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let Ok(mut write_guard) = self.write_lock.try_lock() else {
|
||||
// If the write lock is held, there is an active wal receiver: rolling open layers
|
||||
// is their responsibility while they hold this lock.
|
||||
@@ -2854,7 +2992,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// HADRON
|
||||
fn get_image_creation_timeout(&self) -> Option<Duration> {
|
||||
fn get_image_layer_force_creation_period(&self) -> Option<Duration> {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
@@ -3134,9 +3272,6 @@ impl Timeline {
|
||||
repartition_threshold: 0,
|
||||
last_image_layer_creation_check_at: AtomicLsn::new(0),
|
||||
last_image_layer_creation_check_instant: Mutex::new(None),
|
||||
// HADRON
|
||||
force_image_creation_lsn: AtomicLsn::new(0),
|
||||
force_image_creation_lsn_computed_at: std::sync::Mutex::new(None),
|
||||
last_received_wal: Mutex::new(None),
|
||||
rel_size_latest_cache: RwLock::new(HashMap::new()),
|
||||
rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)),
|
||||
@@ -5381,13 +5516,16 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// HADRON
|
||||
// for child timelines, we consider all pages up to ancestor_LSN are redone successfully by the parent timeline
|
||||
min_image_lsn = min_image_lsn.max(self.get_ancestor_lsn());
|
||||
if min_image_lsn < force_image_creation_lsn.unwrap_or(Lsn(0)) && max_deltas > 0 {
|
||||
info!(
|
||||
"forcing image creation for partitioned range {}-{}. Min image LSN: {}, force image creation LSN: {}",
|
||||
"forcing image creation for partitioned range {}-{}. Min image LSN: {}, force image creation LSN: {}, num deltas: {}",
|
||||
partition.ranges[0].start,
|
||||
partition.ranges[0].end,
|
||||
min_image_lsn,
|
||||
force_image_creation_lsn.unwrap()
|
||||
force_image_creation_lsn.unwrap(),
|
||||
max_deltas
|
||||
);
|
||||
return true;
|
||||
}
|
||||
@@ -5611,10 +5749,11 @@ impl Timeline {
|
||||
/// Predicate function which indicates whether we should check if new image layers
|
||||
/// are required. Since checking if new image layers are required is expensive in
|
||||
/// terms of CPU, we only do it in the following cases:
|
||||
/// 1. If the timeline has ingested sufficient WAL to justify the cost
|
||||
/// 1. If the timeline has ingested sufficient WAL to justify the cost or ...
|
||||
/// 2. If enough time has passed since the last check:
|
||||
/// 1. For large tenants, we wish to perform the check more often since they
|
||||
/// suffer from the lack of image layers
|
||||
/// suffer from the lack of image layers. Note that we assume sharded tenants
|
||||
/// to be large since non-zero shards do not track the logical size.
|
||||
/// 2. For small tenants (that can mostly fit in RAM), we use a much longer interval
|
||||
fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
|
||||
let large_timeline_threshold = self.conf.image_layer_generation_large_timeline_threshold;
|
||||
@@ -5628,30 +5767,39 @@ impl Timeline {
|
||||
|
||||
let distance_based_decision = distance.0 >= min_distance;
|
||||
|
||||
let mut time_based_decision = false;
|
||||
let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap();
|
||||
if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() {
|
||||
let check_required_after =
|
||||
if Some(Into::<u64>::into(&logical_size)) >= large_timeline_threshold {
|
||||
self.get_checkpoint_timeout()
|
||||
} else {
|
||||
Duration::from_secs(3600 * 48)
|
||||
};
|
||||
|
||||
time_based_decision = match *last_check_instant {
|
||||
Some(last_check) => {
|
||||
let elapsed = last_check.elapsed();
|
||||
elapsed >= check_required_after
|
||||
let check_required_after = (|| {
|
||||
if self.shard_identity.is_unsharded() {
|
||||
if let CurrentLogicalSize::Exact(logical_size) =
|
||||
self.current_logical_size.current_size()
|
||||
{
|
||||
if Some(Into::<u64>::into(&logical_size)) < large_timeline_threshold {
|
||||
return Duration::from_secs(3600 * 48);
|
||||
}
|
||||
}
|
||||
None => true,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
self.get_checkpoint_timeout()
|
||||
})();
|
||||
|
||||
let time_based_decision = match *last_check_instant {
|
||||
Some(last_check) => {
|
||||
let elapsed = last_check.elapsed();
|
||||
elapsed >= check_required_after
|
||||
}
|
||||
None => true,
|
||||
};
|
||||
|
||||
// Do the expensive delta layer counting only if this timeline has ingested sufficient
|
||||
// WAL since the last check or a checkpoint timeout interval has elapsed since the last
|
||||
// check.
|
||||
let decision = distance_based_decision || time_based_decision;
|
||||
|
||||
tracing::info!(
|
||||
"Decided to check image layers: {}. Distance-based decision: {}, time-based decision: {}",
|
||||
decision,
|
||||
distance_based_decision,
|
||||
time_based_decision
|
||||
);
|
||||
if decision {
|
||||
self.last_image_layer_creation_check_at.store(lsn);
|
||||
*last_check_instant = Some(Instant::now());
|
||||
@@ -7153,6 +7301,19 @@ impl Timeline {
|
||||
.unwrap()
|
||||
.clone()
|
||||
}
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
pub(crate) async fn compute_image_consistent_lsn(&self) -> anyhow::Result<Lsn> {
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::ComputeImageConsistentLsn)
|
||||
.await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
let disk_consistent_lsn = self.get_disk_consistent_lsn();
|
||||
|
||||
Ok(layer_map.compute_image_consistent_lsn(disk_consistent_lsn))
|
||||
}
|
||||
/* END_HADRON */
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
|
||||
@@ -8,7 +8,7 @@ use std::cmp::min;
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use super::layer_manager::LayerManagerLockHolder;
|
||||
use super::{
|
||||
@@ -34,7 +34,6 @@ use pageserver_api::models::{CompactInfoResponse, CompactKeyRange};
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
|
||||
use pageserver_compaction::helpers::{fully_contains, overlaps_with};
|
||||
use pageserver_compaction::interface::*;
|
||||
use postgres_ffi::to_pg_timestamp;
|
||||
use serde::Serialize;
|
||||
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -47,7 +46,6 @@ use wal_decoder::models::value::Value;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache;
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::statvfs::Statvfs;
|
||||
use crate::tenant::checks::check_valid_layermap;
|
||||
use crate::tenant::gc_block::GcBlock;
|
||||
@@ -1271,10 +1269,7 @@ impl Timeline {
|
||||
// Define partitioning schema if needed
|
||||
|
||||
// HADRON
|
||||
let force_image_creation_lsn = self
|
||||
.get_or_compute_force_image_creation_lsn(cancel, ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
let force_image_creation_lsn = self.get_force_image_creation_lsn();
|
||||
|
||||
// 1. L0 Compact
|
||||
let l0_outcome = {
|
||||
@@ -1484,59 +1479,37 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
// Get the force image creation LSN. Compute it if the last computed LSN is too old.
|
||||
async fn get_or_compute_force_image_creation_lsn(
|
||||
self: &Arc<Self>,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<Lsn>> {
|
||||
const FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL: Duration = Duration::from_secs(10 * 60); // 10 minutes
|
||||
let image_layer_force_creation_period = self.get_image_creation_timeout();
|
||||
if image_layer_force_creation_period.is_none() {
|
||||
return Ok(None);
|
||||
// Get the force image creation LSN based on gc_cutoff_lsn.
|
||||
// Note that this is an estimation and the workload rate may suddenly change. When that happens,
|
||||
// the force image creation may be too early or too late, but eventually it should be able to catch up.
|
||||
pub(crate) fn get_force_image_creation_lsn(self: &Arc<Self>) -> Option<Lsn> {
|
||||
let image_creation_period = self.get_image_layer_force_creation_period()?;
|
||||
let current_lsn = self.get_last_record_lsn();
|
||||
let pitr_lsn = self.gc_info.read().unwrap().cutoffs.time?;
|
||||
let pitr_interval = self.get_pitr_interval();
|
||||
if pitr_lsn == Lsn::INVALID || pitr_interval.is_zero() {
|
||||
tracing::warn!(
|
||||
"pitr LSN/interval not found, skipping force image creation LSN calculation"
|
||||
);
|
||||
return None;
|
||||
}
|
||||
|
||||
let image_layer_force_creation_period = image_layer_force_creation_period.unwrap();
|
||||
let force_image_creation_lsn_computed_at =
|
||||
*self.force_image_creation_lsn_computed_at.lock().unwrap();
|
||||
if force_image_creation_lsn_computed_at.is_none()
|
||||
|| force_image_creation_lsn_computed_at.unwrap().elapsed()
|
||||
> FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL
|
||||
{
|
||||
let now: SystemTime = SystemTime::now();
|
||||
let timestamp = now
|
||||
.checked_sub(image_layer_force_creation_period)
|
||||
.ok_or_else(|| {
|
||||
anyhow::anyhow!(
|
||||
"image creation timeout is too large: {image_layer_force_creation_period:?}"
|
||||
)
|
||||
})?;
|
||||
let timestamp = to_pg_timestamp(timestamp);
|
||||
let force_image_creation_lsn = match self
|
||||
.find_lsn_for_timestamp(timestamp, cancel, ctx)
|
||||
.await?
|
||||
{
|
||||
LsnForTimestamp::Present(lsn) | LsnForTimestamp::Future(lsn) => lsn,
|
||||
_ => {
|
||||
let gc_lsn = *self.get_applied_gc_cutoff_lsn();
|
||||
tracing::info!(
|
||||
"no LSN found for timestamp {timestamp:?}, using latest GC cutoff LSN {}",
|
||||
gc_lsn
|
||||
);
|
||||
gc_lsn
|
||||
}
|
||||
};
|
||||
self.force_image_creation_lsn
|
||||
.store(force_image_creation_lsn);
|
||||
*self.force_image_creation_lsn_computed_at.lock().unwrap() = Some(Instant::now());
|
||||
tracing::info!(
|
||||
"computed force image creation LSN: {}",
|
||||
force_image_creation_lsn
|
||||
);
|
||||
Ok(Some(force_image_creation_lsn))
|
||||
} else {
|
||||
Ok(Some(self.force_image_creation_lsn.load()))
|
||||
}
|
||||
let delta_lsn = current_lsn.checked_sub(pitr_lsn).unwrap().0
|
||||
* image_creation_period.as_secs()
|
||||
/ pitr_interval.as_secs();
|
||||
let force_image_creation_lsn = current_lsn.checked_sub(delta_lsn).unwrap_or(Lsn(0));
|
||||
|
||||
tracing::info!(
|
||||
"Tenant shard {} computed force_image_creation_lsn: {}. Current lsn: {}, image_layer_force_creation_period: {:?}, GC cutoff: {}, PITR interval: {:?}",
|
||||
self.tenant_shard_id,
|
||||
force_image_creation_lsn,
|
||||
current_lsn,
|
||||
image_creation_period,
|
||||
pitr_lsn,
|
||||
pitr_interval
|
||||
);
|
||||
|
||||
Some(force_image_creation_lsn)
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
|
||||
@@ -359,14 +359,14 @@ impl<T: Types> Cache<T> {
|
||||
Err(e) => {
|
||||
// Retry on tenant manager error to handle tenant split more gracefully
|
||||
if attempt < GET_MAX_RETRIES {
|
||||
tracing::warn!(
|
||||
"Fail to resolve tenant shard in attempt {}: {:?}. Retrying...",
|
||||
attempt,
|
||||
e
|
||||
);
|
||||
tokio::time::sleep(RETRY_BACKOFF).await;
|
||||
continue;
|
||||
} else {
|
||||
tracing::warn!(
|
||||
"Failed to resolve tenant shard after {} attempts: {:?}",
|
||||
GET_MAX_RETRIES,
|
||||
e
|
||||
);
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,6 +47,7 @@ pub(crate) enum LayerManagerLockHolder {
|
||||
ImportPgData,
|
||||
DetachAncestor,
|
||||
Eviction,
|
||||
ComputeImageConsistentLsn,
|
||||
#[cfg(test)]
|
||||
Testing,
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//! An utilization metric which is used to decide on which pageserver to put next tenant.
|
||||
//!
|
||||
//! The metric is exposed via `GET /v1/utilization`. Refer and maintain it's openapi spec as the
|
||||
//! The metric is exposed via `GET /v1/utilization`. Refer and maintain its openapi spec as the
|
||||
//! truth.
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
@@ -32,9 +32,10 @@ use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::walrecord::*;
|
||||
use postgres_ffi::{
|
||||
PgMajorVersion, TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion,
|
||||
enum_pgversion_dispatch, fsm_logical_to_physical, pg_constants,
|
||||
PgMajorVersion, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch,
|
||||
fsm_logical_to_physical, pg_constants,
|
||||
};
|
||||
use postgres_ffi_types::TimestampTz;
|
||||
use postgres_ffi_types::forknum::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use tracing::*;
|
||||
use utils::bin_ser::{DeserializeError, SerializeError};
|
||||
@@ -1069,7 +1070,7 @@ impl WalIngest {
|
||||
// NB: In PostgreSQL, the next-multi-xid stored in the control file is allowed to
|
||||
// go to 0, and it's fixed up by skipping to FirstMultiXactId in functions that
|
||||
// read it, like GetNewMultiXactId(). This is different from how nextXid is
|
||||
// incremented! nextXid skips over < FirstNormalTransactionId when the the value
|
||||
// incremented! nextXid skips over < FirstNormalTransactionId when the value
|
||||
// is stored, so it's never 0 in a checkpoint.
|
||||
//
|
||||
// I don't know why it's done that way, it seems less error-prone to skip over 0
|
||||
|
||||
@@ -147,6 +147,16 @@ pub enum RedoAttemptType {
|
||||
GcCompaction,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for RedoAttemptType {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
RedoAttemptType::ReadPage => write!(f, "read page"),
|
||||
RedoAttemptType::LegacyCompaction => write!(f, "legacy compaction"),
|
||||
RedoAttemptType::GcCompaction => write!(f, "gc compaction"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Public interface of WAL redo manager
|
||||
///
|
||||
@@ -199,6 +209,7 @@ impl PostgresRedoManager {
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
max_retry_attempts,
|
||||
redo_attempt_type,
|
||||
)
|
||||
.await
|
||||
};
|
||||
@@ -221,6 +232,7 @@ impl PostgresRedoManager {
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
max_retry_attempts,
|
||||
redo_attempt_type,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -445,6 +457,7 @@ impl PostgresRedoManager {
|
||||
wal_redo_timeout: Duration,
|
||||
pg_version: PgMajorVersion,
|
||||
max_retry_attempts: u32,
|
||||
redo_attempt_type: RedoAttemptType,
|
||||
) -> Result<Bytes, Error> {
|
||||
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
|
||||
|
||||
@@ -485,17 +498,28 @@ impl PostgresRedoManager {
|
||||
);
|
||||
|
||||
if let Err(e) = result.as_ref() {
|
||||
error!(
|
||||
"error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
|
||||
records.len(),
|
||||
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
nbytes,
|
||||
base_img_lsn,
|
||||
lsn,
|
||||
n_attempts,
|
||||
e,
|
||||
);
|
||||
macro_rules! message {
|
||||
($level:tt) => {
|
||||
$level!(
|
||||
"error applying {} WAL records {}..{} ({} bytes) to key {} during {}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
|
||||
records.len(),
|
||||
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
nbytes,
|
||||
key,
|
||||
redo_attempt_type,
|
||||
base_img_lsn,
|
||||
lsn,
|
||||
n_attempts,
|
||||
e,
|
||||
)
|
||||
}
|
||||
}
|
||||
match redo_attempt_type {
|
||||
RedoAttemptType::ReadPage => message!(error),
|
||||
RedoAttemptType::LegacyCompaction => message!(error),
|
||||
RedoAttemptType::GcCompaction => message!(warn),
|
||||
}
|
||||
}
|
||||
|
||||
result.map_err(Error::Other)
|
||||
|
||||
@@ -5,7 +5,6 @@ MODULE_big = neon
|
||||
OBJS = \
|
||||
$(WIN32RES) \
|
||||
communicator.o \
|
||||
communicator_new.o \
|
||||
extension_server.o \
|
||||
file_cache.o \
|
||||
hll.o \
|
||||
@@ -30,11 +29,6 @@ PG_CPPFLAGS = -I$(libpq_srcdir)
|
||||
SHLIB_LINK_INTERNAL = $(libpq)
|
||||
SHLIB_LINK = -lcurl
|
||||
|
||||
UNAME_S := $(shell uname -s)
|
||||
ifeq ($(UNAME_S), Darwin)
|
||||
SHLIB_LINK += -framework Security -framework CoreFoundation -framework SystemConfiguration
|
||||
endif
|
||||
|
||||
EXTENSION = neon
|
||||
DATA = \
|
||||
neon--1.0.sql \
|
||||
@@ -63,7 +57,7 @@ WALPROP_OBJS = \
|
||||
|
||||
# libcommunicator.a is built by cargo from the Rust sources under communicator/
|
||||
# subdirectory. `cargo build` also generates communicator_bindings.h.
|
||||
communicator_new.o: communicator/communicator_bindings.h
|
||||
neon.o: communicator/communicator_bindings.h
|
||||
|
||||
$(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h &:
|
||||
(cd $(srcdir)/communicator && cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE))
|
||||
|
||||
@@ -421,7 +421,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp)
|
||||
{
|
||||
if (resp->tag != T_NeonGetPageResponse && resp->tag != T_NeonErrorResponse)
|
||||
{
|
||||
neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=%ld, ring_flush=%ld, ring_unused=%ld",
|
||||
neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=" UINT64_FORMAT ", ring_flush=" UINT64_FORMAT ", ring_unused=" UINT64_FORMAT "",
|
||||
resp->tag, MyPState->ring_receive, MyPState->ring_flush, MyPState->ring_unused);
|
||||
}
|
||||
if (neon_protocol_version >= 3)
|
||||
@@ -438,7 +438,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp)
|
||||
getpage_resp->req.blkno != slot->buftag.blockNum)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
|
||||
"Receive unexpected getpage response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
|
||||
"Receive unexpected getpage response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno,
|
||||
slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), slot->buftag.forkNum, slot->buftag.blockNum);
|
||||
}
|
||||
@@ -447,7 +447,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp)
|
||||
resp->lsn != slot->request_lsns.request_lsn ||
|
||||
resp->not_modified_since != slot->request_lsns.not_modified_since)
|
||||
{
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
|
||||
slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since));
|
||||
}
|
||||
@@ -496,9 +496,9 @@ communicator_prefetch_pump_state(void)
|
||||
slot->my_ring_index != MyPState->ring_receive)
|
||||
{
|
||||
neon_shard_log(slot->shard_no, PANIC,
|
||||
"Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
|
||||
"Incorrect prefetch slot state after receive: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "",
|
||||
slot->status, slot->response,
|
||||
(long) slot->my_ring_index, (long) MyPState->ring_receive);
|
||||
slot->my_ring_index, MyPState->ring_receive);
|
||||
}
|
||||
/* update prefetch state */
|
||||
MyPState->n_responses_buffered += 1;
|
||||
@@ -789,9 +789,9 @@ prefetch_read(PrefetchRequest *slot)
|
||||
slot->my_ring_index != MyPState->ring_receive)
|
||||
{
|
||||
neon_shard_log(slot->shard_no, PANIC,
|
||||
"Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu",
|
||||
"Incorrect prefetch read: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "",
|
||||
slot->status, slot->response,
|
||||
(long)slot->my_ring_index, (long)MyPState->ring_receive);
|
||||
slot->my_ring_index, MyPState->ring_receive);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -816,9 +816,9 @@ prefetch_read(PrefetchRequest *slot)
|
||||
slot->my_ring_index != MyPState->ring_receive)
|
||||
{
|
||||
neon_shard_log(shard_no, PANIC,
|
||||
"Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
|
||||
"Incorrect prefetch slot state after receive: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "",
|
||||
slot->status, slot->response,
|
||||
(long) slot->my_ring_index, (long) MyPState->ring_receive);
|
||||
slot->my_ring_index, MyPState->ring_receive);
|
||||
}
|
||||
|
||||
/* update prefetch state */
|
||||
@@ -852,8 +852,8 @@ prefetch_read(PrefetchRequest *slot)
|
||||
* and the prefetch queue was flushed during the receive call
|
||||
*/
|
||||
neon_shard_log(shard_no, LOG,
|
||||
"No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
|
||||
(long) my_ring_index,
|
||||
"No response from reading prefetch entry " UINT64_FORMAT ": %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
|
||||
my_ring_index,
|
||||
RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
|
||||
buftag.forkNum, buftag.blockNum);
|
||||
return false;
|
||||
@@ -1844,7 +1844,7 @@ nm_to_string(NeonMessage *msg)
|
||||
NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg;
|
||||
|
||||
appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\"");
|
||||
appendStringInfo(&s, ", \"db_size\": %ld}",
|
||||
appendStringInfo(&s, ", \"db_size\": " INT64_FORMAT "}",
|
||||
msg_resp->db_size);
|
||||
appendStringInfoChar(&s, '}');
|
||||
|
||||
@@ -2045,7 +2045,7 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r
|
||||
exists_resp->req.forknum != request.forknum)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(0, PANIC,
|
||||
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
|
||||
"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum,
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum);
|
||||
}
|
||||
@@ -2058,14 +2058,14 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r
|
||||
{
|
||||
if (!equal_requests(resp, &request.hdr))
|
||||
{
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
|
||||
}
|
||||
}
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
resp->reqid,
|
||||
RelFileInfoFmt(rinfo),
|
||||
forkNum,
|
||||
@@ -2241,7 +2241,7 @@ Retry:
|
||||
case T_NeonErrorResponse:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
errmsg(NEON_TAG "[shard %d, reqid " UINT64_HEX_FORMAT "] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo),
|
||||
forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)),
|
||||
errdetail("page server returned error: %s",
|
||||
@@ -2294,7 +2294,7 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *
|
||||
relsize_resp->req.forknum != forknum)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(0, PANIC,
|
||||
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
|
||||
"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum,
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum);
|
||||
}
|
||||
@@ -2307,14 +2307,14 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *
|
||||
{
|
||||
if (!equal_requests(resp, &request.hdr))
|
||||
{
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
|
||||
}
|
||||
}
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
resp->reqid,
|
||||
RelFileInfoFmt(rinfo),
|
||||
forknum,
|
||||
@@ -2364,7 +2364,7 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns)
|
||||
dbsize_resp->req.dbNode != dbNode)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(0, PANIC,
|
||||
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
|
||||
"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode,
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode);
|
||||
}
|
||||
@@ -2377,14 +2377,14 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns)
|
||||
{
|
||||
if (!equal_requests(resp, &request.hdr))
|
||||
{
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
|
||||
}
|
||||
}
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X",
|
||||
errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read db size of db %u from page server at lsn %X/%08X",
|
||||
resp->reqid,
|
||||
dbNode, LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)),
|
||||
errdetail("page server returned error: %s",
|
||||
@@ -2455,7 +2455,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
|
||||
slru_resp->req.segno != segno)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(0, PANIC,
|
||||
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}",
|
||||
"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno,
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno);
|
||||
}
|
||||
@@ -2469,14 +2469,14 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
|
||||
{
|
||||
if (!equal_requests(resp, &request.hdr))
|
||||
{
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
|
||||
}
|
||||
}
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %llu at lsn %X/%08X",
|
||||
errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read SLRU %d segment %llu at lsn %X/%08X",
|
||||
resp->reqid,
|
||||
kind,
|
||||
(unsigned long long) segno,
|
||||
|
||||
372
pgxn/neon/communicator/Cargo.lock
generated
372
pgxn/neon/communicator/Cargo.lock
generated
@@ -1,372 +0,0 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "addr2line"
|
||||
version = "0.24.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1"
|
||||
dependencies = [
|
||||
"gimli",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "adler2"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
|
||||
|
||||
[[package]]
|
||||
name = "backtrace"
|
||||
version = "0.3.74"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a"
|
||||
dependencies = [
|
||||
"addr2line",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"miniz_oxide",
|
||||
"object",
|
||||
"rustc-demangle",
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.22.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
|
||||
|
||||
[[package]]
|
||||
name = "bytes"
|
||||
version = "1.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "communicator"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"tonic",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||
|
||||
[[package]]
|
||||
name = "futures-core"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
|
||||
|
||||
[[package]]
|
||||
name = "gimli"
|
||||
version = "0.31.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "1.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fnv",
|
||||
"itoa",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http-body"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"http",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http-body-util"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"http",
|
||||
"http-body",
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.171"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.8.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ff70ce3e48ae43fa075863cef62e8b43b71a4f2382229920e0df362592919430"
|
||||
dependencies = [
|
||||
"adler2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.36.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.21.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
||||
|
||||
[[package]]
|
||||
name = "percent-encoding"
|
||||
version = "2.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
|
||||
|
||||
[[package]]
|
||||
name = "pin-project"
|
||||
version = "1.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
|
||||
dependencies = [
|
||||
"pin-project-internal",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-internal"
|
||||
version = "1.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-lite"
|
||||
version = "0.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.94"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.40"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-demangle"
|
||||
version = "0.1.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.100"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.44.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48"
|
||||
dependencies = [
|
||||
"backtrace",
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-stream"
|
||||
version = "0.1.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tonic"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85839f0b32fd242bb3209262371d07feda6d780d16ee9d2bc88581b89da1549b"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"bytes",
|
||||
"http",
|
||||
"http-body",
|
||||
"http-body-util",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"tokio-stream",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower-layer"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
|
||||
|
||||
[[package]]
|
||||
name = "tower-service"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.41"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
|
||||
dependencies = [
|
||||
"pin-project-lite",
|
||||
"tracing-attributes",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-attributes"
|
||||
version = "0.1.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.33"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm",
|
||||
"windows_aarch64_msvc",
|
||||
"windows_i686_gnu",
|
||||
"windows_i686_gnullvm",
|
||||
"windows_i686_msvc",
|
||||
"windows_x86_64_gnu",
|
||||
"windows_x86_64_gnullvm",
|
||||
"windows_x86_64_msvc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||
@@ -1,42 +1,19 @@
|
||||
[package]
|
||||
name = "communicator"
|
||||
version = "0.1.0"
|
||||
license.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
[lib]
|
||||
crate-type = ["staticlib"]
|
||||
|
||||
[features]
|
||||
# 'testing' feature is currently unused in the communicator, but we accept it for convenience of
|
||||
# calling build scripts, so that you can pass the same feature to all packages.
|
||||
testing = []
|
||||
|
||||
[lib]
|
||||
crate-type = ["staticlib"]
|
||||
|
||||
[dependencies]
|
||||
axum.workspace = true
|
||||
bytes.workspace = true
|
||||
clashmap.workspace = true
|
||||
http.workspace = true
|
||||
libc.workspace = true
|
||||
nix.workspace = true
|
||||
atomic_enum = "0.3.0"
|
||||
prometheus.workspace = true
|
||||
prost.workspace = true
|
||||
tonic = { version = "0.12.0", default-features = false, features=["codegen", "prost", "transport"] }
|
||||
tokio = { version = "1.43.1", features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
|
||||
tokio-pipe = { version = "0.2.12" }
|
||||
thiserror.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
|
||||
metrics.workspace = true
|
||||
uring-common = { workspace = true, features = ["bytes"] }
|
||||
|
||||
pageserver_client_grpc.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_page_api.workspace = true
|
||||
|
||||
neon-shmem.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
|
||||
|
||||
[build-dependencies]
|
||||
|
||||
@@ -1,138 +1,8 @@
|
||||
# Communicator
|
||||
|
||||
This package provides the so-called "compute-pageserver communicator",
|
||||
or just "communicator" in short. It runs in a PostgreSQL server, as
|
||||
part of the neon extension, and handles the communication with the
|
||||
pageservers. On the PostgreSQL side, the glue code in pgxn/neon/ uses
|
||||
the communicator to implement the PostgreSQL Storage Manager (SMGR)
|
||||
interface.
|
||||
|
||||
## Design criteria
|
||||
|
||||
- Low latency
|
||||
- Saturate a 10 Gbit / s network interface without becoming a bottleneck
|
||||
|
||||
## Source code view
|
||||
|
||||
pgxn/neon/communicator_new.c
|
||||
Contains the glue that interact with PostgreSQL code and the Rust
|
||||
communicator code.
|
||||
|
||||
pgxn/neon/communicator/src/backend_interface.rs
|
||||
The entry point for calls from each backend.
|
||||
|
||||
pgxn/neon/communicator/src/init.rs
|
||||
Initialization at server startup
|
||||
|
||||
pgxn/neon/communicator/src/worker_process/
|
||||
Worker process main loop and glue code
|
||||
This package will evolve into a "compute-pageserver communicator"
|
||||
process and machinery. For now, it's just a dummy that doesn't do
|
||||
anything interesting, but it allows us to test the compilation and
|
||||
linking of Rust code into the Postgres extensions.
|
||||
|
||||
At compilation time, pgxn/neon/communicator/ produces a static
|
||||
library, libcommunicator.a. It is linked to the neon.so extension
|
||||
library.
|
||||
|
||||
The real networking code, which is independent of PostgreSQL, is in
|
||||
the pageserver/client_grpc crate.
|
||||
|
||||
## Process view
|
||||
|
||||
The communicator runs in a dedicated background worker process, the
|
||||
"communicator process". The communicator uses a multi-threaded Tokio
|
||||
runtime to execute the IO requests. So the communicator process has
|
||||
multiple threads running. That's unusual for Postgres processes and
|
||||
care must be taken to make that work.
|
||||
|
||||
### Backend <-> worker communication
|
||||
|
||||
Each backend has a number of I/O request slots in shared memory. The
|
||||
slots are statically allocated for each backend, and must not be
|
||||
accessed by other backends. The worker process reads requests from the
|
||||
shared memory slots, and writes responses back to the slots.
|
||||
|
||||
Here's an example snapshot of the system, when two requests from two
|
||||
different backends are in progress:
|
||||
|
||||
```
|
||||
Backends Request slots Communicator process
|
||||
--------- ------------- --------------------
|
||||
|
||||
Backend 1 1: Idle
|
||||
2: Idle
|
||||
3: Processing tokio task handling request 3
|
||||
|
||||
Backend 2 4: Completed
|
||||
5: Processing tokio task handling request 5
|
||||
6: Idle
|
||||
|
||||
... ...
|
||||
```
|
||||
|
||||
To submit an IO request, the backend first picks one of its Idle
|
||||
slots, writes the IO request in the slot, and updates it to
|
||||
'Submitted' state. That transfers the ownership of the slot to the
|
||||
worker process, until the worker process marks the request as
|
||||
Completed. The worker process spawns a separate Tokio task for each
|
||||
request.
|
||||
|
||||
To inform the worker process that a request slot has a pending IO
|
||||
request, there's a pipe shared by the worker process and all backend
|
||||
processes. The backend writes the index of the request slot to the
|
||||
pipe after changing the slot's state to Submitted. This wakes up the
|
||||
worker process.
|
||||
|
||||
(Note that the pipe is just used for wakeups, but the worker process
|
||||
is free to pick up Submitted IO requests even without receiving the
|
||||
wakeup. As of this writing, it doesn't do that, but it might be useful
|
||||
in the future to reduce latency even further, for example.)
|
||||
|
||||
When the worker process has completed processing the request, it
|
||||
writes the result back in the request slot. A GetPage request can also
|
||||
contain a pointer to buffer in the shared buffer cache. In that case,
|
||||
the worker process writes the resulting page contents directly to the
|
||||
buffer, and just a result code in the request slot. It then updates
|
||||
the 'state' field to Completed, which passes the owner ship back to
|
||||
the originating backend. Finally, it signals the process Latch of the
|
||||
originating backend, waking it up.
|
||||
|
||||
### Differences between PostgreSQL v16, v17 and v18
|
||||
|
||||
PostgreSQL v18 introduced the new AIO mechanism. The PostgreSQL AIO
|
||||
mechanism uses a very similar mechanism as described in the previous
|
||||
section, for the communication between AIO worker processes and
|
||||
backends. With our communicator, the AIO worker processes are not
|
||||
used, but we use the same PgAioHandle request slots as in upstream.
|
||||
For Neon-specific IO requests like GetDbSize, a neon request slot is
|
||||
used. But for the actual IO requests, the request slot merely contains
|
||||
a pointer to the PgAioHandle slot. The worker process updates the
|
||||
status of that, calls the IO callbacks upon completionetc, just like
|
||||
the upstream AIO worker processes do.
|
||||
|
||||
## Sequence diagram
|
||||
|
||||
neon
|
||||
PostgreSQL extension backend_interface.rs worker_process.rs processor tonic
|
||||
| . . . .
|
||||
| smgr_read() . . . .
|
||||
+-------------> + . . .
|
||||
. | . . .
|
||||
. | rcommunicator_ . . .
|
||||
. | get_page_at_lsn . . .
|
||||
. +------------------> + . .
|
||||
| . .
|
||||
| write request to . . .
|
||||
| slot . .
|
||||
| . .
|
||||
| . .
|
||||
| submit_request() . .
|
||||
+-----------------> + .
|
||||
| | .
|
||||
| | db_size_request . .
|
||||
+---------------->.
|
||||
. TODO
|
||||
|
||||
|
||||
|
||||
### Compute <-> pageserver protocol
|
||||
|
||||
The protocol between Compute and the pageserver is based on gRPC. See `protos/`.
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user