mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-17 21:20:37 +00:00
Compare commits
82 Commits
release-88
...
erik/commu
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f4d51c0f5c | ||
|
|
ec17ae0658 | ||
|
|
9ecce60ded | ||
|
|
e74a957045 | ||
|
|
396a16a3b2 | ||
|
|
7140a50225 | ||
|
|
68f18ccacf | ||
|
|
786888d93f | ||
|
|
255537dda1 | ||
|
|
8b494f6a24 | ||
|
|
28a61741b3 | ||
|
|
2fb6164bf8 | ||
|
|
328f28dfe5 | ||
|
|
95838056da | ||
|
|
6d451654f1 | ||
|
|
37c58522a2 | ||
|
|
4b6f02e47d | ||
|
|
8202c6172f | ||
|
|
69a47d789d | ||
|
|
b36f880710 | ||
|
|
745b750f33 | ||
|
|
f06bb2bbd8 | ||
|
|
b3c25418a6 | ||
|
|
33549bad1d | ||
|
|
009168d711 | ||
|
|
7c9bd542a6 | ||
|
|
014823b305 | ||
|
|
af9379ccf6 | ||
|
|
bb28109ffa | ||
|
|
60a0bec1c0 | ||
|
|
31fa7a545d | ||
|
|
ac464c5f2c | ||
|
|
0dddb1e373 | ||
|
|
3acb263e62 | ||
|
|
1e83398cdd | ||
|
|
be8ed81532 | ||
|
|
12b08c4b82 | ||
|
|
827358dd03 | ||
|
|
d367273000 | ||
|
|
e2bad5d9e9 | ||
|
|
5623e4665b | ||
|
|
8abb4dab6d | ||
|
|
731667ac37 | ||
|
|
6a1374d106 | ||
|
|
f7c908f2f0 | ||
|
|
86671e3a0b | ||
|
|
319cd74f73 | ||
|
|
0efefbf77c | ||
|
|
e6a4171fa1 | ||
|
|
0c25ea9e31 | ||
|
|
6692321026 | ||
|
|
791df28755 | ||
|
|
d20da994f4 | ||
|
|
6dbbdaae73 | ||
|
|
977bc09d2a | ||
|
|
44269fcd5e | ||
|
|
44cc648dc8 | ||
|
|
884e028a4a | ||
|
|
42df3e5453 | ||
|
|
fc743e284f | ||
|
|
d02f9a2139 | ||
|
|
083118e98e | ||
|
|
54cd2272f1 | ||
|
|
e40193e3c8 | ||
|
|
ce9f7bacc1 | ||
|
|
b7891f8fe8 | ||
|
|
5f2adaa9ad | ||
|
|
3e5e396c8d | ||
|
|
9d781c6fda | ||
|
|
cf5d038472 | ||
|
|
d785100c02 | ||
|
|
2c0d930e3d | ||
|
|
66171a117b | ||
|
|
df2806e7a0 | ||
|
|
07631692db | ||
|
|
4c77397943 | ||
|
|
7bb58be546 | ||
|
|
b5373de208 | ||
|
|
b86c610f42 | ||
|
|
0f520d79ab | ||
|
|
93eb7bb6b8 | ||
|
|
e58d0fece1 |
11
.github/workflows/_build-and-test-locally.yml
vendored
11
.github/workflows/_build-and-test-locally.yml
vendored
@@ -38,11 +38,6 @@ on:
|
||||
required: false
|
||||
default: 1
|
||||
type: number
|
||||
rerun-failed:
|
||||
description: 'rerun failed tests to ignore flaky tests'
|
||||
required: false
|
||||
default: true
|
||||
type: boolean
|
||||
|
||||
defaults:
|
||||
run:
|
||||
@@ -384,7 +379,7 @@ jobs:
|
||||
- name: Pytest regression tests
|
||||
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
timeout-minutes: ${{ (inputs.build-type == 'release' && inputs.sanitizers != 'enabled') && 75 || 180 }}
|
||||
timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 75 || 180 }}
|
||||
with:
|
||||
build_type: ${{ inputs.build-type }}
|
||||
test_selection: regress
|
||||
@@ -392,14 +387,14 @@ jobs:
|
||||
run_with_real_s3: true
|
||||
real_s3_bucket: neon-github-ci-tests
|
||||
real_s3_region: eu-central-1
|
||||
rerun_failed: ${{ inputs.rerun-failed }}
|
||||
rerun_failed: ${{ inputs.test-run-count == 1 }}
|
||||
pg_version: ${{ matrix.pg_version }}
|
||||
sanitizers: ${{ inputs.sanitizers }}
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
# `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
|
||||
# Attempt to stop tests gracefully to generate test reports
|
||||
# until they are forcibly stopped by the stricter `timeout-minutes` limit.
|
||||
extra_params: --session-timeout=${{ (inputs.build-type == 'release' && inputs.sanitizers != 'enabled') && 3000 || 10200 }} --count=${{ inputs.test-run-count }}
|
||||
extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }} --count=${{ inputs.test-run-count }}
|
||||
${{ inputs.test-selection != '' && format('-k "{0}"', inputs.test-selection) || '' }}
|
||||
env:
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
@@ -58,7 +58,6 @@ jobs:
|
||||
test-cfg: ${{ inputs.pg-versions }}
|
||||
test-selection: ${{ inputs.test-selection }}
|
||||
test-run-count: ${{ fromJson(inputs.run-count) }}
|
||||
rerun-failed: false
|
||||
secrets: inherit
|
||||
|
||||
create-test-report:
|
||||
|
||||
22
.github/workflows/build_and_test.yml
vendored
22
.github/workflows/build_and_test.yml
vendored
@@ -199,28 +199,6 @@ jobs:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
secrets: inherit
|
||||
|
||||
validate-compute-manifest:
|
||||
runs-on: ubuntu-22.04
|
||||
needs: [ meta, check-permissions ]
|
||||
# We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Set up Node.js
|
||||
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
|
||||
with:
|
||||
node-version: '24'
|
||||
|
||||
- name: Validate manifest against schema
|
||||
run: |
|
||||
make -C compute manifest-schema-validation
|
||||
|
||||
build-and-test-locally:
|
||||
needs: [ meta, build-build-tools-image ]
|
||||
# We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
|
||||
151
.github/workflows/build_and_test_fully.yml
vendored
151
.github/workflows/build_and_test_fully.yml
vendored
@@ -1,151 +0,0 @@
|
||||
name: Build and Test Fully
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 3 * * *' # run once a day, timezone is utc
|
||||
workflow_dispatch:
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
# Allow only one workflow per any non-`main` branch.
|
||||
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
RUST_BACKTRACE: 1
|
||||
COPT: '-Werror'
|
||||
|
||||
jobs:
|
||||
tag:
|
||||
runs-on: [ self-hosted, small ]
|
||||
container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned
|
||||
outputs:
|
||||
build-tag: ${{steps.build-tag.outputs.tag}}
|
||||
|
||||
steps:
|
||||
# Need `fetch-depth: 0` to count the number of commits in the branch
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get build tag
|
||||
run: |
|
||||
echo run:$GITHUB_RUN_ID
|
||||
echo ref:$GITHUB_REF_NAME
|
||||
echo rev:$(git rev-list --count HEAD)
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
|
||||
echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'"
|
||||
echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
shell: bash
|
||||
id: build-tag
|
||||
|
||||
build-build-tools-image:
|
||||
uses: ./.github/workflows/build-build-tools-image.yml
|
||||
secrets: inherit
|
||||
|
||||
build-and-test-locally:
|
||||
needs: [ tag, build-build-tools-image ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
build-type: [ debug, release ]
|
||||
uses: ./.github/workflows/_build-and-test-locally.yml
|
||||
with:
|
||||
arch: ${{ matrix.arch }}
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
build-tag: ${{ needs.tag.outputs.build-tag }}
|
||||
build-type: ${{ matrix.build-type }}
|
||||
rerun-failed: false
|
||||
test-cfg: '[{"pg_version":"v14", "lfc_state": "with-lfc"},
|
||||
{"pg_version":"v15", "lfc_state": "with-lfc"},
|
||||
{"pg_version":"v16", "lfc_state": "with-lfc"},
|
||||
{"pg_version":"v17", "lfc_state": "with-lfc"},
|
||||
{"pg_version":"v14", "lfc_state": "without-lfc"},
|
||||
{"pg_version":"v15", "lfc_state": "without-lfc"},
|
||||
{"pg_version":"v16", "lfc_state": "without-lfc"},
|
||||
{"pg_version":"v17", "lfc_state": "withouts-lfc"}]'
|
||||
secrets: inherit
|
||||
|
||||
|
||||
create-test-report:
|
||||
needs: [ build-and-test-locally, build-build-tools-image ]
|
||||
if: ${{ !cancelled() }}
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
outputs:
|
||||
report-url: ${{ steps.create-allure-report.outputs.report-url }}
|
||||
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
credentials:
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
id: create-allure-report
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
|
||||
if: ${{ !cancelled() }}
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
retries: 5
|
||||
script: |
|
||||
const report = {
|
||||
reportUrl: "${{ steps.create-allure-report.outputs.report-url }}",
|
||||
reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
|
||||
}
|
||||
|
||||
const coverage = {}
|
||||
|
||||
const script = require("./scripts/comment-test-report.js")
|
||||
await script({
|
||||
github,
|
||||
context,
|
||||
fetch,
|
||||
report,
|
||||
coverage,
|
||||
})
|
||||
@@ -79,7 +79,6 @@ jobs:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
build-tag: ${{ needs.tag.outputs.build-tag }}
|
||||
build-type: ${{ matrix.build-type }}
|
||||
rerun-failed: false
|
||||
test-cfg: '[{"pg_version":"v17"}]'
|
||||
sanitizers: enabled
|
||||
secrets: inherit
|
||||
|
||||
11
.github/workflows/large_oltp_benchmark.yml
vendored
11
.github/workflows/large_oltp_benchmark.yml
vendored
@@ -33,19 +33,11 @@ jobs:
|
||||
fail-fast: false # allow other variants to continue even if one fails
|
||||
matrix:
|
||||
include:
|
||||
# test only read-only custom scripts in new branch without database maintenance
|
||||
- target: new_branch
|
||||
custom_scripts: select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3
|
||||
test_maintenance: false
|
||||
# test all custom scripts in new branch with database maintenance
|
||||
- target: new_branch
|
||||
custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100
|
||||
test_maintenance: true
|
||||
# test all custom scripts in reuse branch with database maintenance
|
||||
- target: reuse_branch
|
||||
custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100
|
||||
test_maintenance: true
|
||||
max-parallel: 1 # we want to run each benchmark sequentially to not have noisy neighbors on shared storage (PS, SK)
|
||||
max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
@@ -153,7 +145,6 @@ jobs:
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
- name: Benchmark database maintenance
|
||||
if: ${{ matrix.test_maintenance == 'true' }}
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
|
||||
175
.github/workflows/large_oltp_growth.yml
vendored
175
.github/workflows/large_oltp_growth.yml
vendored
@@ -1,175 +0,0 @@
|
||||
name: large oltp growth
|
||||
# workflow to grow the reuse branch of large oltp benchmark continuously (about 16 GB per run)
|
||||
|
||||
on:
|
||||
# uncomment to run on push for debugging your PR
|
||||
# push:
|
||||
# branches: [ bodobolero/increase_large_oltp_workload ]
|
||||
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 6 * * *' # 06:00 UTC
|
||||
- cron: '0 8 * * *' # 08:00 UTC
|
||||
- cron: '0 10 * * *' # 10:00 UTC
|
||||
- cron: '0 12 * * *' # 12:00 UTC
|
||||
- cron: '0 14 * * *' # 14:00 UTC
|
||||
- cron: '0 16 * * *' # 16:00 UTC
|
||||
workflow_dispatch: # adds ability to run this manually
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
# Allow only one workflow globally because we need dedicated resources which only exist once
|
||||
group: large-oltp-growth
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
oltp:
|
||||
strategy:
|
||||
fail-fast: false # allow other variants to continue even if one fails
|
||||
matrix:
|
||||
include:
|
||||
# for now only grow the reuse branch, not the other branches.
|
||||
- target: reuse_branch
|
||||
custom_scripts:
|
||||
- grow_action_blocks.sql
|
||||
- grow_action_kwargs.sql
|
||||
- grow_device_fingerprint_event.sql
|
||||
- grow_edges.sql
|
||||
- grow_hotel_rate_mapping.sql
|
||||
- grow_ocr_pipeline_results_version.sql
|
||||
- grow_priceline_raw_response.sql
|
||||
- grow_relabled_transactions.sql
|
||||
- grow_state_values.sql
|
||||
- grow_values.sql
|
||||
- grow_vertices.sql
|
||||
- update_accounting_coding_body_tracking_category_selection.sql
|
||||
- update_action_blocks.sql
|
||||
- update_action_kwargs.sql
|
||||
- update_denormalized_approval_workflow.sql
|
||||
- update_device_fingerprint_event.sql
|
||||
- update_edges.sql
|
||||
- update_heron_transaction_enriched_log.sql
|
||||
- update_heron_transaction_enrichment_requests.sql
|
||||
- update_hotel_rate_mapping.sql
|
||||
- update_incoming_webhooks.sql
|
||||
- update_manual_transaction.sql
|
||||
- update_ml_receipt_matching_log.sql
|
||||
- update_ocr_pipeine_results_version.sql
|
||||
- update_orc_pipeline_step_results.sql
|
||||
- update_orc_pipeline_step_results_version.sql
|
||||
- update_priceline_raw_response.sql
|
||||
- update_quickbooks_transactions.sql
|
||||
- update_raw_finicity_transaction.sql
|
||||
- update_relabeled_transactions.sql
|
||||
- update_state_values.sql
|
||||
- update_stripe_authorization_event_log.sql
|
||||
- update_transaction.sql
|
||||
- update_values.sql
|
||||
- update_vertices.sql
|
||||
max-parallel: 1 # we want to run each growth workload sequentially (for now there is just one)
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "1h"
|
||||
TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ join(matrix.custom_scripts, ' ') }}
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
PG_VERSION: 16 # pre-determined by pre-determined project
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
PLATFORM: ${{ matrix.target }}
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
|
||||
credentials:
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Configure AWS credentials # necessary to download artefacts
|
||||
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${{ matrix.target }}" in
|
||||
reuse_branch)
|
||||
CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown target=${{ matrix.target }}"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
CONNSTR_WITHOUT_POOLER="${CONNSTR//-pooler/}"
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
echo "connstr_without_pooler=${CONNSTR_WITHOUT_POOLER}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: pgbench with custom-scripts
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
extra_params: -m remote_cluster --timeout 7200 -k test_perf_oltp_large_tenant_growth
|
||||
pg_version: ${{ env.PG_VERSION }}
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1
|
||||
with:
|
||||
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
|
||||
slack-message: |
|
||||
Periodic large oltp tenant growth increase: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -13,6 +13,7 @@ neon.iml
|
||||
/.neon
|
||||
/integration_tests/.neon
|
||||
compaction-suite-results.*
|
||||
pgxn/neon/communicator/communicator_bindings.h
|
||||
|
||||
# Coverage
|
||||
*.profraw
|
||||
|
||||
437
Cargo.lock
generated
437
Cargo.lock
generated
@@ -253,6 +253,17 @@ version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
|
||||
|
||||
[[package]]
|
||||
name = "atomic_enum"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.1.0"
|
||||
@@ -687,13 +698,40 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum"
|
||||
version = "0.7.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"axum-core 0.4.5",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"itoa",
|
||||
"matchit 0.7.3",
|
||||
"memchr",
|
||||
"mime",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustversion",
|
||||
"serde",
|
||||
"sync_wrapper 1.0.1",
|
||||
"tower 0.5.2",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8"
|
||||
dependencies = [
|
||||
"axum-core",
|
||||
"axum-core 0.5.0",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"form_urlencoded",
|
||||
@@ -701,10 +739,10 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"itoa",
|
||||
"matchit",
|
||||
"matchit 0.8.4",
|
||||
"memchr",
|
||||
"mime",
|
||||
"percent-encoding",
|
||||
@@ -724,6 +762,26 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum-core"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"mime",
|
||||
"pin-project-lite",
|
||||
"rustversion",
|
||||
"sync_wrapper 1.0.1",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum-core"
|
||||
version = "0.5.0"
|
||||
@@ -750,10 +808,9 @@ version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
|
||||
dependencies = [
|
||||
"axum",
|
||||
"axum-core",
|
||||
"axum 0.8.1",
|
||||
"axum-core 0.5.0",
|
||||
"bytes",
|
||||
"form_urlencoded",
|
||||
"futures-util",
|
||||
"headers",
|
||||
"http 1.1.0",
|
||||
@@ -762,8 +819,6 @@ dependencies = [
|
||||
"mime",
|
||||
"pin-project-lite",
|
||||
"serde",
|
||||
"serde_html_form",
|
||||
"serde_path_to_error",
|
||||
"tower 0.5.2",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
@@ -903,6 +958,12 @@ version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5"
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.21.7"
|
||||
@@ -1083,6 +1144,25 @@ version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
|
||||
|
||||
[[package]]
|
||||
name = "cbindgen"
|
||||
version = "0.28.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eadd868a2ce9ca38de7eeafdcec9c7065ef89b42b32f0839278d55f35c54d1ff"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"heck 0.4.1",
|
||||
"indexmap 2.9.0",
|
||||
"log",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"syn 2.0.100",
|
||||
"tempfile",
|
||||
"toml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.16"
|
||||
@@ -1209,7 +1289,7 @@ version = "4.5.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
@@ -1267,6 +1347,34 @@ dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "communicator"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"atomic_enum",
|
||||
"axum 0.8.1",
|
||||
"bytes",
|
||||
"cbindgen",
|
||||
"clashmap",
|
||||
"http 1.1.0",
|
||||
"libc",
|
||||
"metrics",
|
||||
"neon-shmem",
|
||||
"nix 0.30.1",
|
||||
"pageserver_client_grpc",
|
||||
"pageserver_page_api",
|
||||
"prometheus",
|
||||
"prost 0.13.5",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-pipe",
|
||||
"tonic 0.12.3",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"uring-common",
|
||||
"utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "compute_api"
|
||||
version = "0.1.0"
|
||||
@@ -1292,9 +1400,9 @@ dependencies = [
|
||||
"aws-sdk-kms",
|
||||
"aws-sdk-s3",
|
||||
"aws-smithy-types",
|
||||
"axum",
|
||||
"axum 0.8.1",
|
||||
"axum-extra",
|
||||
"base64 0.22.1",
|
||||
"base64 0.13.1",
|
||||
"bytes",
|
||||
"camino",
|
||||
"cfg-if",
|
||||
@@ -1316,6 +1424,7 @@ dependencies = [
|
||||
"opentelemetry",
|
||||
"opentelemetry_sdk",
|
||||
"p256 0.13.2",
|
||||
"pageserver_page_api",
|
||||
"postgres",
|
||||
"postgres_initdb",
|
||||
"regex",
|
||||
@@ -1334,6 +1443,7 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tonic 0.13.1",
|
||||
"tower 0.5.2",
|
||||
"tower-http",
|
||||
"tower-otel",
|
||||
@@ -1420,7 +1530,7 @@ name = "control_plane"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"base64 0.22.1",
|
||||
"base64 0.13.1",
|
||||
"camino",
|
||||
"clap",
|
||||
"comfy-table",
|
||||
@@ -1442,7 +1552,6 @@ dependencies = [
|
||||
"regex",
|
||||
"reqwest",
|
||||
"safekeeper_api",
|
||||
"safekeeper_client",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -1592,9 +1701,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.19"
|
||||
version = "0.8.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
|
||||
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
||||
|
||||
[[package]]
|
||||
name = "crossterm"
|
||||
@@ -1934,7 +2043,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
|
||||
dependencies = [
|
||||
"darling",
|
||||
"either",
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
@@ -2048,11 +2157,10 @@ name = "endpoint_storage"
|
||||
version = "0.0.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
"axum 0.8.1",
|
||||
"axum-extra",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
"clap",
|
||||
"futures",
|
||||
"http-body-util",
|
||||
"itertools 0.10.5",
|
||||
@@ -2329,7 +2437,7 @@ dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"http-body-util",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"pin-project",
|
||||
"rand 0.8.5",
|
||||
@@ -2499,6 +2607,18 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"r-efi",
|
||||
"wasi 0.14.2+wasi-0.2.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gettid"
|
||||
version = "0.1.3"
|
||||
@@ -2711,6 +2831,12 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
@@ -2882,9 +3008,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "httparse"
|
||||
version = "1.8.0"
|
||||
version = "1.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904"
|
||||
checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
|
||||
|
||||
[[package]]
|
||||
name = "httpdate"
|
||||
@@ -2934,9 +3060,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hyper"
|
||||
version = "1.4.1"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05"
|
||||
checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-channel",
|
||||
@@ -2976,7 +3102,7 @@ checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"rustls 0.22.4",
|
||||
"rustls-pki-types",
|
||||
@@ -2991,7 +3117,7 @@ version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3203a961e5c83b6f5498933e78b6b263e208c197b63e9c6c53cc82ffd3f63793"
|
||||
dependencies = [
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
@@ -3000,20 +3126,20 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hyper-util"
|
||||
version = "0.1.7"
|
||||
version = "0.1.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9"
|
||||
checksum = "cf9f1e950e0d9d1d3c47184416723cf29c0d1f93bd8cccf37e4beb6b44f31710"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"libc",
|
||||
"pin-project-lite",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tower 0.4.13",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
@@ -3602,6 +3728,12 @@ dependencies = [
|
||||
"regex-automata 0.1.10",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchit"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
|
||||
|
||||
[[package]]
|
||||
name = "matchit"
|
||||
version = "0.8.4"
|
||||
@@ -3647,7 +3779,7 @@ version = "0.0.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
@@ -3709,7 +3841,7 @@ dependencies = [
|
||||
"procfs",
|
||||
"prometheus",
|
||||
"rand 0.8.5",
|
||||
"rand_distr",
|
||||
"rand_distr 0.4.3",
|
||||
"twox-hash",
|
||||
]
|
||||
|
||||
@@ -3798,11 +3930,25 @@ name = "neon-shmem"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"nix 0.30.1",
|
||||
"rand 0.9.1",
|
||||
"rand_distr 0.5.1",
|
||||
"spin",
|
||||
"tempfile",
|
||||
"thiserror 1.0.69",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "neonart"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
"rand 0.9.1",
|
||||
"rand_distr 0.5.1",
|
||||
"spin",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "never-say-never"
|
||||
version = "6.6.666"
|
||||
@@ -4236,15 +4382,19 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"axum 0.8.1",
|
||||
"bytes",
|
||||
"camino",
|
||||
"clap",
|
||||
"futures",
|
||||
"hdrhistogram",
|
||||
"http 1.1.0",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"metrics",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"pageserver_client_grpc",
|
||||
"pageserver_page_api",
|
||||
"rand 0.8.5",
|
||||
"reqwest",
|
||||
@@ -4327,6 +4477,7 @@ dependencies = [
|
||||
"pageserver_client",
|
||||
"pageserver_compaction",
|
||||
"pageserver_page_api",
|
||||
"peekable",
|
||||
"pem",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
@@ -4334,12 +4485,12 @@ dependencies = [
|
||||
"postgres_backend",
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"postgres_ffi_types",
|
||||
"postgres_initdb",
|
||||
"posthog_client_lite",
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
"procfs",
|
||||
"prost 0.13.5",
|
||||
"rand 0.8.5",
|
||||
"range-set-blaze",
|
||||
"regex",
|
||||
@@ -4404,7 +4555,7 @@ dependencies = [
|
||||
"nix 0.30.1",
|
||||
"once_cell",
|
||||
"postgres_backend",
|
||||
"postgres_ffi_types",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.5",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
@@ -4439,6 +4590,34 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pageserver_client_grpc"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"dashmap 5.5.0",
|
||||
"futures",
|
||||
"http 1.1.0",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"metrics",
|
||||
"pageserver_api",
|
||||
"pageserver_page_api",
|
||||
"priority-queue",
|
||||
"rand 0.8.5",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tonic 0.13.1",
|
||||
"tower 0.4.13",
|
||||
"tracing",
|
||||
"utils",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pageserver_compaction"
|
||||
version = "0.1.0"
|
||||
@@ -4466,14 +4645,11 @@ dependencies = [
|
||||
name = "pageserver_page_api"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"futures",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"prost 0.13.5",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tonic 0.13.1",
|
||||
"tonic-build",
|
||||
"utils",
|
||||
@@ -4606,6 +4782,15 @@ dependencies = [
|
||||
"sha2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "peekable"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
|
||||
dependencies = [
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pem"
|
||||
version = "3.0.3"
|
||||
@@ -4816,7 +5001,7 @@ dependencies = [
|
||||
name = "postgres-protocol2"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4893,7 +5078,6 @@ dependencies = [
|
||||
"memoffset 0.9.0",
|
||||
"once_cell",
|
||||
"postgres",
|
||||
"postgres_ffi_types",
|
||||
"pprof",
|
||||
"regex",
|
||||
"serde",
|
||||
@@ -4902,14 +5086,6 @@ dependencies = [
|
||||
"utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres_ffi_types"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"thiserror 1.0.69",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres_initdb"
|
||||
version = "0.1.0"
|
||||
@@ -5026,6 +5202,17 @@ dependencies = [
|
||||
"elliptic-curve 0.13.8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "priority-queue"
|
||||
version = "2.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef08705fa1589a1a59aa924ad77d14722cb0cd97b67dd5004ed5f4a4873fce8d"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"equivalent",
|
||||
"indexmap 2.9.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.94"
|
||||
@@ -5104,7 +5291,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"itertools 0.12.1",
|
||||
"log",
|
||||
"multimap",
|
||||
@@ -5125,7 +5312,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"itertools 0.12.1",
|
||||
"log",
|
||||
"multimap",
|
||||
@@ -5197,7 +5384,7 @@ dependencies = [
|
||||
"aws-config",
|
||||
"aws-sdk-iam",
|
||||
"aws-sigv4",
|
||||
"base64 0.22.1",
|
||||
"base64 0.13.1",
|
||||
"bstr",
|
||||
"bytes",
|
||||
"camino",
|
||||
@@ -5226,7 +5413,7 @@ dependencies = [
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.30",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"indexmap 2.9.0",
|
||||
"ipnet",
|
||||
@@ -5250,7 +5437,7 @@ dependencies = [
|
||||
"postgres_backend",
|
||||
"pq_proto",
|
||||
"rand 0.8.5",
|
||||
"rand_distr",
|
||||
"rand_distr 0.4.3",
|
||||
"rcgen",
|
||||
"redis",
|
||||
"regex",
|
||||
@@ -5354,6 +5541,12 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "r-efi"
|
||||
version = "5.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.7.3"
|
||||
@@ -5378,6 +5571,16 @@ dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
|
||||
dependencies = [
|
||||
"rand_chacha 0.9.0",
|
||||
"rand_core 0.9.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.2.2"
|
||||
@@ -5398,6 +5601,16 @@ dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.9.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.5.1"
|
||||
@@ -5416,6 +5629,15 @@ dependencies = [
|
||||
"getrandom 0.2.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
|
||||
dependencies = [
|
||||
"getrandom 0.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_distr"
|
||||
version = "0.4.3"
|
||||
@@ -5426,6 +5648,16 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_distr"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
"rand 0.9.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_hc"
|
||||
version = "0.2.0"
|
||||
@@ -5622,7 +5854,7 @@ dependencies = [
|
||||
"http-body-util",
|
||||
"http-types",
|
||||
"humantime-serde",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"itertools 0.10.5",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
@@ -5662,7 +5894,7 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-rustls 0.26.0",
|
||||
"hyper-util",
|
||||
"ipnet",
|
||||
@@ -5719,7 +5951,7 @@ dependencies = [
|
||||
"futures",
|
||||
"getrandom 0.2.11",
|
||||
"http 1.1.0",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"parking_lot 0.11.2",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
@@ -5740,7 +5972,7 @@ dependencies = [
|
||||
"async-trait",
|
||||
"getrandom 0.2.11",
|
||||
"http 1.1.0",
|
||||
"matchit",
|
||||
"matchit 0.8.4",
|
||||
"opentelemetry",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
@@ -6432,19 +6664,6 @@ dependencies = [
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_html_form"
|
||||
version = "0.2.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d2de91cf02bbc07cde38891769ccd5d4f073d22a40683aa4bc7a95781aaa2c4"
|
||||
dependencies = [
|
||||
"form_urlencoded",
|
||||
"indexmap 2.9.0",
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.125"
|
||||
@@ -6501,17 +6720,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde_with"
|
||||
version = "3.12.0"
|
||||
version = "2.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d6b6f7f2fcb69f747921f79f3926bd1e203fce4fef62c268dd3abfb6d86029aa"
|
||||
checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"base64 0.13.1",
|
||||
"chrono",
|
||||
"hex",
|
||||
"indexmap 1.9.3",
|
||||
"indexmap 2.9.0",
|
||||
"serde",
|
||||
"serde_derive",
|
||||
"serde_json",
|
||||
"serde_with_macros",
|
||||
"time",
|
||||
@@ -6519,9 +6736,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde_with_macros"
|
||||
version = "3.12.0"
|
||||
version = "2.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8d00caa5193a3c8362ac2b73be6b9e768aa5a4b2f721d8f4b339600c3cb51f8e"
|
||||
checksum = "881b6f881b17d13214e5d494c939ebab463d01264ce1811e9d4ac3a882e7695f"
|
||||
dependencies = [
|
||||
"darling",
|
||||
"proc-macro2",
|
||||
@@ -6675,12 +6892,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "socket2"
|
||||
version = "0.5.5"
|
||||
version = "0.5.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9"
|
||||
checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"windows-sys 0.48.0",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6688,6 +6905,9 @@ name = "spin"
|
||||
version = "0.9.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "spinning_top"
|
||||
@@ -6746,7 +6966,7 @@ dependencies = [
|
||||
"http-body-util",
|
||||
"http-utils",
|
||||
"humantime",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
@@ -6927,7 +7147,7 @@ version = "0.26.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustversion",
|
||||
@@ -7352,6 +7572,16 @@ dependencies = [
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-pipe"
|
||||
version = "0.2.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.10"
|
||||
@@ -7546,16 +7776,25 @@ version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
|
||||
dependencies = [
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"axum 0.7.9",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"h2 0.4.4",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"hyper 1.6.0",
|
||||
"hyper-timeout",
|
||||
"hyper-util",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"prost 0.13.5",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tower 0.4.13",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
@@ -7568,7 +7807,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"axum",
|
||||
"axum 0.8.1",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"flate2",
|
||||
@@ -7576,7 +7815,7 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-timeout",
|
||||
"hyper-util",
|
||||
"percent-encoding",
|
||||
@@ -7591,7 +7830,6 @@ dependencies = [
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7629,11 +7867,16 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"indexmap 1.9.3",
|
||||
"pin-project",
|
||||
"pin-project-lite",
|
||||
"rand 0.8.5",
|
||||
"slab",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -8117,7 +8360,7 @@ name = "vm_monitor"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
"axum 0.8.1",
|
||||
"cgroups-rs",
|
||||
"clap",
|
||||
"futures",
|
||||
@@ -8171,7 +8414,6 @@ dependencies = [
|
||||
"futures",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"postgres_ffi_types",
|
||||
"pprof",
|
||||
"prost 0.13.5",
|
||||
"remote_storage",
|
||||
@@ -8229,6 +8471,15 @@ version = "0.11.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.14.2+wasi-0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
|
||||
dependencies = [
|
||||
"wit-bindgen-rt",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasite"
|
||||
version = "0.1.0"
|
||||
@@ -8586,6 +8837,15 @@ dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen-rt"
|
||||
version = "0.39.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
|
||||
dependencies = [
|
||||
"bitflags 2.8.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "workspace_hack"
|
||||
version = "0.1.0"
|
||||
@@ -8593,8 +8853,9 @@ dependencies = [
|
||||
"ahash",
|
||||
"anstream",
|
||||
"anyhow",
|
||||
"axum",
|
||||
"axum-core",
|
||||
"axum 0.8.1",
|
||||
"axum-core 0.5.0",
|
||||
"base64 0.13.1",
|
||||
"base64 0.21.7",
|
||||
"base64ct",
|
||||
"bytes",
|
||||
@@ -8626,7 +8887,7 @@ dependencies = [
|
||||
"hex",
|
||||
"hmac",
|
||||
"hyper 0.14.30",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"indexmap 2.9.0",
|
||||
"itertools 0.12.1",
|
||||
|
||||
26
Cargo.toml
26
Cargo.toml
@@ -8,6 +8,7 @@ members = [
|
||||
"pageserver/compaction",
|
||||
"pageserver/ctl",
|
||||
"pageserver/client",
|
||||
"pageserver/client_grpc",
|
||||
"pageserver/pagebench",
|
||||
"pageserver/page_api",
|
||||
"proxy",
|
||||
@@ -22,7 +23,6 @@ members = [
|
||||
"libs/http-utils",
|
||||
"libs/pageserver_api",
|
||||
"libs/postgres_ffi",
|
||||
"libs/postgres_ffi_types",
|
||||
"libs/safekeeper_api",
|
||||
"libs/desim",
|
||||
"libs/neon-shmem",
|
||||
@@ -33,6 +33,7 @@ members = [
|
||||
"libs/pq_proto",
|
||||
"libs/tenant_size_model",
|
||||
"libs/metrics",
|
||||
"libs/neonart",
|
||||
"libs/postgres_connection",
|
||||
"libs/remote_storage",
|
||||
"libs/tracing-utils",
|
||||
@@ -45,6 +46,7 @@ members = [
|
||||
"libs/proxy/postgres-types2",
|
||||
"libs/proxy/tokio-postgres2",
|
||||
"endpoint_storage",
|
||||
"pgxn/neon/communicator",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -72,8 +74,8 @@ aws-credential-types = "1.2.0"
|
||||
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
|
||||
aws-types = "1.3"
|
||||
axum = { version = "0.8.1", features = ["ws"] }
|
||||
axum-extra = { version = "0.10.0", features = ["typed-header", "query"] }
|
||||
base64 = "0.22"
|
||||
axum-extra = { version = "0.10.0", features = ["typed-header"] }
|
||||
base64 = "0.13.0"
|
||||
bincode = "1.3"
|
||||
bindgen = "0.71"
|
||||
bit_field = "0.10.2"
|
||||
@@ -88,6 +90,7 @@ clap = { version = "4.0", features = ["derive", "env"] }
|
||||
clashmap = { version = "1.0", features = ["raw-api"] }
|
||||
comfy-table = "7.1"
|
||||
const_format = "0.2"
|
||||
crossbeam-utils = "0.8.21"
|
||||
crc32c = "0.6"
|
||||
diatomic-waker = { version = "0.2.3" }
|
||||
either = "1.8"
|
||||
@@ -146,6 +149,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "53"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pem = "3.0.3"
|
||||
peekable = "0.3.0"
|
||||
pin-project-lite = "0.2"
|
||||
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
|
||||
procfs = "0.16"
|
||||
@@ -172,7 +176,7 @@ sentry = { version = "0.37", default-features = false, features = ["backtrace",
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_path_to_error = "0.1"
|
||||
serde_with = { version = "3", features = [ "base64" ] }
|
||||
serde_with = { version = "2.0", features = [ "base64" ] }
|
||||
serde_assert = "0.5.0"
|
||||
sha2 = "0.10.2"
|
||||
signal-hook = "0.3"
|
||||
@@ -180,6 +184,7 @@ smallvec = "1.11"
|
||||
smol_str = { version = "0.2.0", features = ["serde"] }
|
||||
socket2 = "0.5"
|
||||
spki = "0.7.3"
|
||||
spin = "0.9.8"
|
||||
strum = "0.26"
|
||||
strum_macros = "0.26"
|
||||
"subtle" = "2.5.0"
|
||||
@@ -191,16 +196,15 @@ thiserror = "1.0"
|
||||
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
|
||||
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
|
||||
tokio = { version = "1.43.1", features = ["macros"] }
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.12.0"
|
||||
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
|
||||
tokio-stream = "0.1"
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
|
||||
toml = "0.8"
|
||||
toml_edit = "0.22"
|
||||
tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "gzip", "prost", "router", "server", "tls-ring", "tls-native-roots", "zstd"] }
|
||||
tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "gzip", "prost", "router", "server", "tls-ring", "tls-native-roots"] }
|
||||
tonic-reflection = { version = "0.13.1", features = ["server"] }
|
||||
tower = { version = "0.5.2", default-features = false }
|
||||
tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
|
||||
@@ -233,6 +237,9 @@ x509-cert = { version = "0.2.5" }
|
||||
env_logger = "0.11"
|
||||
log = "0.4"
|
||||
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
@@ -252,15 +259,17 @@ desim = { version = "0.1", path = "./libs/desim" }
|
||||
endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
|
||||
http-utils = { version = "0.1", path = "./libs/http-utils/" }
|
||||
metrics = { version = "0.1", path = "./libs/metrics/" }
|
||||
neonart = { version = "0.1", path = "./libs/neonart/" }
|
||||
neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
|
||||
pageserver = { path = "./pageserver" }
|
||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
||||
pageserver_client = { path = "./pageserver/client" }
|
||||
pageserver_client_grpc = { path = "./pageserver/client_grpc" }
|
||||
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
|
||||
pageserver_page_api = { path = "./pageserver/page_api" }
|
||||
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
|
||||
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
|
||||
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
|
||||
postgres_ffi_types = { version = "0.1", path = "./libs/postgres_ffi_types/" }
|
||||
postgres_initdb = { path = "./libs/postgres_initdb" }
|
||||
posthog_client_lite = { version = "0.1", path = "./libs/posthog_client_lite" }
|
||||
pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
|
||||
@@ -280,6 +289,7 @@ walproposer = { version = "0.1", path = "./libs/walproposer/" }
|
||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
|
||||
## Build dependencies
|
||||
cbindgen = "0.28.0"
|
||||
criterion = "0.5.1"
|
||||
rcgen = "0.13"
|
||||
rstest = "0.18"
|
||||
|
||||
21
Dockerfile
21
Dockerfile
@@ -5,6 +5,8 @@
|
||||
ARG REPOSITORY=ghcr.io/neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG TAG=pinned
|
||||
ARG DEFAULT_PG_VERSION=17
|
||||
ARG STABLE_PG_VERSION=16
|
||||
ARG DEBIAN_VERSION=bookworm
|
||||
ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
|
||||
|
||||
@@ -61,11 +63,14 @@ FROM $REPOSITORY/$IMAGE:$TAG AS build
|
||||
WORKDIR /home/nonroot
|
||||
ARG GIT_VERSION=local
|
||||
ARG BUILD_TAG
|
||||
ARG STABLE_PG_VERSION
|
||||
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib
|
||||
COPY --from=plan /home/nonroot/recipe.json recipe.json
|
||||
|
||||
ARG ADDITIONAL_RUSTFLAGS=""
|
||||
@@ -92,6 +97,7 @@ RUN set -e \
|
||||
# Build final image
|
||||
#
|
||||
FROM $BASE_IMAGE_SHA
|
||||
ARG DEFAULT_PG_VERSION
|
||||
WORKDIR /data
|
||||
|
||||
RUN set -e \
|
||||
@@ -101,20 +107,9 @@ RUN set -e \
|
||||
libreadline-dev \
|
||||
libseccomp-dev \
|
||||
ca-certificates \
|
||||
# System postgres for use with client libraries (e.g. in storage controller)
|
||||
postgresql-15 \
|
||||
openssl \
|
||||
unzip \
|
||||
curl \
|
||||
&& ARCH=$(uname -m) \
|
||||
&& if [ "$ARCH" = "x86_64" ]; then \
|
||||
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"; \
|
||||
elif [ "$ARCH" = "aarch64" ]; then \
|
||||
curl "https://awscli.amazonaws.com/awscli-exe-linux-aarch64.zip" -o "awscliv2.zip"; \
|
||||
else \
|
||||
echo "Unsupported architecture: $ARCH" && exit 1; \
|
||||
fi \
|
||||
&& unzip awscliv2.zip \
|
||||
&& ./aws/install \
|
||||
&& rm -rf aws awscliv2.zip \
|
||||
&& rm -f /etc/apt/apt.conf.d/80-retries \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
|
||||
&& useradd -d /data neon \
|
||||
|
||||
55
Makefile
55
Makefile
@@ -18,10 +18,12 @@ ifeq ($(BUILD_TYPE),release)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
# Unfortunately, `--profile=...` is a nightly feature
|
||||
CARGO_BUILD_FLAGS += --release
|
||||
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
|
||||
else ifeq ($(BUILD_TYPE),debug)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
|
||||
PG_CFLAGS += -O0 -g3 $(CFLAGS)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
|
||||
else
|
||||
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
|
||||
endif
|
||||
@@ -167,17 +169,29 @@ postgres-%: postgres-configure-% \
|
||||
+@echo "Compiling test_decoding $*"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install
|
||||
|
||||
.PHONY: postgres-clean-%
|
||||
postgres-clean-%:
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 clean
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache clean
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean
|
||||
|
||||
.PHONY: postgres-check-%
|
||||
postgres-check-%: postgres-%
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 check
|
||||
|
||||
.PHONY: neon-pg-ext-%
|
||||
neon-pg-ext-%: postgres-%
|
||||
+@echo "Compiling communicator $*"
|
||||
$(CARGO_CMD_PREFIX) cargo build -p communicator $(CARGO_BUILD_FLAGS)
|
||||
|
||||
+@echo "Compiling neon $*"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
LIBCOMMUNICATOR_PATH=$(NEON_CARGO_ARTIFACT_TARGET_DIR) \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
|
||||
|
||||
+@echo "Compiling neon_walredo $*"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
@@ -199,6 +213,21 @@ neon-pg-ext-%: postgres-%
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
|
||||
|
||||
.PHONY: neon-pg-clean-ext-%
|
||||
neon-pg-clean-ext-%:
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
|
||||
|
||||
# Build walproposer as a static library. walproposer source code is located
|
||||
# in the pgxn/neon directory.
|
||||
#
|
||||
@@ -231,6 +260,12 @@ ifeq ($(UNAME_S),Linux)
|
||||
pg_crc32c.o
|
||||
endif
|
||||
|
||||
.PHONY: walproposer-lib-clean
|
||||
walproposer-lib-clean:
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||
|
||||
.PHONY: neon-pg-ext
|
||||
neon-pg-ext: \
|
||||
neon-pg-ext-v14 \
|
||||
@@ -238,6 +273,13 @@ neon-pg-ext: \
|
||||
neon-pg-ext-v16 \
|
||||
neon-pg-ext-v17
|
||||
|
||||
.PHONY: neon-pg-clean-ext
|
||||
neon-pg-clean-ext: \
|
||||
neon-pg-clean-ext-v14 \
|
||||
neon-pg-clean-ext-v15 \
|
||||
neon-pg-clean-ext-v16 \
|
||||
neon-pg-clean-ext-v17
|
||||
|
||||
# shorthand to build all Postgres versions
|
||||
.PHONY: postgres
|
||||
postgres: \
|
||||
@@ -253,6 +295,13 @@ postgres-headers: \
|
||||
postgres-headers-v16 \
|
||||
postgres-headers-v17
|
||||
|
||||
.PHONY: postgres-clean
|
||||
postgres-clean: \
|
||||
postgres-clean-v14 \
|
||||
postgres-clean-v15 \
|
||||
postgres-clean-v16 \
|
||||
postgres-clean-v17
|
||||
|
||||
.PHONY: postgres-check
|
||||
postgres-check: \
|
||||
postgres-check-v14 \
|
||||
@@ -260,6 +309,12 @@ postgres-check: \
|
||||
postgres-check-v16 \
|
||||
postgres-check-v17
|
||||
|
||||
# This doesn't remove the effects of 'configure'.
|
||||
.PHONY: clean
|
||||
clean: postgres-clean neon-pg-clean-ext
|
||||
$(MAKE) -C compute clean
|
||||
$(CARGO_CMD_PREFIX) cargo clean
|
||||
|
||||
# This removes everything
|
||||
.PHONY: distclean
|
||||
distclean:
|
||||
|
||||
3
compute/.gitignore
vendored
3
compute/.gitignore
vendored
@@ -3,6 +3,3 @@ etc/neon_collector.yml
|
||||
etc/neon_collector_autoscaling.yml
|
||||
etc/sql_exporter.yml
|
||||
etc/sql_exporter_autoscaling.yml
|
||||
|
||||
# Node.js dependencies
|
||||
node_modules/
|
||||
|
||||
@@ -48,11 +48,3 @@ jsonnetfmt-test:
|
||||
.PHONY: jsonnetfmt-format
|
||||
jsonnetfmt-format:
|
||||
jsonnetfmt --in-place $(jsonnet_files)
|
||||
|
||||
.PHONY: manifest-schema-validation
|
||||
manifest-schema-validation: node_modules
|
||||
node_modules/.bin/jsonschema validate -d https://json-schema.org/draft/2020-12/schema manifest.schema.json manifest.yaml
|
||||
|
||||
node_modules: package.json
|
||||
npm install
|
||||
touch node_modules
|
||||
|
||||
@@ -149,10 +149,8 @@ RUN case $DEBIAN_VERSION in \
|
||||
ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \
|
||||
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \
|
||||
libclang-dev \
|
||||
$VERSION_INSTALLS \
|
||||
&& apt clean && rm -rf /var/lib/apt/lists/* && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
&& apt clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -1059,10 +1057,17 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "build-deps with Rust toolchain installed"
|
||||
# Layer "pg build with nonroot user and cargo installed"
|
||||
# This layer is base and common for layers with `pgrx`
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS build-deps-with-cargo
|
||||
FROM pg-build AS pg-build-nonroot-with-cargo
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
|
||||
apt clean && rm -rf /var/lib/apt/lists/* && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
|
||||
ENV HOME=/home/nonroot
|
||||
ENV PATH="/home/nonroot/.cargo/bin:$PATH"
|
||||
@@ -1077,29 +1082,13 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
||||
rm rustup-init
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-build with Rust toolchain installed"
|
||||
# This layer is base and common for layers with `pgrx`
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-build-with-cargo
|
||||
ARG PG_VERSION
|
||||
|
||||
ENV HOME=/home/nonroot
|
||||
ENV PATH="/home/nonroot/.cargo/bin:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
COPY --from=build-deps-with-cargo /home/nonroot /home/nonroot
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rust extensions"
|
||||
# This layer is used to build `pgrx` deps
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build-with-cargo AS rust-extensions-build
|
||||
FROM pg-build-nonroot-with-cargo AS rust-extensions-build
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN case "${PG_VERSION:?}" in \
|
||||
@@ -1121,7 +1110,7 @@ USER root
|
||||
# and eventually get merged with `rust-extensions-build`
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build-with-cargo AS rust-extensions-build-pgrx12
|
||||
FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx12
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN cargo install --locked --version 0.12.9 cargo-pgrx && \
|
||||
@@ -1138,7 +1127,7 @@ USER root
|
||||
# and eventually get merged with `rust-extensions-build`
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build-with-cargo AS rust-extensions-build-pgrx14
|
||||
FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN cargo install --locked --version 0.14.1 cargo-pgrx && \
|
||||
@@ -1155,12 +1144,10 @@ USER root
|
||||
|
||||
FROM build-deps AS pgrag-src
|
||||
ARG PG_VERSION
|
||||
WORKDIR /ext-src
|
||||
COPY compute/patches/onnxruntime.patch .
|
||||
|
||||
WORKDIR /ext-src
|
||||
RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \
|
||||
mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /ext-src/onnxruntime.patch && \
|
||||
echo "#nothing to test here" > neon-test.sh
|
||||
|
||||
RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.2.tar.gz -O pgrag.tar.gz && \
|
||||
|
||||
@@ -21,8 +21,6 @@ unix_socket_dir=/tmp/
|
||||
unix_socket_mode=0777
|
||||
; required for pgbouncer_exporter
|
||||
ignore_startup_parameters=extra_float_digits
|
||||
; pidfile for graceful termination
|
||||
pidfile=/tmp/pgbouncer.pid
|
||||
|
||||
;; Disable connection logging. It produces a lot of logs that no one looks at,
|
||||
;; and we can get similar log entries from the proxy too. We had incidents in
|
||||
|
||||
@@ -1,209 +0,0 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"title": "Neon Compute Manifest Schema",
|
||||
"description": "Schema for Neon compute node configuration manifest",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"pg_settings": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"common": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"client_connection_check_interval": {
|
||||
"type": "string",
|
||||
"description": "Check for client disconnection interval in milliseconds"
|
||||
},
|
||||
"effective_io_concurrency": {
|
||||
"type": "string",
|
||||
"description": "Effective IO concurrency setting"
|
||||
},
|
||||
"fsync": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether to force fsync to disk"
|
||||
},
|
||||
"hot_standby": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether hot standby is enabled"
|
||||
},
|
||||
"idle_in_transaction_session_timeout": {
|
||||
"type": "string",
|
||||
"description": "Timeout for idle transactions in milliseconds"
|
||||
},
|
||||
"listen_addresses": {
|
||||
"type": "string",
|
||||
"description": "Addresses to listen on"
|
||||
},
|
||||
"log_connections": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether to log connections"
|
||||
},
|
||||
"log_disconnections": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether to log disconnections"
|
||||
},
|
||||
"log_temp_files": {
|
||||
"type": "string",
|
||||
"description": "Size threshold for logging temporary files in KB"
|
||||
},
|
||||
"log_error_verbosity": {
|
||||
"type": "string",
|
||||
"enum": ["terse", "verbose", "default"],
|
||||
"description": "Error logging verbosity level"
|
||||
},
|
||||
"log_min_error_statement": {
|
||||
"type": "string",
|
||||
"description": "Minimum error level for statement logging"
|
||||
},
|
||||
"maintenance_io_concurrency": {
|
||||
"type": "string",
|
||||
"description": "Maintenance IO concurrency setting"
|
||||
},
|
||||
"max_connections": {
|
||||
"type": "string",
|
||||
"description": "Maximum number of connections"
|
||||
},
|
||||
"max_replication_flush_lag": {
|
||||
"type": "string",
|
||||
"description": "Maximum replication flush lag"
|
||||
},
|
||||
"max_replication_slots": {
|
||||
"type": "string",
|
||||
"description": "Maximum number of replication slots"
|
||||
},
|
||||
"max_replication_write_lag": {
|
||||
"type": "string",
|
||||
"description": "Maximum replication write lag"
|
||||
},
|
||||
"max_wal_senders": {
|
||||
"type": "string",
|
||||
"description": "Maximum number of WAL senders"
|
||||
},
|
||||
"max_wal_size": {
|
||||
"type": "string",
|
||||
"description": "Maximum WAL size"
|
||||
},
|
||||
"neon.unstable_extensions": {
|
||||
"type": "string",
|
||||
"description": "List of unstable extensions"
|
||||
},
|
||||
"neon.protocol_version": {
|
||||
"type": "string",
|
||||
"description": "Neon protocol version"
|
||||
},
|
||||
"password_encryption": {
|
||||
"type": "string",
|
||||
"description": "Password encryption method"
|
||||
},
|
||||
"restart_after_crash": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether to restart after crash"
|
||||
},
|
||||
"superuser_reserved_connections": {
|
||||
"type": "string",
|
||||
"description": "Number of reserved connections for superuser"
|
||||
},
|
||||
"synchronous_standby_names": {
|
||||
"type": "string",
|
||||
"description": "Names of synchronous standby servers"
|
||||
},
|
||||
"wal_keep_size": {
|
||||
"type": "string",
|
||||
"description": "WAL keep size"
|
||||
},
|
||||
"wal_level": {
|
||||
"type": "string",
|
||||
"description": "WAL level"
|
||||
},
|
||||
"wal_log_hints": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether to log hints in WAL"
|
||||
},
|
||||
"wal_sender_timeout": {
|
||||
"type": "string",
|
||||
"description": "WAL sender timeout in milliseconds"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"client_connection_check_interval",
|
||||
"effective_io_concurrency",
|
||||
"fsync",
|
||||
"hot_standby",
|
||||
"idle_in_transaction_session_timeout",
|
||||
"listen_addresses",
|
||||
"log_connections",
|
||||
"log_disconnections",
|
||||
"log_temp_files",
|
||||
"log_error_verbosity",
|
||||
"log_min_error_statement",
|
||||
"maintenance_io_concurrency",
|
||||
"max_connections",
|
||||
"max_replication_flush_lag",
|
||||
"max_replication_slots",
|
||||
"max_replication_write_lag",
|
||||
"max_wal_senders",
|
||||
"max_wal_size",
|
||||
"neon.unstable_extensions",
|
||||
"neon.protocol_version",
|
||||
"password_encryption",
|
||||
"restart_after_crash",
|
||||
"superuser_reserved_connections",
|
||||
"synchronous_standby_names",
|
||||
"wal_keep_size",
|
||||
"wal_level",
|
||||
"wal_log_hints",
|
||||
"wal_sender_timeout"
|
||||
]
|
||||
},
|
||||
"replica": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"hot_standby": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether hot standby is enabled for replicas"
|
||||
}
|
||||
},
|
||||
"required": ["hot_standby"]
|
||||
},
|
||||
"per_version": {
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
"^1[4-7]$": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"common": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"io_combine_limit": {
|
||||
"type": "string",
|
||||
"description": "IO combine limit"
|
||||
}
|
||||
}
|
||||
},
|
||||
"replica": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"recovery_prefetch": {
|
||||
"type": "string",
|
||||
"enum": ["on", "off"],
|
||||
"description": "Whether to enable recovery prefetch for PostgreSQL replicas"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["common", "replica", "per_version"]
|
||||
}
|
||||
},
|
||||
"required": ["pg_settings"]
|
||||
}
|
||||
@@ -105,17 +105,17 @@ pg_settings:
|
||||
# Neon hot standby ignores pages that are not in the shared_buffers
|
||||
recovery_prefetch: "off"
|
||||
16:
|
||||
common: {}
|
||||
common:
|
||||
replica:
|
||||
# prefetching of blocks referenced in WAL doesn't make sense for us
|
||||
# Neon hot standby ignores pages that are not in the shared_buffers
|
||||
recovery_prefetch: "off"
|
||||
15:
|
||||
common: {}
|
||||
common:
|
||||
replica:
|
||||
# prefetching of blocks referenced in WAL doesn't make sense for us
|
||||
# Neon hot standby ignores pages that are not in the shared_buffers
|
||||
recovery_prefetch: "off"
|
||||
14:
|
||||
common: {}
|
||||
replica: {}
|
||||
common:
|
||||
replica:
|
||||
|
||||
37
compute/package-lock.json
generated
37
compute/package-lock.json
generated
@@ -1,37 +0,0 @@
|
||||
{
|
||||
"name": "neon-compute",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "neon-compute",
|
||||
"dependencies": {
|
||||
"@sourcemeta/jsonschema": "9.3.4"
|
||||
}
|
||||
},
|
||||
"node_modules/@sourcemeta/jsonschema": {
|
||||
"version": "9.3.4",
|
||||
"resolved": "https://registry.npmjs.org/@sourcemeta/jsonschema/-/jsonschema-9.3.4.tgz",
|
||||
"integrity": "sha512-hkujfkZAIGXUs4U//We9faZW8LZ4/H9LqagRYsFSulH/VLcKPNhZyCTGg7AhORuzm27zqENvKpnX4g2FzudYFw==",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
],
|
||||
"license": "AGPL-3.0",
|
||||
"os": [
|
||||
"darwin",
|
||||
"linux",
|
||||
"win32"
|
||||
],
|
||||
"bin": {
|
||||
"jsonschema": "cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=16"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/sourcemeta"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,7 +0,0 @@
|
||||
{
|
||||
"name": "neon-compute",
|
||||
"private": true,
|
||||
"dependencies": {
|
||||
"@sourcemeta/jsonschema": "9.3.4"
|
||||
}
|
||||
}
|
||||
@@ -1,15 +0,0 @@
|
||||
diff --git a/cmake/deps.txt b/cmake/deps.txt
|
||||
index d213b09034..229de2ebf0 100644
|
||||
--- a/cmake/deps.txt
|
||||
+++ b/cmake/deps.txt
|
||||
@@ -22,7 +22,9 @@ dlpack;https://github.com/dmlc/dlpack/archive/refs/tags/v0.6.zip;4d565dd2e5b3132
|
||||
# it contains changes on top of 3.4.0 which are required to fix build issues.
|
||||
# Until the 3.4.1 release this is the best option we have.
|
||||
# Issue link: https://gitlab.com/libeigen/eigen/-/issues/2744
|
||||
-eigen;https://gitlab.com/libeigen/eigen/-/archive/e7248b26a1ed53fa030c5c459f7ea095dfd276ac/eigen-e7248b26a1ed53fa030c5c459f7ea095dfd276ac.zip;be8be39fdbc6e60e94fa7870b280707069b5b81a
|
||||
+# Moved to github mirror to avoid gitlab issues.Add commentMore actions
|
||||
+# Issue link: https://github.com/bazelbuild/bazel-central-registry/issues/4355
|
||||
+eigen;https://github.com/eigen-mirror/eigen/archive/e7248b26a1ed53fa030c5c459f7ea095dfd276ac/eigen-e7248b26a1ed53fa030c5c459f7ea095dfd276ac.zip;61418a349000ba7744a3ad03cf5071f22ebf860a
|
||||
flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v23.5.26.zip;59422c3b5e573dd192fead2834d25951f1c1670c
|
||||
fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494
|
||||
fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1
|
||||
@@ -38,6 +38,7 @@ once_cell.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
opentelemetry_sdk.workspace = true
|
||||
p256 = { version = "0.13", features = ["pem"] }
|
||||
pageserver_page_api.workspace = true
|
||||
postgres.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
@@ -53,6 +54,7 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tonic.workspace = true
|
||||
tower-otel.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
|
||||
@@ -124,10 +124,6 @@ struct Cli {
|
||||
/// Interval in seconds for collecting installed extensions statistics
|
||||
#[arg(long, default_value = "3600")]
|
||||
pub installed_extensions_collection_interval: u64,
|
||||
|
||||
/// Run in development mode, skipping VM-specific operations like process termination
|
||||
#[arg(long, action = clap::ArgAction::SetTrue)]
|
||||
pub dev: bool,
|
||||
}
|
||||
|
||||
impl Cli {
|
||||
@@ -163,7 +159,7 @@ fn main() -> Result<()> {
|
||||
.build()?;
|
||||
let _rt_guard = runtime.enter();
|
||||
|
||||
runtime.block_on(init(cli.dev))?;
|
||||
runtime.block_on(init())?;
|
||||
|
||||
// enable core dumping for all child processes
|
||||
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
|
||||
@@ -202,13 +198,13 @@ fn main() -> Result<()> {
|
||||
deinit_and_exit(exit_code);
|
||||
}
|
||||
|
||||
async fn init(dev_mode: bool) -> Result<()> {
|
||||
async fn init() -> Result<()> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?;
|
||||
|
||||
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
|
||||
thread::spawn(move || {
|
||||
for sig in signals.forever() {
|
||||
handle_exit_signal(sig, dev_mode);
|
||||
handle_exit_signal(sig);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -267,9 +263,9 @@ fn deinit_and_exit(exit_code: Option<i32>) -> ! {
|
||||
/// When compute_ctl is killed, send also termination signal to sync-safekeepers
|
||||
/// to prevent leakage. TODO: it is better to convert compute_ctl to async and
|
||||
/// wait for termination which would be easy then.
|
||||
fn handle_exit_signal(sig: i32, dev_mode: bool) {
|
||||
fn handle_exit_signal(sig: i32) {
|
||||
info!("received {sig} termination signal");
|
||||
forward_termination_signal(dev_mode);
|
||||
forward_termination_signal();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::{Context, Result, anyhow};
|
||||
use chrono::{DateTime, Utc};
|
||||
use compute_api::privilege::Privilege;
|
||||
use compute_api::responses::{
|
||||
@@ -15,6 +15,7 @@ use itertools::Itertools;
|
||||
use nix::sys::signal::{Signal, kill};
|
||||
use nix::unistd::Pid;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_page_api as page_api;
|
||||
use postgres;
|
||||
use postgres::NoTls;
|
||||
use postgres::error::SqlState;
|
||||
@@ -29,13 +30,14 @@ use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Condvar, Mutex, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{env, fs};
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio::spawn;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tracing::{Instrument, debug, error, info, instrument, warn};
|
||||
use url::Url;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
use utils::pid_file;
|
||||
|
||||
use crate::configurator::launch_configurator;
|
||||
use crate::disk_quota::set_disk_quota;
|
||||
@@ -45,7 +47,6 @@ use crate::lsn_lease::launch_lsn_lease_bg_task_for_static;
|
||||
use crate::metrics::COMPUTE_CTL_UP;
|
||||
use crate::monitor::launch_monitor;
|
||||
use crate::pg_helpers::*;
|
||||
use crate::pgbouncer::*;
|
||||
use crate::rsyslog::{
|
||||
PostgresLogsRsyslogConfig, configure_audit_rsyslog, configure_postgres_logs_export,
|
||||
launch_pgaudit_gc,
|
||||
@@ -163,10 +164,6 @@ pub struct ComputeState {
|
||||
pub lfc_prewarm_state: LfcPrewarmState,
|
||||
pub lfc_offload_state: LfcOffloadState,
|
||||
|
||||
/// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
|
||||
/// mode == ComputeMode::Primary. None otherwise
|
||||
pub terminate_flush_lsn: Option<Lsn>,
|
||||
|
||||
pub metrics: ComputeMetrics,
|
||||
}
|
||||
|
||||
@@ -182,7 +179,6 @@ impl ComputeState {
|
||||
metrics: ComputeMetrics::default(),
|
||||
lfc_prewarm_state: LfcPrewarmState::default(),
|
||||
lfc_offload_state: LfcOffloadState::default(),
|
||||
terminate_flush_lsn: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -222,46 +218,6 @@ pub struct ParsedSpec {
|
||||
pub endpoint_storage_token: Option<String>,
|
||||
}
|
||||
|
||||
impl ParsedSpec {
|
||||
pub fn validate(&self) -> Result<(), String> {
|
||||
// Only Primary nodes are using safekeeper_connstrings, and at the moment
|
||||
// this method only validates that part of the specs.
|
||||
if self.spec.mode != ComputeMode::Primary {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// While it seems like a good idea to check for an odd number of entries in
|
||||
// the safekeepers connection string, changes to the list of safekeepers might
|
||||
// incur appending a new server to a list of 3, in which case a list of 4
|
||||
// entries is okay in production.
|
||||
//
|
||||
// Still we want unique entries, and at least one entry in the vector
|
||||
if self.safekeeper_connstrings.is_empty() {
|
||||
return Err(String::from("safekeeper_connstrings is empty"));
|
||||
}
|
||||
|
||||
// check for uniqueness of the connection strings in the set
|
||||
let mut connstrings = self.safekeeper_connstrings.clone();
|
||||
|
||||
connstrings.sort();
|
||||
let mut previous = &connstrings[0];
|
||||
|
||||
for current in connstrings.iter().skip(1) {
|
||||
// duplicate entry?
|
||||
if current == previous {
|
||||
return Err(format!(
|
||||
"duplicate entry in safekeeper_connstrings: {}!",
|
||||
current,
|
||||
));
|
||||
}
|
||||
|
||||
previous = current;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
type Error = String;
|
||||
fn try_from(spec: ComputeSpec) -> Result<Self, String> {
|
||||
@@ -291,7 +247,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
} else {
|
||||
spec.safekeeper_connstrings.clone()
|
||||
};
|
||||
|
||||
let storage_auth_token = spec.storage_auth_token.clone();
|
||||
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
|
||||
tenant_id
|
||||
@@ -326,7 +281,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
.clone()
|
||||
.or_else(|| spec.cluster.settings.find("neon.endpoint_storage_token"));
|
||||
|
||||
let res = ParsedSpec {
|
||||
Ok(ParsedSpec {
|
||||
spec,
|
||||
pageserver_connstr,
|
||||
safekeeper_connstrings,
|
||||
@@ -335,11 +290,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
timeline_id,
|
||||
endpoint_storage_addr,
|
||||
endpoint_storage_token,
|
||||
};
|
||||
|
||||
// Now check validity of the parsed specification
|
||||
res.validate()?;
|
||||
Ok(res)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -406,6 +357,11 @@ impl ComputeNode {
|
||||
// that can affect `compute_ctl` and prevent it from properly configuring the database schema.
|
||||
// Unset them via connection string options before connecting to the database.
|
||||
// N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`.
|
||||
//
|
||||
// TODO(ololobus): we currently pass `-c default_transaction_read_only=off` from control plane
|
||||
// as well. After rolling out this code, we can remove this parameter from control plane.
|
||||
// In the meantime, double-passing is fine, the last value is applied.
|
||||
// See: <https://github.com/neondatabase/cloud/blob/133dd8c4dbbba40edfbad475bf6a45073ca63faf/goapp/controlplane/internal/pkg/compute/provisioner/provisioner_common.go#L70>
|
||||
const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0";
|
||||
let options = match conn_conf.get_options() {
|
||||
Some(options) => format!("{} {}", options, EXTRA_OPTIONS),
|
||||
@@ -416,7 +372,7 @@ impl ComputeNode {
|
||||
|
||||
let mut new_state = ComputeState::new();
|
||||
if let Some(spec) = config.spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow!(msg))?;
|
||||
new_state.pspec = Some(pspec);
|
||||
}
|
||||
|
||||
@@ -536,21 +492,12 @@ impl ComputeNode {
|
||||
// Reap the postgres process
|
||||
delay_exit |= this.cleanup_after_postgres_exit()?;
|
||||
|
||||
// /terminate returns LSN. If we don't sleep at all, connection will break and we
|
||||
// won't get result. If we sleep too much, tests will take significantly longer
|
||||
// and Github Action run will error out
|
||||
let sleep_duration = if delay_exit {
|
||||
Duration::from_secs(30)
|
||||
} else {
|
||||
Duration::from_millis(300)
|
||||
};
|
||||
|
||||
// If launch failed, keep serving HTTP requests for a while, so the cloud
|
||||
// control plane can get the actual error.
|
||||
if delay_exit {
|
||||
info!("giving control plane 30s to collect the error before shutdown");
|
||||
std::thread::sleep(Duration::from_secs(30));
|
||||
}
|
||||
std::thread::sleep(sleep_duration);
|
||||
Ok(exit_code)
|
||||
}
|
||||
|
||||
@@ -841,7 +788,7 @@ impl ComputeNode {
|
||||
self.spawn_extension_stats_task();
|
||||
|
||||
if pspec.spec.autoprewarm {
|
||||
self.prewarm_lfc(None);
|
||||
self.prewarm_lfc();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -922,25 +869,20 @@ impl ComputeNode {
|
||||
// Maybe sync safekeepers again, to speed up next startup
|
||||
let compute_state = self.state.lock().unwrap().clone();
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let lsn = if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
|
||||
if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
|
||||
info!("syncing safekeepers on shutdown");
|
||||
let storage_auth_token = pspec.storage_auth_token.clone();
|
||||
let lsn = self.sync_safekeepers(storage_auth_token)?;
|
||||
info!(%lsn, "synced safekeepers");
|
||||
Some(lsn)
|
||||
} else {
|
||||
info!("not primary, not syncing safekeepers");
|
||||
None
|
||||
};
|
||||
info!("synced safekeepers at lsn {lsn}");
|
||||
}
|
||||
|
||||
let mut delay_exit = false;
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.terminate_flush_lsn = lsn;
|
||||
if let ComputeStatus::TerminationPending { mode } = state.status {
|
||||
if state.status == ComputeStatus::TerminationPending {
|
||||
state.status = ComputeStatus::Terminated;
|
||||
self.state_changed.notify_all();
|
||||
// we were asked to terminate gracefully, don't exit to avoid restart
|
||||
delay_exit = mode == compute_api::responses::TerminateMode::Fast
|
||||
delay_exit = true
|
||||
}
|
||||
drop(state);
|
||||
|
||||
@@ -1002,6 +944,77 @@ impl ComputeNode {
|
||||
#[instrument(skip_all, fields(%lsn))]
|
||||
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
|
||||
match Url::parse(shard0_connstr)?.scheme() {
|
||||
"postgres" | "postgresql" => self.try_get_basebackup_libpq(spec, lsn),
|
||||
"grpc" => self.try_get_basebackup_grpc(spec, lsn),
|
||||
scheme => return Err(anyhow!("unknown URL scheme {scheme}")),
|
||||
}
|
||||
}
|
||||
|
||||
fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<()> {
|
||||
let start_time = Instant::now();
|
||||
|
||||
let shard0_connstr = spec
|
||||
.pageserver_connstr
|
||||
.split(',')
|
||||
.next()
|
||||
.unwrap()
|
||||
.to_string();
|
||||
|
||||
let chunks = tokio::runtime::Handle::current().block_on(async move {
|
||||
let mut client = page_api::proto::PageServiceClient::connect(shard0_connstr).await?;
|
||||
|
||||
let req = page_api::proto::GetBaseBackupRequest {
|
||||
read_lsn: Some(page_api::proto::ReadLsn {
|
||||
request_lsn: lsn.0,
|
||||
not_modified_since_lsn: 0,
|
||||
}),
|
||||
replica: false, // TODO: handle replicas, with LSN 0
|
||||
};
|
||||
let mut req = tonic::Request::new(req);
|
||||
let metadata = req.metadata_mut();
|
||||
metadata.insert("neon-tenant-id", spec.tenant_id.to_string().parse()?);
|
||||
metadata.insert("neon-timeline-id", spec.timeline_id.to_string().parse()?);
|
||||
metadata.insert("neon-shard-id", "0000".to_string().parse()?); // TODO: shard count
|
||||
if let Some(auth) = spec.storage_auth_token.as_ref() {
|
||||
metadata.insert("authorization", format!("Bearer {auth}").parse()?);
|
||||
}
|
||||
|
||||
let chunks = client.get_base_backup(req).await?.into_inner();
|
||||
anyhow::Ok(chunks)
|
||||
})?;
|
||||
let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
|
||||
|
||||
// Convert the chunks stream into an AsyncRead
|
||||
let stream_reader = StreamReader::new(
|
||||
chunks.map(|chunk| chunk.map(|c| c.chunk).map_err(std::io::Error::other)),
|
||||
);
|
||||
|
||||
// Wrap the AsyncRead into a blocking reader for compatibility with tar::Archive
|
||||
let reader = tokio_util::io::SyncIoBridge::new(stream_reader);
|
||||
let mut measured_reader = MeasuredReader::new(reader);
|
||||
let mut bufreader = std::io::BufReader::new(&mut measured_reader);
|
||||
|
||||
// Read the archive directly from the `CopyOutReader`
|
||||
//
|
||||
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
||||
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
||||
// sends an Error after finishing the tarball, we will not notice it.
|
||||
let mut ar = tar::Archive::new(&mut bufreader);
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.params.pgdata)?;
|
||||
|
||||
// Report metrics
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.pageserver_connect_micros = pageserver_connect_micros;
|
||||
state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
|
||||
state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<()> {
|
||||
let start_time = Instant::now();
|
||||
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
@@ -1017,12 +1030,10 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
config.application_name("compute_ctl");
|
||||
if let Some(spec) = &compute_state.pspec {
|
||||
config.options(&format!(
|
||||
"-c neon.compute_mode={}",
|
||||
spec.spec.mode.to_type_str()
|
||||
));
|
||||
}
|
||||
config.options(&format!(
|
||||
"-c neon.compute_mode={}",
|
||||
spec.spec.mode.to_type_str()
|
||||
));
|
||||
|
||||
// Connect to pageserver
|
||||
let mut client = config.connect(NoTls)?;
|
||||
@@ -1096,10 +1107,7 @@ impl ComputeNode {
|
||||
return result;
|
||||
}
|
||||
Err(ref e) if attempts < max_attempts => {
|
||||
warn!(
|
||||
"Failed to get basebackup: {} (attempt {}/{})",
|
||||
e, attempts, max_attempts
|
||||
);
|
||||
warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
|
||||
std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
|
||||
retry_period_ms *= 1.5;
|
||||
}
|
||||
@@ -1811,7 +1819,7 @@ impl ComputeNode {
|
||||
|
||||
// exit loop
|
||||
ComputeStatus::Failed
|
||||
| ComputeStatus::TerminationPending { .. }
|
||||
| ComputeStatus::TerminationPending
|
||||
| ComputeStatus::Terminated => break 'cert_update,
|
||||
|
||||
// wait
|
||||
@@ -1977,7 +1985,7 @@ LIMIT 100",
|
||||
self.params
|
||||
.remote_ext_base_url
|
||||
.as_ref()
|
||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||
.ok_or(DownloadError::BadInput(anyhow!(
|
||||
"Remote extensions storage is not configured",
|
||||
)))?;
|
||||
|
||||
@@ -2173,7 +2181,7 @@ LIMIT 100",
|
||||
let remote_extensions = spec
|
||||
.remote_extensions
|
||||
.as_ref()
|
||||
.ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;
|
||||
.ok_or(anyhow!("Remote extensions are not configured"))?;
|
||||
|
||||
info!("parse shared_preload_libraries from spec.cluster.settings");
|
||||
let mut libs_vec = Vec::new();
|
||||
@@ -2312,68 +2320,12 @@ pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn forward_termination_signal(dev_mode: bool) {
|
||||
pub fn forward_termination_signal() {
|
||||
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
|
||||
if ss_pid != 0 {
|
||||
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
|
||||
kill(ss_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
|
||||
if !dev_mode {
|
||||
// Terminate pgbouncer with SIGKILL
|
||||
match pid_file::read(PGBOUNCER_PIDFILE.into()) {
|
||||
Ok(pid_file::PidFileRead::LockedByOtherProcess(pid)) => {
|
||||
info!("sending SIGKILL to pgbouncer process pid: {}", pid);
|
||||
if let Err(e) = kill(pid, Signal::SIGKILL) {
|
||||
error!("failed to terminate pgbouncer: {}", e);
|
||||
}
|
||||
}
|
||||
// pgbouncer does not lock the pid file, so we read and kill the process directly
|
||||
Ok(pid_file::PidFileRead::NotHeldByAnyProcess(_)) => {
|
||||
if let Ok(pid_str) = std::fs::read_to_string(PGBOUNCER_PIDFILE) {
|
||||
if let Ok(pid) = pid_str.trim().parse::<i32>() {
|
||||
info!(
|
||||
"sending SIGKILL to pgbouncer process pid: {} (from unlocked pid file)",
|
||||
pid
|
||||
);
|
||||
if let Err(e) = kill(Pid::from_raw(pid), Signal::SIGKILL) {
|
||||
error!("failed to terminate pgbouncer: {}", e);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
info!("pgbouncer pid file exists but process not running");
|
||||
}
|
||||
}
|
||||
Ok(pid_file::PidFileRead::NotExist) => {
|
||||
info!("pgbouncer pid file not found, process may not be running");
|
||||
}
|
||||
Err(e) => {
|
||||
error!("error reading pgbouncer pid file: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
// Terminate local_proxy
|
||||
match pid_file::read("/etc/local_proxy/pid".into()) {
|
||||
Ok(pid_file::PidFileRead::LockedByOtherProcess(pid)) => {
|
||||
info!("sending SIGTERM to local_proxy process pid: {}", pid);
|
||||
if let Err(e) = kill(pid, Signal::SIGTERM) {
|
||||
error!("failed to terminate local_proxy: {}", e);
|
||||
}
|
||||
}
|
||||
Ok(pid_file::PidFileRead::NotHeldByAnyProcess(_)) => {
|
||||
info!("local_proxy PID file exists but process not running");
|
||||
}
|
||||
Ok(pid_file::PidFileRead::NotExist) => {
|
||||
info!("local_proxy PID file not found, process may not be running");
|
||||
}
|
||||
Err(e) => {
|
||||
error!("error reading local_proxy PID file: {}", e);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
info!("Skipping pgbouncer and local_proxy termination because in dev mode");
|
||||
}
|
||||
|
||||
let pg_pid = PG_PID.load(Ordering::SeqCst);
|
||||
if pg_pid != 0 {
|
||||
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
|
||||
@@ -2406,21 +2358,3 @@ impl<T: 'static> JoinSetExt<T> for tokio::task::JoinSet<T> {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::fs::File;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn duplicate_safekeeper_connstring() {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
|
||||
|
||||
match ParsedSpec::try_from(spec.clone()) {
|
||||
Ok(_p) => panic!("Failed to detect duplicate entry"),
|
||||
Err(e) => assert!(e.starts_with("duplicate entry in safekeeper_connstrings:")),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,16 +25,11 @@ struct EndpointStoragePair {
|
||||
}
|
||||
|
||||
const KEY: &str = "lfc_state";
|
||||
impl EndpointStoragePair {
|
||||
/// endpoint_id is set to None while prewarming from other endpoint, see replica promotion
|
||||
/// If not None, takes precedence over pspec.spec.endpoint_id
|
||||
fn from_spec_and_endpoint(
|
||||
pspec: &crate::compute::ParsedSpec,
|
||||
endpoint_id: Option<String>,
|
||||
) -> Result<Self> {
|
||||
let endpoint_id = endpoint_id.as_ref().or(pspec.spec.endpoint_id.as_ref());
|
||||
let Some(ref endpoint_id) = endpoint_id else {
|
||||
bail!("pspec.endpoint_id missing, other endpoint_id not provided")
|
||||
impl TryFrom<&crate::compute::ParsedSpec> for EndpointStoragePair {
|
||||
type Error = anyhow::Error;
|
||||
fn try_from(pspec: &crate::compute::ParsedSpec) -> Result<Self, Self::Error> {
|
||||
let Some(ref endpoint_id) = pspec.spec.endpoint_id else {
|
||||
bail!("pspec.endpoint_id missing")
|
||||
};
|
||||
let Some(ref base_uri) = pspec.endpoint_storage_addr else {
|
||||
bail!("pspec.endpoint_storage_addr missing")
|
||||
@@ -89,7 +84,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
/// Returns false if there is a prewarm request ongoing, true otherwise
|
||||
pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
|
||||
pub fn prewarm_lfc(self: &Arc<Self>) -> bool {
|
||||
crate::metrics::LFC_PREWARM_REQUESTS.inc();
|
||||
{
|
||||
let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
|
||||
@@ -102,7 +97,7 @@ impl ComputeNode {
|
||||
|
||||
let cloned = self.clone();
|
||||
spawn(async move {
|
||||
let Err(err) = cloned.prewarm_impl(from_endpoint).await else {
|
||||
let Err(err) = cloned.prewarm_impl().await else {
|
||||
cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed;
|
||||
return;
|
||||
};
|
||||
@@ -114,14 +109,13 @@ impl ComputeNode {
|
||||
true
|
||||
}
|
||||
|
||||
/// from_endpoint: None for endpoint managed by this compute_ctl
|
||||
fn endpoint_storage_pair(&self, from_endpoint: Option<String>) -> Result<EndpointStoragePair> {
|
||||
fn endpoint_storage_pair(&self) -> Result<EndpointStoragePair> {
|
||||
let state = self.state.lock().unwrap();
|
||||
EndpointStoragePair::from_spec_and_endpoint(state.pspec.as_ref().unwrap(), from_endpoint)
|
||||
state.pspec.as_ref().unwrap().try_into()
|
||||
}
|
||||
|
||||
async fn prewarm_impl(&self, from_endpoint: Option<String>) -> Result<()> {
|
||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?;
|
||||
async fn prewarm_impl(&self) -> Result<()> {
|
||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?;
|
||||
info!(%url, "requesting LFC state from endpoint storage");
|
||||
|
||||
let request = Client::new().get(&url).bearer_auth(token);
|
||||
@@ -179,7 +173,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
async fn offload_lfc_impl(&self) -> Result<()> {
|
||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
|
||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?;
|
||||
info!(%url, "requesting LFC state from postgres");
|
||||
|
||||
let mut compressed = Vec::new();
|
||||
|
||||
@@ -2,7 +2,6 @@ use crate::compute_prewarm::LfcPrewarmStateWithProgress;
|
||||
use crate::http::JsonResponse;
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum::{Json, http::StatusCode};
|
||||
use axum_extra::extract::OptionalQuery;
|
||||
use compute_api::responses::LfcOffloadState;
|
||||
type Compute = axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>;
|
||||
|
||||
@@ -17,16 +16,8 @@ pub(in crate::http) async fn offload_state(compute: Compute) -> Json<LfcOffloadS
|
||||
Json(compute.lfc_offload_state())
|
||||
}
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
pub struct PrewarmQuery {
|
||||
pub from_endpoint: String,
|
||||
}
|
||||
|
||||
pub(in crate::http) async fn prewarm(
|
||||
compute: Compute,
|
||||
OptionalQuery(query): OptionalQuery<PrewarmQuery>,
|
||||
) -> Response {
|
||||
if compute.prewarm_lfc(query.map(|q| q.from_endpoint)) {
|
||||
pub(in crate::http) async fn prewarm(compute: Compute) -> Response {
|
||||
if compute.prewarm_lfc() {
|
||||
StatusCode::ACCEPTED.into_response()
|
||||
} else {
|
||||
JsonResponse::error(
|
||||
|
||||
@@ -1,42 +1,32 @@
|
||||
use crate::compute::{ComputeNode, forward_termination_signal};
|
||||
use crate::http::JsonResponse;
|
||||
use axum::extract::State;
|
||||
use axum::response::Response;
|
||||
use axum_extra::extract::OptionalQuery;
|
||||
use compute_api::responses::{ComputeStatus, TerminateResponse};
|
||||
use http::StatusCode;
|
||||
use serde::Deserialize;
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::extract::State;
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use http::StatusCode;
|
||||
use tokio::task;
|
||||
use tracing::info;
|
||||
|
||||
#[derive(Deserialize, Default)]
|
||||
pub struct TerminateQuery {
|
||||
mode: compute_api::responses::TerminateMode,
|
||||
}
|
||||
use crate::compute::{ComputeNode, forward_termination_signal};
|
||||
use crate::http::JsonResponse;
|
||||
|
||||
/// Terminate the compute.
|
||||
pub(in crate::http) async fn terminate(
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
OptionalQuery(terminate): OptionalQuery<TerminateQuery>,
|
||||
) -> Response {
|
||||
let mode = terminate.unwrap_or_default().mode;
|
||||
pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>) -> Response {
|
||||
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if state.status == ComputeStatus::Terminated {
|
||||
return JsonResponse::success(StatusCode::CREATED, state.terminate_flush_lsn);
|
||||
return StatusCode::CREATED.into_response();
|
||||
}
|
||||
|
||||
if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
|
||||
return JsonResponse::invalid_status(state.status);
|
||||
}
|
||||
state.set_status(
|
||||
ComputeStatus::TerminationPending { mode },
|
||||
&compute.state_changed,
|
||||
);
|
||||
|
||||
state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
|
||||
drop(state);
|
||||
}
|
||||
|
||||
forward_termination_signal(false);
|
||||
forward_termination_signal();
|
||||
info!("sent signal and notified waiters");
|
||||
|
||||
// Spawn a blocking thread to wait for compute to become Terminated.
|
||||
@@ -44,7 +34,7 @@ pub(in crate::http) async fn terminate(
|
||||
// be able to serve other requests while some particular request
|
||||
// is waiting for compute to finish configuration.
|
||||
let c = compute.clone();
|
||||
let lsn = task::spawn_blocking(move || {
|
||||
task::spawn_blocking(move || {
|
||||
let mut state = c.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::Terminated {
|
||||
state = c.state_changed.wait(state).unwrap();
|
||||
@@ -54,10 +44,11 @@ pub(in crate::http) async fn terminate(
|
||||
state.status
|
||||
);
|
||||
}
|
||||
state.terminate_flush_lsn
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
info!("terminated Postgres");
|
||||
JsonResponse::success(StatusCode::OK, TerminateResponse { lsn })
|
||||
|
||||
StatusCode::OK.into_response()
|
||||
}
|
||||
|
||||
@@ -22,7 +22,6 @@ mod migration;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
pub mod pgbouncer;
|
||||
pub mod rsyslog;
|
||||
pub mod spec;
|
||||
mod spec_apply;
|
||||
|
||||
@@ -83,9 +83,7 @@ impl ComputeMonitor {
|
||||
let compute_status = self.compute.get_status();
|
||||
if matches!(
|
||||
compute_status,
|
||||
ComputeStatus::Terminated
|
||||
| ComputeStatus::TerminationPending { .. }
|
||||
| ComputeStatus::Failed
|
||||
ComputeStatus::Terminated | ComputeStatus::TerminationPending | ComputeStatus::Failed
|
||||
) {
|
||||
info!(
|
||||
"compute is in {} status, stopping compute monitor",
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
pub const PGBOUNCER_PIDFILE: &str = "/tmp/pgbouncer.pid";
|
||||
@@ -1,6 +0,0 @@
|
||||
### Test files
|
||||
|
||||
The file `cluster_spec.json` has been copied over from libs/compute_api
|
||||
tests, with some edits:
|
||||
|
||||
- the neon.safekeepers setting contains a duplicate value
|
||||
@@ -1,245 +0,0 @@
|
||||
{
|
||||
"format_version": 1.0,
|
||||
|
||||
"timestamp": "2021-05-23T18:25:43.511Z",
|
||||
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b",
|
||||
|
||||
"cluster": {
|
||||
"cluster_id": "test-cluster-42",
|
||||
"name": "Zenith Test",
|
||||
"state": "restarted",
|
||||
"roles": [
|
||||
{
|
||||
"name": "postgres",
|
||||
"encrypted_password": "6b1d16b78004bbd51fa06af9eda75972",
|
||||
"options": null
|
||||
},
|
||||
{
|
||||
"name": "alexk",
|
||||
"encrypted_password": null,
|
||||
"options": null
|
||||
},
|
||||
{
|
||||
"name": "zenith \"new\"",
|
||||
"encrypted_password": "5b1d16b78004bbd51fa06af9eda75972",
|
||||
"options": null
|
||||
},
|
||||
{
|
||||
"name": "zen",
|
||||
"encrypted_password": "9b1d16b78004bbd51fa06af9eda75972"
|
||||
},
|
||||
{
|
||||
"name": "\"name\";\\n select 1;",
|
||||
"encrypted_password": "5b1d16b78004bbd51fa06af9eda75972"
|
||||
},
|
||||
{
|
||||
"name": "MyRole",
|
||||
"encrypted_password": "5b1d16b78004bbd51fa06af9eda75972"
|
||||
}
|
||||
],
|
||||
"databases": [
|
||||
{
|
||||
"name": "DB2",
|
||||
"owner": "alexk",
|
||||
"options": [
|
||||
{
|
||||
"name": "LC_COLLATE",
|
||||
"value": "C",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "LC_CTYPE",
|
||||
"value": "C",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "TEMPLATE",
|
||||
"value": "template0",
|
||||
"vartype": "enum"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "zenith",
|
||||
"owner": "MyRole"
|
||||
},
|
||||
{
|
||||
"name": "zen",
|
||||
"owner": "zen"
|
||||
}
|
||||
],
|
||||
"settings": [
|
||||
{
|
||||
"name": "fsync",
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "wal_level",
|
||||
"value": "logical",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
"name": "hot_standby",
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "prewarm_lfc_on_startup",
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "neon.safekeepers",
|
||||
"value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501,127.0.0.1:6502",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "wal_log_hints",
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "log_connections",
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "shared_buffers",
|
||||
"value": "32768",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "port",
|
||||
"value": "55432",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "max_connections",
|
||||
"value": "100",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "max_wal_senders",
|
||||
"value": "10",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "listen_addresses",
|
||||
"value": "0.0.0.0",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "wal_sender_timeout",
|
||||
"value": "0",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "password_encryption",
|
||||
"value": "md5",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
"name": "maintenance_work_mem",
|
||||
"value": "65536",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "max_parallel_workers",
|
||||
"value": "8",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "max_worker_processes",
|
||||
"value": "8",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "neon.tenant_id",
|
||||
"value": "b0554b632bd4d547a63b86c3630317e8",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_slots",
|
||||
"value": "10",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "neon.timeline_id",
|
||||
"value": "2414a61ffc94e428f14b5758fe308e13",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "shared_preload_libraries",
|
||||
"value": "neon",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "synchronous_standby_names",
|
||||
"value": "walproposer",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.pageserver_connstring",
|
||||
"value": "host=127.0.0.1 port=6400",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "test.escaping",
|
||||
"value": "here's a backslash \\ and a quote ' and a double-quote \" hooray",
|
||||
"vartype": "string"
|
||||
}
|
||||
]
|
||||
},
|
||||
"delta_operations": [
|
||||
{
|
||||
"action": "delete_db",
|
||||
"name": "zenith_test"
|
||||
},
|
||||
{
|
||||
"action": "rename_db",
|
||||
"name": "DB",
|
||||
"new_name": "DB2"
|
||||
},
|
||||
{
|
||||
"action": "delete_role",
|
||||
"name": "zenith2"
|
||||
},
|
||||
{
|
||||
"action": "rename_role",
|
||||
"name": "zenith new",
|
||||
"new_name": "zenith \"new\""
|
||||
}
|
||||
],
|
||||
"remote_extensions": {
|
||||
"library_index": {
|
||||
"postgis-3": "postgis",
|
||||
"libpgrouting-3.4": "postgis",
|
||||
"postgis_raster-3": "postgis",
|
||||
"postgis_sfcgal-3": "postgis",
|
||||
"postgis_topology-3": "postgis",
|
||||
"address_standardizer-3": "postgis"
|
||||
},
|
||||
"extension_data": {
|
||||
"postgis": {
|
||||
"archive_path": "5834329303/v15/extensions/postgis.tar.zst",
|
||||
"control_data": {
|
||||
"postgis.control": "# postgis extension\ncomment = ''PostGIS geometry and geography spatial types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis-3''\nrelocatable = false\ntrusted = true\n",
|
||||
"pgrouting.control": "# pgRouting Extension\ncomment = ''pgRouting Extension''\ndefault_version = ''3.4.2''\nmodule_pathname = ''$libdir/libpgrouting-3.4''\nrelocatable = true\nrequires = ''plpgsql''\nrequires = ''postgis''\ntrusted = true\n",
|
||||
"postgis_raster.control": "# postgis_raster extension\ncomment = ''PostGIS raster types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis_raster-3''\nrelocatable = false\nrequires = postgis\ntrusted = true\n",
|
||||
"postgis_sfcgal.control": "# postgis topology extension\ncomment = ''PostGIS SFCGAL functions''\ndefault_version = ''3.3.2''\nrelocatable = true\nrequires = postgis\ntrusted = true\n",
|
||||
"postgis_topology.control": "# postgis topology extension\ncomment = ''PostGIS topology spatial types and functions''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = topology\nrequires = postgis\ntrusted = true\n",
|
||||
"address_standardizer.control": "# address_standardizer extension\ncomment = ''Used to parse an address into constituent elements. Generally used to support geocoding address normalization step.''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n",
|
||||
"postgis_tiger_geocoder.control": "# postgis tiger geocoder extension\ncomment = ''PostGIS tiger geocoder and reverse geocoder''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = tiger\nrequires = ''postgis,fuzzystrmatch''\nsuperuser= false\ntrusted = true\n",
|
||||
"address_standardizer_data_us.control": "# address standardizer us dataset\ncomment = ''Address Standardizer US dataset example''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
"custom_extensions": [],
|
||||
"public_extensions": ["postgis"]
|
||||
},
|
||||
"pgbouncer_settings": {
|
||||
"default_pool_size": "42",
|
||||
"pool_mode": "session"
|
||||
}
|
||||
}
|
||||
@@ -36,7 +36,6 @@ pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
safekeeper_api.workspace = true
|
||||
safekeeper_client.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
storage_broker.workspace = true
|
||||
http-utils.workspace = true
|
||||
|
||||
@@ -18,7 +18,7 @@ use clap::Parser;
|
||||
use compute_api::requests::ComputeClaimsScope;
|
||||
use compute_api::spec::ComputeMode;
|
||||
use control_plane::broker::StorageBroker;
|
||||
use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode, PageserverProtocol};
|
||||
use control_plane::endpoint::{ComputeControlPlane, PageserverProtocol};
|
||||
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
|
||||
use control_plane::local_env;
|
||||
use control_plane::local_env::{
|
||||
@@ -45,7 +45,7 @@ use pageserver_api::models::{
|
||||
use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::parse_host_port;
|
||||
use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId};
|
||||
use safekeeper_api::membership::SafekeeperGeneration;
|
||||
use safekeeper_api::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
|
||||
@@ -605,14 +605,6 @@ struct EndpointCreateCmdArgs {
|
||||
#[clap(long, help = "Postgres version")]
|
||||
pg_version: u32,
|
||||
|
||||
/// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings.
|
||||
///
|
||||
/// Specified on creation such that it's retained across reconfiguration and restarts.
|
||||
///
|
||||
/// NB: not yet supported by computes.
|
||||
#[clap(long)]
|
||||
grpc: bool,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "If set, the node will be a hot replica on the specified timeline",
|
||||
@@ -673,12 +665,9 @@ struct EndpointStartCmdArgs {
|
||||
#[arg(default_value = "90s")]
|
||||
start_timeout: Duration,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Run in development mode, skipping VM-specific operations like process termination",
|
||||
action = clap::ArgAction::SetTrue
|
||||
)]
|
||||
dev: bool,
|
||||
/// If enabled, use gRPC (and the communicator) to talk to Pageservers.
|
||||
#[clap(long)]
|
||||
grpc: bool,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
@@ -697,6 +686,10 @@ struct EndpointReconfigureCmdArgs {
|
||||
|
||||
#[clap(long)]
|
||||
safekeepers: Option<String>,
|
||||
|
||||
/// If enabled, use gRPC (and communicator) to talk to Pageservers.
|
||||
#[clap(long)]
|
||||
grpc: bool,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
@@ -711,9 +704,10 @@ struct EndpointStopCmdArgs {
|
||||
)]
|
||||
destroy: bool,
|
||||
|
||||
#[clap(long, help = "Postgres shutdown mode")]
|
||||
#[clap(default_value = "fast")]
|
||||
mode: EndpointTerminateMode,
|
||||
#[clap(long, help = "Postgres shutdown mode, passed to \"pg_ctl -m <mode>\"")]
|
||||
#[arg(value_parser(["smart", "fast", "immediate"]))]
|
||||
#[arg(default_value = "fast")]
|
||||
mode: String,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
@@ -1269,45 +1263,6 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
|
||||
pageserver
|
||||
.timeline_import(tenant_id, timeline_id, base, pg_wal, args.pg_version)
|
||||
.await?;
|
||||
if env.storage_controller.timelines_onto_safekeepers {
|
||||
println!("Creating timeline on safekeeper ...");
|
||||
let timeline_info = pageserver
|
||||
.timeline_info(
|
||||
TenantShardId::unsharded(tenant_id),
|
||||
timeline_id,
|
||||
pageserver_client::mgmt_api::ForceAwaitLogicalSize::No,
|
||||
)
|
||||
.await?;
|
||||
let default_sk = SafekeeperNode::from_env(env, env.safekeepers.first().unwrap());
|
||||
let default_host = default_sk
|
||||
.conf
|
||||
.listen_addr
|
||||
.clone()
|
||||
.unwrap_or_else(|| "localhost".to_string());
|
||||
let mconf = safekeeper_api::membership::Configuration {
|
||||
generation: SafekeeperGeneration::new(1),
|
||||
members: safekeeper_api::membership::MemberSet {
|
||||
m: vec![SafekeeperId {
|
||||
host: default_host,
|
||||
id: default_sk.conf.id,
|
||||
pg_port: default_sk.conf.pg_port,
|
||||
}],
|
||||
},
|
||||
new_members: None,
|
||||
};
|
||||
let pg_version = args.pg_version * 10000;
|
||||
let req = safekeeper_api::models::TimelineCreateRequest {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
mconf,
|
||||
pg_version,
|
||||
system_id: None,
|
||||
wal_seg_size: None,
|
||||
start_lsn: timeline_info.last_record_lsn,
|
||||
commit_lsn: None,
|
||||
};
|
||||
default_sk.create_timeline(&req).await?;
|
||||
}
|
||||
env.register_branch_mapping(branch_name.to_string(), tenant_id, timeline_id)?;
|
||||
println!("Done");
|
||||
}
|
||||
@@ -1465,7 +1420,6 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
args.internal_http_port,
|
||||
args.pg_version,
|
||||
mode,
|
||||
args.grpc,
|
||||
!args.update_catalog,
|
||||
false,
|
||||
)?;
|
||||
@@ -1507,19 +1461,17 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
|
||||
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
|
||||
// Use gRPC if requested.
|
||||
let pageserver = if endpoint.grpc {
|
||||
let (protocol, host, port) = if args.grpc {
|
||||
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
|
||||
let (host, port) = parse_host_port(grpc_addr)?;
|
||||
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
|
||||
(PageserverProtocol::Grpc, host, port)
|
||||
let (host, port) = parse_host_port(grpc_addr).expect("bad config");
|
||||
(PageserverProtocol::Grpc, host, port.unwrap_or(51051))
|
||||
} else {
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
|
||||
let port = port.unwrap_or(5432);
|
||||
(PageserverProtocol::Libpq, host, port)
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr).expect("bad config");
|
||||
(PageserverProtocol::Libpq, host, port.unwrap_or(5432))
|
||||
};
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// fully managed by storage controller, therefore not sharded.
|
||||
(vec![pageserver], DEFAULT_STRIPE_SIZE)
|
||||
(vec![(protocol, host, port)], DEFAULT_STRIPE_SIZE)
|
||||
} else {
|
||||
// Look up the currently attached location of the tenant, and its striping metadata,
|
||||
// to pass these on to postgres.
|
||||
@@ -1538,19 +1490,21 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.await?;
|
||||
}
|
||||
|
||||
let pageserver = if endpoint.grpc {
|
||||
let pageserver = if args.grpc {
|
||||
(
|
||||
PageserverProtocol::Grpc,
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))?,
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC addr"))
|
||||
.expect("bad hostname"),
|
||||
shard.listen_grpc_port.expect("no gRPC port"),
|
||||
)
|
||||
} else {
|
||||
(
|
||||
PageserverProtocol::Libpq,
|
||||
Host::parse(&shard.listen_pg_addr)?,
|
||||
Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
};
|
||||
|
||||
anyhow::Ok(pageserver)
|
||||
}),
|
||||
)
|
||||
@@ -1596,7 +1550,6 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
stripe_size.0 as usize,
|
||||
args.create_test_user,
|
||||
args.start_timeout,
|
||||
args.dev,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -1609,17 +1562,15 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
|
||||
let conf = env.get_pageserver_conf(ps_id)?;
|
||||
// Use gRPC if requested.
|
||||
let pageserver = if endpoint.grpc {
|
||||
let (protocol, host, port) = if args.grpc {
|
||||
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
|
||||
let (host, port) = parse_host_port(grpc_addr)?;
|
||||
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
|
||||
(PageserverProtocol::Grpc, host, port)
|
||||
let (host, port) = parse_host_port(grpc_addr).expect("bad config");
|
||||
(PageserverProtocol::Grpc, host, port.unwrap_or(51051))
|
||||
} else {
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
|
||||
let port = port.unwrap_or(5432);
|
||||
(PageserverProtocol::Libpq, host, port)
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr).expect("bad config");
|
||||
(PageserverProtocol::Libpq, host, port.unwrap_or(5432))
|
||||
};
|
||||
vec![pageserver]
|
||||
vec![(protocol, host, port)]
|
||||
} else {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
@@ -1628,11 +1579,10 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
// Use gRPC if requested.
|
||||
if endpoint.grpc {
|
||||
if args.grpc {
|
||||
(
|
||||
PageserverProtocol::Grpc,
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC addr"))
|
||||
.expect("bad hostname"),
|
||||
shard.listen_grpc_port.expect("no gRPC port"),
|
||||
)
|
||||
@@ -1657,10 +1607,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.endpoints
|
||||
.get(endpoint_id)
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
match endpoint.stop(args.mode, args.destroy).await?.lsn {
|
||||
Some(lsn) => println!("{lsn}"),
|
||||
None => println!("null"),
|
||||
}
|
||||
endpoint.stop(&args.mode, args.destroy)?;
|
||||
}
|
||||
EndpointCmd::GenerateJwt(args) => {
|
||||
let endpoint = {
|
||||
@@ -2092,16 +2039,11 @@ async fn handle_stop_all(args: &StopCmdArgs, env: &local_env::LocalEnv) -> Resul
|
||||
}
|
||||
|
||||
async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
let mode = if immediate {
|
||||
EndpointTerminateMode::Immediate
|
||||
} else {
|
||||
EndpointTerminateMode::Fast
|
||||
};
|
||||
// Stop all endpoints
|
||||
match ComputeControlPlane::load(env.clone()) {
|
||||
Ok(cplane) => {
|
||||
for (_k, node) in cplane.endpoints {
|
||||
if let Err(e) = node.stop(mode, false).await {
|
||||
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) {
|
||||
eprintln!("postgres stop failed: {e:#}");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -46,14 +46,11 @@ use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{Context, Result, anyhow, bail};
|
||||
use base64::Engine;
|
||||
use base64::prelude::BASE64_URL_SAFE_NO_PAD;
|
||||
use compute_api::requests::{
|
||||
COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope, ConfigurationRequest,
|
||||
};
|
||||
use compute_api::responses::{
|
||||
ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TerminateResponse,
|
||||
TlsConfig,
|
||||
ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TlsConfig,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
|
||||
@@ -90,7 +87,6 @@ pub struct EndpointConf {
|
||||
external_http_port: u16,
|
||||
internal_http_port: u16,
|
||||
pg_version: u32,
|
||||
grpc: bool,
|
||||
skip_pg_catalog_updates: bool,
|
||||
reconfigure_concurrency: usize,
|
||||
drop_subscriptions_before_start: bool,
|
||||
@@ -168,7 +164,7 @@ impl ComputeControlPlane {
|
||||
public_key_use: Some(PublicKeyUse::Signature),
|
||||
key_operations: Some(vec![KeyOperations::Verify]),
|
||||
key_algorithm: Some(KeyAlgorithm::EdDSA),
|
||||
key_id: Some(BASE64_URL_SAFE_NO_PAD.encode(key_hash)),
|
||||
key_id: Some(base64::encode_config(key_hash, base64::URL_SAFE_NO_PAD)),
|
||||
x509_url: None::<String>,
|
||||
x509_chain: None::<Vec<String>>,
|
||||
x509_sha1_fingerprint: None::<String>,
|
||||
@@ -177,7 +173,7 @@ impl ComputeControlPlane {
|
||||
algorithm: AlgorithmParameters::OctetKeyPair(OctetKeyPairParameters {
|
||||
key_type: OctetKeyPairType::OctetKeyPair,
|
||||
curve: EllipticCurve::Ed25519,
|
||||
x: BASE64_URL_SAFE_NO_PAD.encode(public_key),
|
||||
x: base64::encode_config(public_key, base64::URL_SAFE_NO_PAD),
|
||||
}),
|
||||
}],
|
||||
})
|
||||
@@ -194,7 +190,6 @@ impl ComputeControlPlane {
|
||||
internal_http_port: Option<u16>,
|
||||
pg_version: u32,
|
||||
mode: ComputeMode,
|
||||
grpc: bool,
|
||||
skip_pg_catalog_updates: bool,
|
||||
drop_subscriptions_before_start: bool,
|
||||
) -> Result<Arc<Endpoint>> {
|
||||
@@ -229,7 +224,6 @@ impl ComputeControlPlane {
|
||||
// we also skip catalog updates in the cloud.
|
||||
skip_pg_catalog_updates,
|
||||
drop_subscriptions_before_start,
|
||||
grpc,
|
||||
reconfigure_concurrency: 1,
|
||||
features: vec![],
|
||||
cluster: None,
|
||||
@@ -248,7 +242,6 @@ impl ComputeControlPlane {
|
||||
internal_http_port,
|
||||
pg_port,
|
||||
pg_version,
|
||||
grpc,
|
||||
skip_pg_catalog_updates,
|
||||
drop_subscriptions_before_start,
|
||||
reconfigure_concurrency: 1,
|
||||
@@ -303,8 +296,6 @@ pub struct Endpoint {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub mode: ComputeMode,
|
||||
/// If true, the endpoint should use gRPC to communicate with Pageservers.
|
||||
pub grpc: bool,
|
||||
|
||||
// port and address of the Postgres server and `compute_ctl`'s HTTP APIs
|
||||
pub pg_address: SocketAddr,
|
||||
@@ -342,37 +333,16 @@ pub enum EndpointStatus {
|
||||
|
||||
impl Display for EndpointStatus {
|
||||
fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
writer.write_str(match self {
|
||||
let s = match self {
|
||||
Self::Running => "running",
|
||||
Self::Stopped => "stopped",
|
||||
Self::Crashed => "crashed",
|
||||
Self::RunningNoPidfile => "running, no pidfile",
|
||||
})
|
||||
};
|
||||
write!(writer, "{}", s)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Copy, clap::ValueEnum)]
|
||||
pub enum EndpointTerminateMode {
|
||||
#[default]
|
||||
/// Use pg_ctl stop -m fast
|
||||
Fast,
|
||||
/// Use pg_ctl stop -m immediate
|
||||
Immediate,
|
||||
/// Use /terminate?mode=immediate
|
||||
ImmediateTerminate,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for EndpointTerminateMode {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(match &self {
|
||||
EndpointTerminateMode::Fast => "fast",
|
||||
EndpointTerminateMode::Immediate => "immediate",
|
||||
EndpointTerminateMode::ImmediateTerminate => "immediate-terminate",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Protocol used to connect to a Pageserver.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum PageserverProtocol {
|
||||
Libpq,
|
||||
@@ -430,7 +400,6 @@ impl Endpoint {
|
||||
mode: conf.mode,
|
||||
tenant_id: conf.tenant_id,
|
||||
pg_version: conf.pg_version,
|
||||
grpc: conf.grpc,
|
||||
skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
|
||||
reconfigure_concurrency: conf.reconfigure_concurrency,
|
||||
drop_subscriptions_before_start: conf.drop_subscriptions_before_start,
|
||||
@@ -712,7 +681,6 @@ impl Endpoint {
|
||||
shard_stripe_size: usize,
|
||||
create_test_user: bool,
|
||||
start_timeout: Duration,
|
||||
dev: bool,
|
||||
) -> Result<()> {
|
||||
if self.status() == EndpointStatus::Running {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
@@ -883,10 +851,6 @@ impl Endpoint {
|
||||
cmd.args(["--remote-ext-base-url", remote_ext_base_url]);
|
||||
}
|
||||
|
||||
if dev {
|
||||
cmd.arg("--dev");
|
||||
}
|
||||
|
||||
let child = cmd.spawn()?;
|
||||
// set up a scopeguard to kill & wait for the child in case we panic or bail below
|
||||
let child = scopeguard::guard(child, |mut child| {
|
||||
@@ -939,7 +903,7 @@ impl Endpoint {
|
||||
ComputeStatus::Empty
|
||||
| ComputeStatus::ConfigurationPending
|
||||
| ComputeStatus::Configuration
|
||||
| ComputeStatus::TerminationPending { .. }
|
||||
| ComputeStatus::TerminationPending
|
||||
| ComputeStatus::Terminated => {
|
||||
bail!("unexpected compute status: {:?}", state.status)
|
||||
}
|
||||
@@ -1061,27 +1025,8 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn stop(
|
||||
&self,
|
||||
mode: EndpointTerminateMode,
|
||||
destroy: bool,
|
||||
) -> Result<TerminateResponse> {
|
||||
// pg_ctl stop is fast but doesn't allow us to collect LSN. /terminate is
|
||||
// slow, and test runs time out. Solution: special mode "immediate-terminate"
|
||||
// which uses /terminate
|
||||
let response = if let EndpointTerminateMode::ImmediateTerminate = mode {
|
||||
let ip = self.external_http_address.ip();
|
||||
let port = self.external_http_address.port();
|
||||
let url = format!("http://{ip}:{port}/terminate?mode=immediate");
|
||||
let token = self.generate_jwt(Some(ComputeClaimsScope::Admin))?;
|
||||
let request = reqwest::Client::new().post(url).bearer_auth(token);
|
||||
let response = request.send().await.context("/terminate")?;
|
||||
let text = response.text().await.context("/terminate result")?;
|
||||
serde_json::from_str(&text).with_context(|| format!("deserializing {text}"))?
|
||||
} else {
|
||||
self.pg_ctl(&["-m", &mode.to_string(), "stop"], &None)?;
|
||||
TerminateResponse { lsn: None }
|
||||
};
|
||||
pub fn stop(&self, mode: &str, destroy: bool) -> Result<()> {
|
||||
self.pg_ctl(&["-m", mode, "stop"], &None)?;
|
||||
|
||||
// Also wait for the compute_ctl process to die. It might have some
|
||||
// cleanup work to do after postgres stops, like syncing safekeepers,
|
||||
@@ -1091,7 +1036,7 @@ impl Endpoint {
|
||||
// waiting. Sometimes we do *not* want this cleanup: tests intentionally
|
||||
// do stop when majority of safekeepers is down, so sync-safekeepers
|
||||
// would hang otherwise. This could be a separate flag though.
|
||||
let send_sigterm = destroy || !matches!(mode, EndpointTerminateMode::Fast);
|
||||
let send_sigterm = destroy || mode == "immediate";
|
||||
self.wait_for_compute_ctl_to_exit(send_sigterm)?;
|
||||
if destroy {
|
||||
println!(
|
||||
@@ -1100,7 +1045,7 @@ impl Endpoint {
|
||||
);
|
||||
std::fs::remove_dir_all(self.endpoint_path())?;
|
||||
}
|
||||
Ok(response)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn connstr(&self, user: &str, db_name: &str) -> String {
|
||||
|
||||
@@ -236,7 +236,7 @@ impl Default for NeonStorageControllerConf {
|
||||
heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
|
||||
long_reconcile_threshold: None,
|
||||
use_https_pageserver_api: false,
|
||||
timelines_onto_safekeepers: true,
|
||||
timelines_onto_safekeepers: false,
|
||||
use_https_safekeeper_api: false,
|
||||
use_local_compute_notifications: true,
|
||||
}
|
||||
|
||||
@@ -16,7 +16,6 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, bail};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::config::{DEFAULT_GRPC_LISTEN_PORT, DEFAULT_HTTP_LISTEN_PORT};
|
||||
use pageserver_api::models::{self, TenantInfo, TimelineInfo};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
@@ -253,10 +252,9 @@ impl PageServerNode {
|
||||
// the storage controller
|
||||
let metadata_path = datadir.join("metadata.json");
|
||||
|
||||
let http_host = "localhost".to_string();
|
||||
let (_, http_port) =
|
||||
let (_http_host, http_port) =
|
||||
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
|
||||
let http_port = http_port.unwrap_or(DEFAULT_HTTP_LISTEN_PORT);
|
||||
let http_port = http_port.unwrap_or(9898);
|
||||
|
||||
let https_port = match self.conf.listen_https_addr.as_ref() {
|
||||
Some(https_addr) => {
|
||||
@@ -267,11 +265,12 @@ impl PageServerNode {
|
||||
None => None,
|
||||
};
|
||||
|
||||
let (mut grpc_host, mut grpc_port) = (None, None);
|
||||
let mut grpc_host = None;
|
||||
let mut grpc_port = None;
|
||||
if let Some(grpc_addr) = &self.conf.listen_grpc_addr {
|
||||
let (_, port) = parse_host_port(grpc_addr).expect("Unable to parse listen_grpc_addr");
|
||||
grpc_host = Some("localhost".to_string());
|
||||
grpc_port = Some(port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT));
|
||||
grpc_port = Some(port.unwrap_or(51051));
|
||||
}
|
||||
|
||||
// Intentionally hand-craft JSON: this acts as an implicit format compat test
|
||||
@@ -284,7 +283,7 @@ impl PageServerNode {
|
||||
postgres_port: self.pg_connection_config.port(),
|
||||
grpc_host,
|
||||
grpc_port,
|
||||
http_host,
|
||||
http_host: "localhost".to_string(),
|
||||
http_port,
|
||||
https_port,
|
||||
other: HashMap::from([(
|
||||
@@ -646,16 +645,4 @@ impl PageServerNode {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
pub async fn timeline_info(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
force_await_logical_size: mgmt_api::ForceAwaitLogicalSize,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
let timeline_info = self
|
||||
.http_client
|
||||
.timeline_info(tenant_shard_id, timeline_id, force_await_logical_size)
|
||||
.await?;
|
||||
Ok(timeline_info)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//! ```
|
||||
use std::error::Error as _;
|
||||
use std::future::Future;
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
@@ -13,9 +14,9 @@ use std::{io, result};
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use http_utils::error::HttpErrorBody;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use safekeeper_api::models::TimelineCreateRequest;
|
||||
use safekeeper_client::mgmt_api;
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::id::NodeId;
|
||||
@@ -34,14 +35,25 @@ pub enum SafekeeperHttpError {
|
||||
|
||||
type Result<T> = result::Result<T, SafekeeperHttpError>;
|
||||
|
||||
fn err_from_client_err(err: mgmt_api::Error) -> SafekeeperHttpError {
|
||||
use mgmt_api::Error::*;
|
||||
match err {
|
||||
ApiError(_, str) => SafekeeperHttpError::Response(str),
|
||||
Cancelled => SafekeeperHttpError::Response("Cancelled".to_owned()),
|
||||
ReceiveBody(err) => SafekeeperHttpError::Transport(err),
|
||||
ReceiveErrorBody(err) => SafekeeperHttpError::Response(err),
|
||||
Timeout(str) => SafekeeperHttpError::Response(format!("timeout: {str}")),
|
||||
pub(crate) trait ResponseErrorMessageExt: Sized {
|
||||
fn error_from_body(self) -> impl Future<Output = Result<Self>> + Send;
|
||||
}
|
||||
|
||||
impl ResponseErrorMessageExt for reqwest::Response {
|
||||
async fn error_from_body(self) -> Result<Self> {
|
||||
let status = self.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
return Ok(self);
|
||||
}
|
||||
|
||||
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
|
||||
let url = self.url().to_owned();
|
||||
Err(SafekeeperHttpError::Response(
|
||||
match self.json::<HttpErrorBody>().await {
|
||||
Ok(err_body) => format!("Error: {}", err_body.msg),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
},
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,8 +70,9 @@ pub struct SafekeeperNode {
|
||||
|
||||
pub pg_connection_config: PgConnectionConfig,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: mgmt_api::Client,
|
||||
pub http_client: reqwest::Client,
|
||||
pub listen_addr: String,
|
||||
pub http_base_url: String,
|
||||
}
|
||||
|
||||
impl SafekeeperNode {
|
||||
@@ -69,14 +82,13 @@ impl SafekeeperNode {
|
||||
} else {
|
||||
"127.0.0.1".to_string()
|
||||
};
|
||||
let jwt = None;
|
||||
let http_base_url = format!("http://{}:{}", listen_addr, conf.http_port);
|
||||
SafekeeperNode {
|
||||
id: conf.id,
|
||||
conf: conf.clone(),
|
||||
pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
|
||||
env: env.clone(),
|
||||
http_client: mgmt_api::Client::new(env.create_http_client(), http_base_url, jwt),
|
||||
http_client: env.create_http_client(),
|
||||
http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
|
||||
listen_addr,
|
||||
}
|
||||
}
|
||||
@@ -266,19 +278,20 @@ impl SafekeeperNode {
|
||||
)
|
||||
}
|
||||
|
||||
pub async fn check_status(&self) -> Result<()> {
|
||||
self.http_client
|
||||
.status()
|
||||
.await
|
||||
.map_err(err_from_client_err)?;
|
||||
Ok(())
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> reqwest::RequestBuilder {
|
||||
// TODO: authentication
|
||||
//if self.env.auth_type == AuthType::NeonJWT {
|
||||
// builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
|
||||
//}
|
||||
self.http_client.request(method, url)
|
||||
}
|
||||
|
||||
pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result<()> {
|
||||
self.http_client
|
||||
.create_timeline(req)
|
||||
.await
|
||||
.map_err(err_from_client_err)?;
|
||||
pub async fn check_status(&self) -> Result<()> {
|
||||
self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
|
||||
.send()
|
||||
.await?
|
||||
.error_from_body()
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,6 +36,7 @@ enum Command {
|
||||
listen_pg_addr: String,
|
||||
#[arg(long)]
|
||||
listen_pg_port: u16,
|
||||
|
||||
#[arg(long)]
|
||||
listen_grpc_addr: Option<String>,
|
||||
#[arg(long)]
|
||||
@@ -65,16 +66,10 @@ enum Command {
|
||||
#[arg(long)]
|
||||
scheduling: Option<NodeSchedulingPolicy>,
|
||||
},
|
||||
// Set a node status as deleted.
|
||||
NodeDelete {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Delete a tombstone of node from the storage controller.
|
||||
NodeDeleteTombstone {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Modify a tenant's policies in the storage controller
|
||||
TenantPolicy {
|
||||
#[arg(long)]
|
||||
@@ -92,8 +87,6 @@ enum Command {
|
||||
},
|
||||
/// List nodes known to the storage controller
|
||||
Nodes {},
|
||||
/// List soft deleted nodes known to the storage controller
|
||||
NodeTombstones {},
|
||||
/// List tenants known to the storage controller
|
||||
Tenants {
|
||||
/// If this field is set, it will list the tenants on a specific node
|
||||
@@ -916,39 +909,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
.dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
|
||||
.await?;
|
||||
}
|
||||
Command::NodeDeleteTombstone { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::DELETE,
|
||||
format!("debug/v1/tombstone/{node_id}"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::NodeTombstones {} => {
|
||||
let mut resp = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"debug/v1/tombstone".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
|
||||
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]);
|
||||
for node in resp {
|
||||
table.add_row([
|
||||
format!("{}", node.id),
|
||||
node.listen_http_addr,
|
||||
node.availability_zone_id,
|
||||
format!("{:?}", node.scheduling),
|
||||
format!("{:?}", node.availability),
|
||||
]);
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
Command::TenantSetTimeBasedEviction {
|
||||
tenant_id,
|
||||
period,
|
||||
|
||||
@@ -95,4 +95,3 @@ echo "Start compute node"
|
||||
-b /usr/local/bin/postgres \
|
||||
--compute-id "compute-${RANDOM}" \
|
||||
--config "${CONFIG_FILE}"
|
||||
--dev
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
#!/bin/sh
|
||||
#!/bin/bash
|
||||
set -ex
|
||||
cd "$(dirname "$0")"
|
||||
patch -p1 <"postgis-common-${PG_VERSION}.patch"
|
||||
trap 'echo Cleaning up; patch -R -p1 <postgis-common-${PG_VERSION}.patch' EXIT
|
||||
if [[ ${PG_VERSION} = v17 ]]; then
|
||||
sed -i '/computed_columns/d' regress/core/tests.mk
|
||||
fi
|
||||
patch -p1 <postgis-no-upgrade-test.patch
|
||||
trap 'echo Cleaning up; patch -R -p1 <postgis-no-upgrade-test.patch' EXIT
|
||||
make installcheck-base
|
||||
@@ -1,35 +0,0 @@
|
||||
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
|
||||
index 9e05244..90987df 100644
|
||||
--- a/regress/core/tests.mk
|
||||
+++ b/regress/core/tests.mk
|
||||
@@ -143,8 +143,7 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/oriented_envelope \
|
||||
$(top_srcdir)/regress/core/point_coordinates \
|
||||
$(top_srcdir)/regress/core/out_geojson \
|
||||
- $(top_srcdir)/regress/core/wrapx \
|
||||
- $(top_srcdir)/regress/core/computed_columns
|
||||
+ $(top_srcdir)/regress/core/wrapx
|
||||
|
||||
# Slow slow tests
|
||||
TESTS_SLOW = \
|
||||
diff --git a/regress/runtest.mk b/regress/runtest.mk
|
||||
index 4b95b7e..449d5a2 100644
|
||||
--- a/regress/runtest.mk
|
||||
+++ b/regress/runtest.mk
|
||||
@@ -24,16 +24,6 @@ check-regress:
|
||||
|
||||
@POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(RUNTESTFLAGS_INTERNAL) $(TESTS)
|
||||
|
||||
- @if echo "$(RUNTESTFLAGS)" | grep -vq -- --upgrade; then \
|
||||
- echo "Running upgrade test as RUNTESTFLAGS did not contain that"; \
|
||||
- POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl \
|
||||
- --upgrade \
|
||||
- $(RUNTESTFLAGS) \
|
||||
- $(RUNTESTFLAGS_INTERNAL) \
|
||||
- $(TESTS); \
|
||||
- else \
|
||||
- echo "Skipping upgrade test as RUNTESTFLAGS already requested upgrades"; \
|
||||
- fi
|
||||
|
||||
check-long:
|
||||
$(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(TESTS) $(TESTS_SLOW)
|
||||
@@ -1,19 +1,3 @@
|
||||
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
|
||||
index 3abd7bc..64a9254 100644
|
||||
--- a/regress/core/tests.mk
|
||||
+++ b/regress/core/tests.mk
|
||||
@@ -144,11 +144,6 @@ TESTS_SLOW = \
|
||||
$(top_srcdir)/regress/core/concave_hull_hard \
|
||||
$(top_srcdir)/regress/core/knn_recheck
|
||||
|
||||
-ifeq ($(shell expr "$(POSTGIS_PGSQL_VERSION)" ">=" 120),1)
|
||||
- TESTS += \
|
||||
- $(top_srcdir)/regress/core/computed_columns
|
||||
-endif
|
||||
-
|
||||
ifeq ($(shell expr "$(POSTGIS_GEOS_VERSION)" ">=" 30700),1)
|
||||
# GEOS-3.7 adds:
|
||||
# ST_FrechetDistance
|
||||
diff --git a/regress/runtest.mk b/regress/runtest.mk
|
||||
index c051f03..010e493 100644
|
||||
--- a/regress/runtest.mk
|
||||
@@ -125,7 +125,7 @@ index 7a36b65..ad78fc7 100644
|
||||
DROP SCHEMA tm CASCADE;
|
||||
+
|
||||
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
|
||||
index 64a9254..94903c3 100644
|
||||
index 3abd7bc..94903c3 100644
|
||||
--- a/regress/core/tests.mk
|
||||
+++ b/regress/core/tests.mk
|
||||
@@ -23,7 +23,6 @@ current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
@@ -160,6 +160,18 @@ index 64a9254..94903c3 100644
|
||||
$(top_srcdir)/regress/core/wkb \
|
||||
$(top_srcdir)/regress/core/wkt \
|
||||
$(top_srcdir)/regress/core/wmsservers \
|
||||
@@ -144,11 +140,6 @@ TESTS_SLOW = \
|
||||
$(top_srcdir)/regress/core/concave_hull_hard \
|
||||
$(top_srcdir)/regress/core/knn_recheck
|
||||
|
||||
-ifeq ($(shell expr "$(POSTGIS_PGSQL_VERSION)" ">=" 120),1)
|
||||
- TESTS += \
|
||||
- $(top_srcdir)/regress/core/computed_columns
|
||||
-endif
|
||||
-
|
||||
ifeq ($(shell expr "$(POSTGIS_GEOS_VERSION)" ">=" 30700),1)
|
||||
# GEOS-3.7 adds:
|
||||
# ST_FrechetDistance
|
||||
diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk
|
||||
index 1fc77ac..c3cb9de 100644
|
||||
--- a/regress/loader/tests.mk
|
||||
|
||||
@@ -125,7 +125,7 @@ index 7a36b65..ad78fc7 100644
|
||||
DROP SCHEMA tm CASCADE;
|
||||
+
|
||||
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
|
||||
index 90987df..74fe3f1 100644
|
||||
index 9e05244..a63a3e1 100644
|
||||
--- a/regress/core/tests.mk
|
||||
+++ b/regress/core/tests.mk
|
||||
@@ -16,14 +16,13 @@ POSTGIS_PGSQL_VERSION=170
|
||||
@@ -168,6 +168,16 @@ index 90987df..74fe3f1 100644
|
||||
$(top_srcdir)/regress/core/wkb \
|
||||
$(top_srcdir)/regress/core/wkt \
|
||||
$(top_srcdir)/regress/core/wmsservers \
|
||||
@@ -143,8 +139,7 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/oriented_envelope \
|
||||
$(top_srcdir)/regress/core/point_coordinates \
|
||||
$(top_srcdir)/regress/core/out_geojson \
|
||||
- $(top_srcdir)/regress/core/wrapx \
|
||||
- $(top_srcdir)/regress/core/computed_columns
|
||||
+ $(top_srcdir)/regress/core/wrapx
|
||||
|
||||
# Slow slow tests
|
||||
TESTS_SLOW = \
|
||||
diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk
|
||||
index ac4f8ad..4bad4fc 100644
|
||||
--- a/regress/loader/tests.mk
|
||||
|
||||
@@ -10,8 +10,8 @@ psql -d contrib_regression -c "ALTER DATABASE contrib_regression SET TimeZone='U
|
||||
-c "CREATE EXTENSION postgis_tiger_geocoder CASCADE" \
|
||||
-c "CREATE EXTENSION postgis_raster SCHEMA public" \
|
||||
-c "CREATE EXTENSION postgis_sfcgal SCHEMA public"
|
||||
patch -p1 <"postgis-common-${PG_VERSION}.patch"
|
||||
patch -p1 <postgis-no-upgrade-test.patch
|
||||
patch -p1 <"postgis-regular-${PG_VERSION}.patch"
|
||||
psql -d contrib_regression -f raster_outdb_template.sql
|
||||
trap 'patch -R -p1 <postgis-regular-${PG_VERSION}.patch && patch -R -p1 <"postgis-common-${PG_VERSION}.patch"' EXIT
|
||||
trap 'patch -R -p1 <postgis-no-upgrade-test.patch && patch -R -p1 <"postgis-regular-${PG_VERSION}.patch"' EXIT
|
||||
POSTGIS_REGRESS_DB=contrib_regression RUNTESTFLAGS=--nocreate make installcheck-base
|
||||
@@ -8,7 +8,6 @@ anyhow.workspace = true
|
||||
axum-extra.workspace = true
|
||||
axum.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
futures.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
prometheus.workspace = true
|
||||
|
||||
@@ -4,8 +4,6 @@
|
||||
//! for large computes.
|
||||
mod app;
|
||||
use anyhow::Context;
|
||||
use clap::Parser;
|
||||
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
|
||||
use tracing::info;
|
||||
use utils::logging;
|
||||
|
||||
@@ -14,29 +12,13 @@ const fn max_upload_file_limit() -> usize {
|
||||
100 * 1024 * 1024
|
||||
}
|
||||
|
||||
const fn listen() -> SocketAddr {
|
||||
SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 51243)
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
struct Args {
|
||||
#[arg(exclusive = true)]
|
||||
config_file: Option<String>,
|
||||
#[arg(long, default_value = "false", requires = "config")]
|
||||
/// to allow testing k8s helm chart where we don't have s3 credentials
|
||||
no_s3_check_on_startup: bool,
|
||||
#[arg(long, value_name = "FILE")]
|
||||
/// inline config mode for k8s helm chart
|
||||
config: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
#[serde(tag = "type")]
|
||||
struct Config {
|
||||
#[serde(default = "listen")]
|
||||
listen: std::net::SocketAddr,
|
||||
pemfile: camino::Utf8PathBuf,
|
||||
#[serde(flatten)]
|
||||
storage_kind: remote_storage::TypedRemoteStorageKind,
|
||||
storage_config: remote_storage::RemoteStorageConfig,
|
||||
#[serde(default = "max_upload_file_limit")]
|
||||
max_upload_file_limit: usize,
|
||||
}
|
||||
@@ -49,18 +31,13 @@ async fn main() -> anyhow::Result<()> {
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
|
||||
let args = Args::parse();
|
||||
let config: Config = if let Some(config_path) = args.config_file {
|
||||
info!("Reading config from {config_path}");
|
||||
let config = std::fs::read_to_string(config_path)?;
|
||||
serde_json::from_str(&config).context("parsing config")?
|
||||
} else if let Some(config) = args.config {
|
||||
info!("Reading inline config");
|
||||
serde_json::from_str(&config).context("parsing config")?
|
||||
} else {
|
||||
anyhow::bail!("Supply either config file path or --config=inline-config");
|
||||
};
|
||||
|
||||
let config: String = std::env::args().skip(1).take(1).collect();
|
||||
if config.is_empty() {
|
||||
anyhow::bail!("Usage: endpoint_storage config.json")
|
||||
}
|
||||
info!("Reading config from {config}");
|
||||
let config = std::fs::read_to_string(config.clone())?;
|
||||
let config: Config = serde_json::from_str(&config).context("parsing config")?;
|
||||
info!("Reading pemfile from {}", config.pemfile.clone());
|
||||
let pemfile = std::fs::read(config.pemfile.clone())?;
|
||||
info!("Loading public key from {}", config.pemfile.clone());
|
||||
@@ -69,12 +46,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap();
|
||||
info!("listening on {}", listener.local_addr().unwrap());
|
||||
|
||||
let storage =
|
||||
remote_storage::GenericRemoteStorage::from_storage_kind(config.storage_kind).await?;
|
||||
let storage = remote_storage::GenericRemoteStorage::from_config(&config.storage_config).await?;
|
||||
let cancel = tokio_util::sync::CancellationToken::new();
|
||||
if !args.no_s3_check_on_startup {
|
||||
app::check_storage_permissions(&storage, cancel.clone()).await?;
|
||||
}
|
||||
app::check_storage_permissions(&storage, cancel.clone()).await?;
|
||||
|
||||
let proxy = std::sync::Arc::new(endpoint_storage::Storage {
|
||||
auth,
|
||||
|
||||
@@ -83,16 +83,6 @@ pub struct ComputeStatusResponse {
|
||||
pub error: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq, Default)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum TerminateMode {
|
||||
#[default]
|
||||
/// wait 30s till returning from /terminate to allow control plane to get the error
|
||||
Fast,
|
||||
/// return from /terminate immediately as soon as all components are terminated
|
||||
Immediate,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ComputeStatus {
|
||||
@@ -113,16 +103,11 @@ pub enum ComputeStatus {
|
||||
// control-plane to terminate it.
|
||||
Failed,
|
||||
// Termination requested
|
||||
TerminationPending { mode: TerminateMode },
|
||||
TerminationPending,
|
||||
// Terminated Postgres
|
||||
Terminated,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize)]
|
||||
pub struct TerminateResponse {
|
||||
pub lsn: Option<utils::lsn::Lsn>,
|
||||
}
|
||||
|
||||
impl Display for ComputeStatus {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
@@ -132,7 +117,7 @@ impl Display for ComputeStatus {
|
||||
ComputeStatus::Running => f.write_str("running"),
|
||||
ComputeStatus::Configuration => f.write_str("configuration"),
|
||||
ComputeStatus::Failed => f.write_str("failed"),
|
||||
ComputeStatus::TerminationPending { .. } => f.write_str("termination-pending"),
|
||||
ComputeStatus::TerminationPending => f.write_str("termination-pending"),
|
||||
ComputeStatus::Terminated => f.write_str("terminated"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
|
||||
//! compute_ctl can fetch it by calling the control plane's API.
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
|
||||
use indexmap::IndexMap;
|
||||
use regex::Regex;
|
||||
@@ -319,6 +320,12 @@ impl ComputeMode {
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for ComputeMode {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.to_type_str())
|
||||
}
|
||||
}
|
||||
|
||||
/// Log level for audit logging
|
||||
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
|
||||
pub enum ComputeAudit {
|
||||
|
||||
@@ -419,13 +419,13 @@ pub fn now() -> u64 {
|
||||
with_thread_context(|ctx| ctx.clock.get().unwrap().now())
|
||||
}
|
||||
|
||||
pub fn exit(code: i32, msg: String) -> ! {
|
||||
pub fn exit(code: i32, msg: String) {
|
||||
with_thread_context(|ctx| {
|
||||
ctx.allow_panic.store(true, Ordering::SeqCst);
|
||||
let mut result = ctx.result.lock();
|
||||
*result = (code, msg);
|
||||
panic!("exit");
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
pub(crate) fn get_thread_ctx() -> Arc<ThreadContext> {
|
||||
|
||||
@@ -6,8 +6,13 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
thiserror.workspace = true
|
||||
nix.workspace=true
|
||||
nix.workspace = true
|
||||
spin.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.9.1"
|
||||
rand_distr = "0.5.1"
|
||||
|
||||
[target.'cfg(target_os = "macos")'.dependencies]
|
||||
tempfile = "3.14.0"
|
||||
|
||||
366
libs/neon-shmem/src/hash.rs
Normal file
366
libs/neon-shmem/src/hash.rs
Normal file
@@ -0,0 +1,366 @@
|
||||
//! Hash table implementation on top of 'shmem'
|
||||
//!
|
||||
//! Features required in the long run by the communicator project:
|
||||
//!
|
||||
//! [X] Accessible from both Postgres processes and rust threads in the communicator process
|
||||
//! [X] Low latency
|
||||
//! [ ] Scalable to lots of concurrent accesses (currently uses a single spinlock)
|
||||
//! [ ] Resizable
|
||||
|
||||
use std::fmt::Debug;
|
||||
use std::hash::Hash;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::ops::Deref;
|
||||
|
||||
use crate::shmem::ShmemHandle;
|
||||
|
||||
use spin;
|
||||
|
||||
mod core;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use core::CoreHashMap;
|
||||
|
||||
pub enum UpdateAction<V> {
|
||||
Nothing,
|
||||
Insert(V),
|
||||
Remove,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct OutOfMemoryError();
|
||||
|
||||
pub struct HashMapInit<'a, K, V> {
|
||||
// Hash table can be allocated in a fixed memory area, or in a resizeable ShmemHandle.
|
||||
shmem_handle: Option<ShmemHandle>,
|
||||
shared_ptr: *mut HashMapShared<'a, K, V>,
|
||||
}
|
||||
|
||||
pub struct HashMapAccess<'a, K, V> {
|
||||
shmem_handle: Option<ShmemHandle>,
|
||||
shared_ptr: *mut HashMapShared<'a, K, V>,
|
||||
}
|
||||
|
||||
unsafe impl<'a, K: Sync, V: Sync> Sync for HashMapAccess<'a, K, V> {}
|
||||
unsafe impl<'a, K: Send, V: Send> Send for HashMapAccess<'a, K, V> {}
|
||||
|
||||
impl<'a, K, V> HashMapInit<'a, K, V> {
|
||||
pub fn attach_writer(self) -> HashMapAccess<'a, K, V> {
|
||||
HashMapAccess {
|
||||
shmem_handle: self.shmem_handle,
|
||||
shared_ptr: self.shared_ptr,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn attach_reader(self) -> HashMapAccess<'a, K, V> {
|
||||
// no difference to attach_writer currently
|
||||
self.attach_writer()
|
||||
}
|
||||
}
|
||||
|
||||
// This is stored in the shared memory area
|
||||
struct HashMapShared<'a, K, V> {
|
||||
inner: spin::RwLock<CoreHashMap<'a, K, V>>,
|
||||
}
|
||||
|
||||
impl<'a, K, V> HashMapInit<'a, K, V>
|
||||
where
|
||||
K: Clone + Hash + Eq,
|
||||
{
|
||||
pub fn estimate_size(num_buckets: u32) -> usize {
|
||||
// add some margin to cover alignment etc.
|
||||
CoreHashMap::<K, V>::estimate_size(num_buckets) + size_of::<HashMapShared<K, V>>() + 1000
|
||||
}
|
||||
|
||||
pub fn init_in_fixed_area(
|
||||
num_buckets: u32,
|
||||
area: &'a mut [MaybeUninit<u8>],
|
||||
) -> HashMapInit<'a, K, V> {
|
||||
Self::init_common(num_buckets, None, area.as_mut_ptr().cast(), area.len())
|
||||
}
|
||||
|
||||
/// Initialize a new hash map in the given shared memory area
|
||||
pub fn init_in_shmem(num_buckets: u32, mut shmem: ShmemHandle) -> HashMapInit<'a, K, V> {
|
||||
let size = Self::estimate_size(num_buckets);
|
||||
shmem
|
||||
.set_size(size)
|
||||
.expect("could not resize shared memory area");
|
||||
|
||||
let ptr = unsafe { shmem.data_ptr.as_mut() };
|
||||
Self::init_common(num_buckets, Some(shmem), ptr, size)
|
||||
}
|
||||
|
||||
fn init_common(
|
||||
num_buckets: u32,
|
||||
shmem_handle: Option<ShmemHandle>,
|
||||
area_ptr: *mut u8,
|
||||
area_len: usize,
|
||||
) -> HashMapInit<'a, K, V> {
|
||||
// carve out HashMapShared from the area. This does not include the hashmap's dictionary
|
||||
// and buckets.
|
||||
let mut ptr: *mut u8 = area_ptr;
|
||||
ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
|
||||
let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
|
||||
ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };
|
||||
|
||||
// the rest of the space is given to the hash map's dictionary and buckets
|
||||
let remaining_area = unsafe {
|
||||
std::slice::from_raw_parts_mut(ptr, area_len - ptr.offset_from(area_ptr) as usize)
|
||||
};
|
||||
|
||||
let hashmap = CoreHashMap::new(num_buckets, remaining_area);
|
||||
unsafe {
|
||||
std::ptr::write(
|
||||
shared_ptr,
|
||||
HashMapShared {
|
||||
inner: spin::RwLock::new(hashmap),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
HashMapInit {
|
||||
shmem_handle: shmem_handle,
|
||||
shared_ptr,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, K, V> HashMapAccess<'a, K, V>
|
||||
where
|
||||
K: Clone + Hash + Eq,
|
||||
{
|
||||
pub fn get<'e>(&'e self, key: &K) -> Option<ValueReadGuard<'e, K, V>> {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
let lock_guard = map.inner.read();
|
||||
|
||||
match lock_guard.get(key) {
|
||||
None => None,
|
||||
Some(val_ref) => {
|
||||
let val_ptr = std::ptr::from_ref(val_ref);
|
||||
Some(ValueReadGuard {
|
||||
_lock_guard: lock_guard,
|
||||
value: val_ptr,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert a value
|
||||
pub fn insert(&self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
|
||||
let mut success = None;
|
||||
|
||||
self.update_with_fn(key, |existing| {
|
||||
if let Some(_) = existing {
|
||||
success = Some(false);
|
||||
UpdateAction::Nothing
|
||||
} else {
|
||||
success = Some(true);
|
||||
UpdateAction::Insert(value)
|
||||
}
|
||||
})?;
|
||||
Ok(success.expect("value_fn not called"))
|
||||
}
|
||||
|
||||
/// Remove value. Returns true if it existed
|
||||
pub fn remove(&self, key: &K) -> bool {
|
||||
let mut result = false;
|
||||
self.update_with_fn(key, |existing| match existing {
|
||||
Some(_) => {
|
||||
result = true;
|
||||
UpdateAction::Remove
|
||||
}
|
||||
None => UpdateAction::Nothing,
|
||||
})
|
||||
.expect("out of memory while removing");
|
||||
result
|
||||
}
|
||||
|
||||
/// Update key using the given function. All the other modifying operations are based on this.
|
||||
pub fn update_with_fn<F>(&self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
let mut lock_guard = map.inner.write();
|
||||
|
||||
let old_val = lock_guard.get(key);
|
||||
let action = value_fn(old_val);
|
||||
match (old_val, action) {
|
||||
(_, UpdateAction::Nothing) => {}
|
||||
(_, UpdateAction::Insert(new_val)) => {
|
||||
let _ = lock_guard.insert(key, new_val);
|
||||
}
|
||||
(None, UpdateAction::Remove) => panic!("Remove action with no old value"),
|
||||
(Some(_), UpdateAction::Remove) => {
|
||||
let _ = lock_guard.remove(key);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update key using the given function. All the other modifying operations are based on this.
|
||||
pub fn update_with_fn_at_bucket<F>(
|
||||
&self,
|
||||
pos: usize,
|
||||
value_fn: F,
|
||||
) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
let mut lock_guard = map.inner.write();
|
||||
|
||||
let old_val = lock_guard.get_bucket(pos);
|
||||
let action = value_fn(old_val.map(|(_k, v)| v));
|
||||
match (old_val, action) {
|
||||
(_, UpdateAction::Nothing) => {}
|
||||
(_, UpdateAction::Insert(_new_val)) => panic!("cannot insert without key"),
|
||||
(None, UpdateAction::Remove) => panic!("Remove action with no old value"),
|
||||
(Some((key, _value)), UpdateAction::Remove) => {
|
||||
let key = key.clone();
|
||||
let _ = lock_guard.remove(&key);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_num_buckets(&self) -> usize {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
map.inner.read().get_num_buckets()
|
||||
}
|
||||
|
||||
/// Return the key and value stored in bucket with given index. This can be used to
|
||||
/// iterate through the hash map. (An Iterator might be nicer. The communicator's
|
||||
/// clock algorithm needs to _slowly_ iterate through all buckets with its clock hand,
|
||||
/// without holding a lock. If we switch to an Iterator, it must not hold the lock.)
|
||||
pub fn get_bucket<'e>(&'e self, pos: usize) -> Option<ValueReadGuard<'e, K, V>> {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
let lock_guard = map.inner.read();
|
||||
|
||||
match lock_guard.get_bucket(pos) {
|
||||
None => None,
|
||||
Some((_key, val_ref)) => {
|
||||
let val_ptr = std::ptr::from_ref(val_ref);
|
||||
Some(ValueReadGuard {
|
||||
_lock_guard: lock_guard,
|
||||
value: val_ptr,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// for metrics
|
||||
pub fn get_num_buckets_in_use(&self) -> usize {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
map.inner.read().buckets_in_use as usize
|
||||
}
|
||||
|
||||
/// Grow
|
||||
///
|
||||
/// 1. grow the underlying shared memory area
|
||||
/// 2. Initialize new buckets. This overwrites the current dictionary
|
||||
/// 3. Recalculate the dictionary
|
||||
pub fn grow(&self, num_buckets: u32) -> Result<(), crate::shmem::Error> {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
let mut lock_guard = map.inner.write();
|
||||
let inner = &mut *lock_guard;
|
||||
let old_num_buckets = inner.buckets.len() as u32;
|
||||
|
||||
if num_buckets < old_num_buckets {
|
||||
panic!("grow called with a smaller number of buckets");
|
||||
}
|
||||
if num_buckets == old_num_buckets {
|
||||
return Ok(());
|
||||
}
|
||||
let shmem_handle = self
|
||||
.shmem_handle
|
||||
.as_ref()
|
||||
.expect("grow called on a fixed-size hash table");
|
||||
|
||||
let size_bytes = HashMapInit::<K, V>::estimate_size(num_buckets);
|
||||
shmem_handle.set_size(size_bytes)?;
|
||||
let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
|
||||
|
||||
// Initialize new buckets. The new buckets are linked to the free list. NB: This overwrites
|
||||
// the dictionary!
|
||||
let buckets_ptr = inner.buckets.as_mut_ptr();
|
||||
unsafe {
|
||||
for i in old_num_buckets..num_buckets {
|
||||
let bucket_ptr = buckets_ptr.add(i as usize);
|
||||
bucket_ptr.write(core::Bucket {
|
||||
hash: 0,
|
||||
next: if i < num_buckets {
|
||||
i as u32 + 1
|
||||
} else {
|
||||
inner.free_head
|
||||
},
|
||||
inner: None,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Recalculate the dictionary
|
||||
let buckets;
|
||||
let dictionary;
|
||||
unsafe {
|
||||
let buckets_end_ptr = buckets_ptr.add(num_buckets as usize);
|
||||
let dictionary_ptr: *mut u32 = buckets_end_ptr
|
||||
.byte_add(buckets_end_ptr.align_offset(align_of::<u32>()))
|
||||
.cast();
|
||||
let dictionary_size: usize =
|
||||
end_ptr.byte_offset_from(buckets_end_ptr) as usize / size_of::<u32>();
|
||||
|
||||
buckets = std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize);
|
||||
dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size);
|
||||
}
|
||||
for i in 0..dictionary.len() {
|
||||
dictionary[i] = core::INVALID_POS;
|
||||
}
|
||||
|
||||
for i in 0..old_num_buckets as usize {
|
||||
if buckets[i].inner.is_none() {
|
||||
continue;
|
||||
}
|
||||
let pos: usize = (buckets[i].hash % dictionary.len() as u64) as usize;
|
||||
buckets[i].next = dictionary[pos];
|
||||
dictionary[pos] = i as u32;
|
||||
}
|
||||
|
||||
// Finally, update the CoreHashMap struct
|
||||
inner.dictionary = dictionary;
|
||||
inner.buckets = buckets;
|
||||
inner.free_head = old_num_buckets;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// TODO: Shrinking is a multi-step process that requires co-operation from the caller
|
||||
//
|
||||
// 1. The caller must first call begin_shrink(). That forbids allocation of higher-numbered
|
||||
// buckets.
|
||||
//
|
||||
// 2. Next, the caller must evict all entries in higher-numbered buckets.
|
||||
//
|
||||
// 3. Finally, call finish_shrink(). This recomputes the dictionary and shrinks the underlying
|
||||
// shmem area
|
||||
}
|
||||
|
||||
pub struct ValueReadGuard<'a, K, V> {
|
||||
_lock_guard: spin::RwLockReadGuard<'a, CoreHashMap<'a, K, V>>,
|
||||
value: *const V,
|
||||
}
|
||||
|
||||
impl<'a, K, V> Deref for ValueReadGuard<'a, K, V> {
|
||||
type Target = V;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
// SAFETY: The `lock_guard` ensures that the underlying map (and thus the value pointed to
|
||||
// by `value`) remains valid for the lifetime `'a`. The `value` has been obtained from a
|
||||
// valid reference within the map.
|
||||
unsafe { &*self.value }
|
||||
}
|
||||
}
|
||||
233
libs/neon-shmem/src/hash/core.rs
Normal file
233
libs/neon-shmem/src/hash/core.rs
Normal file
@@ -0,0 +1,233 @@
|
||||
//! Simple hash table with chaining
|
||||
//!
|
||||
//! # Resizing
|
||||
//!
|
||||
|
||||
use std::hash::{DefaultHasher, Hash, Hasher};
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
pub(crate) const INVALID_POS: u32 = u32::MAX;
|
||||
|
||||
// Bucket
|
||||
pub(crate) struct Bucket<K, V> {
|
||||
pub(crate) hash: u64,
|
||||
pub(crate) next: u32,
|
||||
pub(crate) inner: Option<(K, V)>,
|
||||
}
|
||||
|
||||
pub(crate) struct CoreHashMap<'a, K, V> {
|
||||
pub(crate) dictionary: &'a mut [u32],
|
||||
pub(crate) buckets: &'a mut [Bucket<K, V>],
|
||||
pub(crate) free_head: u32,
|
||||
|
||||
// metrics
|
||||
pub(crate) buckets_in_use: u32,
|
||||
}
|
||||
|
||||
pub struct FullError();
|
||||
|
||||
impl<'a, K, V> CoreHashMap<'a, K, V>
|
||||
where
|
||||
K: Clone + Hash + Eq,
|
||||
{
|
||||
const FILL_FACTOR: f32 = 0.60;
|
||||
|
||||
pub fn estimate_size(num_buckets: u32) -> usize {
|
||||
let mut size = 0;
|
||||
|
||||
// buckets
|
||||
size += size_of::<Bucket<K, V>>() * num_buckets as usize;
|
||||
|
||||
// dictionary
|
||||
size += (f32::ceil((size_of::<u32>() * num_buckets as usize) as f32 / Self::FILL_FACTOR))
|
||||
as usize;
|
||||
|
||||
size
|
||||
}
|
||||
|
||||
pub fn new(num_buckets: u32, area: &'a mut [u8]) -> CoreHashMap<'a, K, V> {
|
||||
let len = area.len();
|
||||
|
||||
let mut ptr: *mut u8 = area.as_mut_ptr();
|
||||
let end_ptr: *mut u8 = unsafe { area.as_mut_ptr().add(len) };
|
||||
|
||||
// carve out the buckets
|
||||
ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<Bucket<K, V>>())) };
|
||||
let buckets_ptr = ptr;
|
||||
ptr = unsafe { ptr.add(size_of::<Bucket<K, V>>() * num_buckets as usize) };
|
||||
|
||||
// use remaining space for the dictionary
|
||||
ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<u32>())) };
|
||||
let dictionary_ptr = ptr;
|
||||
|
||||
assert!(ptr.addr() < end_ptr.addr());
|
||||
let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::<u32>() as isize };
|
||||
assert!(dictionary_size > 0);
|
||||
|
||||
// Initialize the buckets
|
||||
let buckets = {
|
||||
let buckets_ptr: *mut MaybeUninit<Bucket<K, V>> = buckets_ptr.cast();
|
||||
let buckets =
|
||||
unsafe { std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize) };
|
||||
for i in 0..buckets.len() {
|
||||
buckets[i].write(Bucket {
|
||||
hash: 0,
|
||||
next: if i < buckets.len() - 1 {
|
||||
i as u32 + 1
|
||||
} else {
|
||||
INVALID_POS
|
||||
},
|
||||
inner: None,
|
||||
});
|
||||
}
|
||||
// TODO: use std::slice::assume_init_mut() once it stabilizes
|
||||
unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), num_buckets as usize) }
|
||||
};
|
||||
|
||||
// Initialize the dictionary
|
||||
let dictionary = {
|
||||
let dictionary_ptr: *mut MaybeUninit<u32> = dictionary_ptr.cast();
|
||||
let dictionary =
|
||||
unsafe { std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size as usize) };
|
||||
|
||||
for i in 0..dictionary.len() {
|
||||
dictionary[i].write(INVALID_POS);
|
||||
}
|
||||
// TODO: use std::slice::assume_init_mut() once it stabilizes
|
||||
unsafe {
|
||||
std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize)
|
||||
}
|
||||
};
|
||||
|
||||
CoreHashMap {
|
||||
dictionary,
|
||||
buckets,
|
||||
free_head: 0,
|
||||
buckets_in_use: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, key: &K) -> Option<&V> {
|
||||
let mut hasher = DefaultHasher::new();
|
||||
key.hash(&mut hasher);
|
||||
let hash = hasher.finish();
|
||||
|
||||
let mut next = self.dictionary[hash as usize % self.dictionary.len()];
|
||||
loop {
|
||||
if next == INVALID_POS {
|
||||
return None;
|
||||
}
|
||||
|
||||
let bucket = &self.buckets[next as usize];
|
||||
let (bucket_key, bucket_value) = bucket.inner.as_ref().expect("entry is in use");
|
||||
if bucket_key == key {
|
||||
return Some(&bucket_value);
|
||||
}
|
||||
next = bucket.next;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, key: &K, value: V) -> Result<(), FullError> {
|
||||
let mut hasher = DefaultHasher::new();
|
||||
key.hash(&mut hasher);
|
||||
let hash = hasher.finish();
|
||||
|
||||
let first = self.dictionary[hash as usize % self.dictionary.len()];
|
||||
if first == INVALID_POS {
|
||||
// no existing entry
|
||||
let pos = self.alloc_bucket(key.clone(), value, hash)?;
|
||||
if pos == INVALID_POS {
|
||||
return Err(FullError());
|
||||
}
|
||||
self.dictionary[hash as usize % self.dictionary.len()] = pos;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut next = first;
|
||||
loop {
|
||||
let bucket = &mut self.buckets[next as usize];
|
||||
let (bucket_key, bucket_value) = bucket.inner.as_mut().expect("entry is in use");
|
||||
if bucket_key == key {
|
||||
// found existing entry, update its value
|
||||
*bucket_value = value;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if bucket.next == INVALID_POS {
|
||||
// No existing entry found. Append to the chain
|
||||
let pos = self.alloc_bucket(key.clone(), value, hash)?;
|
||||
if pos == INVALID_POS {
|
||||
return Err(FullError());
|
||||
}
|
||||
self.buckets[next as usize].next = pos;
|
||||
return Ok(());
|
||||
}
|
||||
next = bucket.next;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, key: &K) -> Result<(), FullError> {
|
||||
let mut hasher = DefaultHasher::new();
|
||||
key.hash(&mut hasher);
|
||||
let hash = hasher.finish();
|
||||
|
||||
let mut next = self.dictionary[hash as usize % self.dictionary.len()];
|
||||
let mut prev_pos: u32 = INVALID_POS;
|
||||
loop {
|
||||
if next == INVALID_POS {
|
||||
// no existing entry
|
||||
return Ok(());
|
||||
}
|
||||
let bucket = &mut self.buckets[next as usize];
|
||||
let (bucket_key, _) = bucket.inner.as_mut().expect("entry is in use");
|
||||
if bucket_key == key {
|
||||
// found existing entry, unlink it from the chain
|
||||
if prev_pos == INVALID_POS {
|
||||
self.dictionary[hash as usize % self.dictionary.len()] = bucket.next;
|
||||
} else {
|
||||
self.buckets[prev_pos as usize].next = bucket.next;
|
||||
}
|
||||
|
||||
// and add it to the freelist
|
||||
let bucket = &mut self.buckets[next as usize];
|
||||
bucket.hash = 0;
|
||||
bucket.inner = None;
|
||||
bucket.next = self.free_head;
|
||||
self.free_head = next;
|
||||
self.buckets_in_use -= 1;
|
||||
return Ok(());
|
||||
}
|
||||
prev_pos = next;
|
||||
next = bucket.next;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_num_buckets(&self) -> usize {
|
||||
self.buckets.len()
|
||||
}
|
||||
|
||||
pub fn get_bucket(&self, pos: usize) -> Option<&(K, V)> {
|
||||
if pos >= self.buckets.len() {
|
||||
return None;
|
||||
}
|
||||
|
||||
self.buckets[pos].inner.as_ref()
|
||||
}
|
||||
|
||||
fn alloc_bucket(&mut self, key: K, value: V, hash: u64) -> Result<u32, FullError> {
|
||||
let pos = self.free_head;
|
||||
if pos == INVALID_POS {
|
||||
return Err(FullError());
|
||||
}
|
||||
|
||||
let bucket = &mut self.buckets[pos as usize];
|
||||
self.free_head = bucket.next;
|
||||
self.buckets_in_use += 1;
|
||||
|
||||
bucket.hash = hash;
|
||||
bucket.next = INVALID_POS;
|
||||
bucket.inner = Some((key, value));
|
||||
|
||||
return Ok(pos);
|
||||
}
|
||||
}
|
||||
220
libs/neon-shmem/src/hash/tests.rs
Normal file
220
libs/neon-shmem/src/hash/tests.rs
Normal file
@@ -0,0 +1,220 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use crate::hash::HashMapAccess;
|
||||
use crate::hash::HashMapInit;
|
||||
use crate::hash::UpdateAction;
|
||||
use crate::shmem::ShmemHandle;
|
||||
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::{Rng, RngCore};
|
||||
use rand_distr::Zipf;
|
||||
|
||||
const TEST_KEY_LEN: usize = 16;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
|
||||
struct TestKey([u8; TEST_KEY_LEN]);
|
||||
|
||||
impl From<&TestKey> for u128 {
|
||||
fn from(val: &TestKey) -> u128 {
|
||||
u128::from_be_bytes(val.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u128> for TestKey {
|
||||
fn from(val: u128) -> TestKey {
|
||||
TestKey(val.to_be_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u8]> for TestKey {
|
||||
fn from(bytes: &'a [u8]) -> TestKey {
|
||||
TestKey(bytes.try_into().unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
|
||||
const MAX_MEM_SIZE: usize = 10000000;
|
||||
let shmem = ShmemHandle::new("test_inserts", 0, MAX_MEM_SIZE).unwrap();
|
||||
|
||||
let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(100000, shmem);
|
||||
let w = init_struct.attach_writer();
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let res = w.insert(&(*k).into(), idx);
|
||||
assert!(res.is_ok());
|
||||
}
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let x = w.get(&(*k).into());
|
||||
let value = x.as_deref().copied();
|
||||
assert_eq!(value, Some(idx));
|
||||
}
|
||||
|
||||
//eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dense() {
|
||||
// This exercises splitting a node with prefix
|
||||
let keys: &[u128] = &[0, 1, 2, 3, 256];
|
||||
test_inserts(keys);
|
||||
|
||||
// Dense keys
|
||||
let mut keys: Vec<u128> = (0..10000).collect();
|
||||
test_inserts(&keys);
|
||||
|
||||
// Do the same in random orders
|
||||
for _ in 1..10 {
|
||||
keys.shuffle(&mut rand::rng());
|
||||
test_inserts(&keys);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sparse() {
|
||||
// sparse keys
|
||||
let mut keys: Vec<TestKey> = Vec::new();
|
||||
let mut used_keys = HashSet::new();
|
||||
for _ in 0..10000 {
|
||||
loop {
|
||||
let key = rand::random::<u128>();
|
||||
if used_keys.get(&key).is_some() {
|
||||
continue;
|
||||
}
|
||||
used_keys.insert(key);
|
||||
keys.push(key.into());
|
||||
break;
|
||||
}
|
||||
}
|
||||
test_inserts(&keys);
|
||||
}
|
||||
|
||||
struct TestValue(AtomicUsize);
|
||||
|
||||
impl TestValue {
|
||||
fn new(val: usize) -> TestValue {
|
||||
TestValue(AtomicUsize::new(val))
|
||||
}
|
||||
|
||||
fn load(&self) -> usize {
|
||||
self.0.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for TestValue {
|
||||
fn clone(&self) -> TestValue {
|
||||
TestValue::new(self.load())
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for TestValue {
|
||||
fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
write!(fmt, "{:?}", self.load())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct TestOp(TestKey, Option<usize>);
|
||||
|
||||
fn apply_op(
|
||||
op: &TestOp,
|
||||
sut: &HashMapAccess<TestKey, TestValue>,
|
||||
shadow: &mut BTreeMap<TestKey, usize>,
|
||||
) {
|
||||
eprintln!("applying op: {op:?}");
|
||||
|
||||
// apply the change to the shadow tree first
|
||||
let shadow_existing = if let Some(v) = op.1 {
|
||||
shadow.insert(op.0, v)
|
||||
} else {
|
||||
shadow.remove(&op.0)
|
||||
};
|
||||
|
||||
// apply to Art tree
|
||||
sut.update_with_fn(&op.0, |existing| {
|
||||
assert_eq!(existing.map(TestValue::load), shadow_existing);
|
||||
|
||||
match (existing, op.1) {
|
||||
(None, None) => UpdateAction::Nothing,
|
||||
(None, Some(new_val)) => UpdateAction::Insert(TestValue::new(new_val)),
|
||||
(Some(_old_val), None) => UpdateAction::Remove,
|
||||
(Some(old_val), Some(new_val)) => {
|
||||
old_val.0.store(new_val, Ordering::Relaxed);
|
||||
UpdateAction::Nothing
|
||||
}
|
||||
}
|
||||
})
|
||||
.expect("out of memory");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn random_ops() {
|
||||
const MAX_MEM_SIZE: usize = 10000000;
|
||||
let shmem = ShmemHandle::new("test_inserts", 0, MAX_MEM_SIZE).unwrap();
|
||||
|
||||
let init_struct = HashMapInit::<TestKey, TestValue>::init_in_shmem(100000, shmem);
|
||||
let writer = init_struct.attach_writer();
|
||||
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
|
||||
let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
|
||||
let mut rng = rand::rng();
|
||||
for i in 0..100000 {
|
||||
let key: TestKey = (rng.sample(distribution) as u128).into();
|
||||
|
||||
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
|
||||
|
||||
apply_op(&op, &writer, &mut shadow);
|
||||
|
||||
if i % 1000 == 0 {
|
||||
eprintln!("{i} ops processed");
|
||||
//eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
//test_iter(&tree_writer, &shadow);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_grow() {
|
||||
const MEM_SIZE: usize = 10000000;
|
||||
let shmem = ShmemHandle::new("test_grow", 0, MEM_SIZE).unwrap();
|
||||
|
||||
let init_struct = HashMapInit::<TestKey, TestValue>::init_in_shmem(1000, shmem);
|
||||
let writer = init_struct.attach_writer();
|
||||
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
|
||||
let mut rng = rand::rng();
|
||||
for i in 0..10000 {
|
||||
let key: TestKey = ((rng.next_u32() % 1000) as u128).into();
|
||||
|
||||
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
|
||||
|
||||
apply_op(&op, &writer, &mut shadow);
|
||||
|
||||
if i % 1000 == 0 {
|
||||
eprintln!("{i} ops processed");
|
||||
//eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
//test_iter(&tree_writer, &shadow);
|
||||
}
|
||||
}
|
||||
|
||||
writer.grow(1500).unwrap();
|
||||
|
||||
for i in 0..10000 {
|
||||
let key: TestKey = ((rng.next_u32() % 1500) as u128).into();
|
||||
|
||||
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
|
||||
|
||||
apply_op(&op, &writer, &mut shadow);
|
||||
|
||||
if i % 1000 == 0 {
|
||||
eprintln!("{i} ops processed");
|
||||
//eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
//test_iter(&tree_writer, &shadow);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,418 +1,4 @@
|
||||
//! Shared memory utilities for neon communicator
|
||||
|
||||
use std::num::NonZeroUsize;
|
||||
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::mman::MapFlags;
|
||||
use nix::sys::mman::ProtFlags;
|
||||
use nix::sys::mman::mmap as nix_mmap;
|
||||
use nix::sys::mman::munmap as nix_munmap;
|
||||
use nix::unistd::ftruncate as nix_ftruncate;
|
||||
|
||||
/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
|
||||
/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
|
||||
/// specified at creation.
|
||||
///
|
||||
/// The area is backed by an anonymous file created with memfd_create(). The full address space for
|
||||
/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
|
||||
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
|
||||
/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
|
||||
/// future.
|
||||
pub struct ShmemHandle {
|
||||
/// memfd file descriptor
|
||||
fd: OwnedFd,
|
||||
|
||||
max_size: usize,
|
||||
|
||||
// Pointer to the beginning of the shared memory area. The header is stored there.
|
||||
shared_ptr: NonNull<SharedStruct>,
|
||||
|
||||
// Pointer to the beginning of the user data
|
||||
pub data_ptr: NonNull<u8>,
|
||||
}
|
||||
|
||||
/// This is stored at the beginning in the shared memory area.
|
||||
struct SharedStruct {
|
||||
max_size: usize,
|
||||
|
||||
/// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
|
||||
current_size: AtomicUsize,
|
||||
}
|
||||
|
||||
const RESIZE_IN_PROGRESS: usize = 1 << 63;
|
||||
|
||||
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
|
||||
|
||||
/// Error type returned by the ShmemHandle functions.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("{msg}: {errno}")]
|
||||
pub struct Error {
|
||||
pub msg: String,
|
||||
pub errno: Errno,
|
||||
}
|
||||
|
||||
impl Error {
|
||||
fn new(msg: &str, errno: Errno) -> Error {
|
||||
Error {
|
||||
msg: msg.to_string(),
|
||||
errno,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShmemHandle {
|
||||
/// Create a new shared memory area. To communicate between processes, the processes need to be
|
||||
/// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
|
||||
///
|
||||
/// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
|
||||
/// processes can continue using it, however.
|
||||
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
|
||||
// create the backing anonymous file.
|
||||
let fd = create_backing_file(name)?;
|
||||
|
||||
Self::new_with_fd(fd, initial_size, max_size)
|
||||
}
|
||||
|
||||
fn new_with_fd(
|
||||
fd: OwnedFd,
|
||||
initial_size: usize,
|
||||
max_size: usize,
|
||||
) -> Result<ShmemHandle, Error> {
|
||||
// We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
|
||||
// is a little larger than this because of the SharedStruct header. Make the upper limit
|
||||
// somewhat smaller than that, because with anything close to that, you'll run out of
|
||||
// memory anyway.
|
||||
if max_size >= 1 << 48 {
|
||||
panic!("max size {} too large", max_size);
|
||||
}
|
||||
if initial_size > max_size {
|
||||
panic!("initial size {initial_size} larger than max size {max_size}");
|
||||
}
|
||||
|
||||
// The actual initial / max size is the one given by the caller, plus the size of
|
||||
// 'SharedStruct'.
|
||||
let initial_size = HEADER_SIZE + initial_size;
|
||||
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
|
||||
|
||||
// Reserve address space for it with mmap
|
||||
//
|
||||
// TODO: Use MAP_HUGETLB if possible
|
||||
let start_ptr = unsafe {
|
||||
nix_mmap(
|
||||
None,
|
||||
max_size,
|
||||
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
|
||||
MapFlags::MAP_SHARED,
|
||||
&fd,
|
||||
0,
|
||||
)
|
||||
}
|
||||
.map_err(|e| Error::new("mmap failed: {e}", e))?;
|
||||
|
||||
// Reserve space for the initial size
|
||||
enlarge_file(fd.as_fd(), initial_size as u64)?;
|
||||
|
||||
// Initialize the header
|
||||
let shared: NonNull<SharedStruct> = start_ptr.cast();
|
||||
unsafe {
|
||||
shared.write(SharedStruct {
|
||||
max_size: max_size.into(),
|
||||
current_size: AtomicUsize::new(initial_size),
|
||||
})
|
||||
};
|
||||
|
||||
// The user data begins after the header
|
||||
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
|
||||
|
||||
Ok(ShmemHandle {
|
||||
fd,
|
||||
max_size: max_size.into(),
|
||||
shared_ptr: shared,
|
||||
data_ptr,
|
||||
})
|
||||
}
|
||||
|
||||
// return reference to the header
|
||||
fn shared(&self) -> &SharedStruct {
|
||||
unsafe { self.shared_ptr.as_ref() }
|
||||
}
|
||||
|
||||
/// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
|
||||
/// when creating the area.
|
||||
///
|
||||
/// This may only be called from one process/thread concurrently. We detect that case
|
||||
/// and return an Error.
|
||||
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
|
||||
let new_size = new_size + HEADER_SIZE;
|
||||
let shared = self.shared();
|
||||
|
||||
if new_size > self.max_size {
|
||||
panic!(
|
||||
"new size ({} is greater than max size ({})",
|
||||
new_size, self.max_size
|
||||
);
|
||||
}
|
||||
assert_eq!(self.max_size, shared.max_size);
|
||||
|
||||
// Lock the area by setting the bit in 'current_size'
|
||||
//
|
||||
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
|
||||
// and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
|
||||
// since this is not performance-critical, better safe than sorry .
|
||||
let mut old_size = shared.current_size.load(Ordering::Acquire);
|
||||
loop {
|
||||
if (old_size & RESIZE_IN_PROGRESS) != 0 {
|
||||
return Err(Error::new(
|
||||
"concurrent resize detected",
|
||||
Errno::UnknownErrno,
|
||||
));
|
||||
}
|
||||
match shared.current_size.compare_exchange(
|
||||
old_size,
|
||||
new_size,
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => break,
|
||||
Err(x) => old_size = x,
|
||||
}
|
||||
}
|
||||
|
||||
// Ok, we got the lock.
|
||||
//
|
||||
// NB: If anything goes wrong, we *must* clear the bit!
|
||||
let result = {
|
||||
use std::cmp::Ordering::{Equal, Greater, Less};
|
||||
match new_size.cmp(&old_size) {
|
||||
Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
|
||||
Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
|
||||
}),
|
||||
Equal => Ok(()),
|
||||
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
|
||||
}
|
||||
};
|
||||
|
||||
// Unlock
|
||||
shared.current_size.store(
|
||||
if result.is_ok() { new_size } else { old_size },
|
||||
Ordering::Release,
|
||||
);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Returns the current user-visible size of the shared memory segment.
|
||||
///
|
||||
/// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
|
||||
/// responsibility not to access the area beyond the current size.
|
||||
pub fn current_size(&self) -> usize {
|
||||
let total_current_size =
|
||||
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
|
||||
total_current_size - HEADER_SIZE
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ShmemHandle {
|
||||
fn drop(&mut self) {
|
||||
// SAFETY: The pointer was obtained from mmap() with the given size.
|
||||
// We unmap the entire region.
|
||||
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
|
||||
// The fd is dropped automatically by OwnedFd.
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
|
||||
/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
|
||||
/// development and testing, but in production we want the file to stay in memory.
|
||||
///
|
||||
/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
|
||||
#[allow(unused_variables)]
|
||||
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
|
||||
.map_err(|e| Error::new("memfd_create failed: {e}", e))
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
let file = tempfile::tempfile().map_err(|e| {
|
||||
Error::new(
|
||||
"could not create temporary file to back shmem area: {e}",
|
||||
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
|
||||
)
|
||||
})?;
|
||||
Ok(OwnedFd::from(file))
|
||||
}
|
||||
}
|
||||
|
||||
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
|
||||
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
|
||||
// we don't get a segfault later when trying to actually use it.
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
|
||||
Error::new(
|
||||
"could not grow shmem segment, posix_fallocate failed: {e}",
|
||||
e,
|
||||
)
|
||||
})
|
||||
}
|
||||
// As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
nix::unistd::ftruncate(fd, size as i64)
|
||||
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use nix::unistd::ForkResult;
|
||||
use std::ops::Range;
|
||||
|
||||
/// check that all bytes in given range have the expected value.
|
||||
fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
|
||||
for i in range {
|
||||
let b = unsafe { *(ptr.add(i)) };
|
||||
assert_eq!(expected, b, "unexpected byte at offset {}", i);
|
||||
}
|
||||
}
|
||||
|
||||
/// Write 'b' to all bytes in the given range
|
||||
fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
|
||||
unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
|
||||
}
|
||||
|
||||
// simple single-process test of growing and shrinking
|
||||
#[test]
|
||||
fn test_shmem_resize() -> Result<(), Error> {
|
||||
let max_size = 1024 * 1024;
|
||||
let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
|
||||
|
||||
assert_eq!(init_struct.current_size(), 0);
|
||||
|
||||
// Initial grow
|
||||
let size1 = 10000;
|
||||
init_struct.set_size(size1).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size1);
|
||||
|
||||
// Write some data
|
||||
let data_ptr = init_struct.data_ptr.as_ptr();
|
||||
write_range(data_ptr, 0xAA, 0..size1);
|
||||
assert_range(data_ptr, 0xAA, 0..size1);
|
||||
|
||||
// Shrink
|
||||
let size2 = 5000;
|
||||
init_struct.set_size(size2).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size2);
|
||||
|
||||
// Grow again
|
||||
let size3 = 20000;
|
||||
init_struct.set_size(size3).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size3);
|
||||
|
||||
// Try to read it. The area that was shrunk and grown again should read as all zeros now
|
||||
assert_range(data_ptr, 0xAA, 0..5000);
|
||||
assert_range(data_ptr, 0, 5000..size1);
|
||||
|
||||
// Try to grow beyond max_size
|
||||
//let size4 = max_size + 1;
|
||||
//assert!(init_struct.set_size(size4).is_err());
|
||||
|
||||
// Dropping init_struct should unmap the memory
|
||||
drop(init_struct);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
|
||||
/// but is stored in the shared memory area and works across processes. It's implemented by
|
||||
/// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
|
||||
struct SimpleBarrier {
|
||||
num_procs: usize,
|
||||
count: AtomicUsize,
|
||||
}
|
||||
|
||||
impl SimpleBarrier {
|
||||
unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
|
||||
unsafe {
|
||||
*ptr = SimpleBarrier {
|
||||
num_procs,
|
||||
count: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn wait(&self) {
|
||||
let old = self.count.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
let generation = old / self.num_procs;
|
||||
|
||||
let mut current = old + 1;
|
||||
while current < (generation + 1) * self.num_procs {
|
||||
std::thread::sleep(std::time::Duration::from_millis(10));
|
||||
current = self.count.load(Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multi_process() {
|
||||
// Initialize
|
||||
let max_size = 1_000_000_000_000;
|
||||
let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
|
||||
let ptr = init_struct.data_ptr.as_ptr();
|
||||
|
||||
// Store the SimpleBarrier in the first 1k of the area.
|
||||
init_struct.set_size(10000).unwrap();
|
||||
let barrier_ptr: *mut SimpleBarrier = unsafe {
|
||||
ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
|
||||
.cast()
|
||||
};
|
||||
unsafe { SimpleBarrier::init(barrier_ptr, 2) };
|
||||
let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
|
||||
|
||||
// Fork another test process. The code after this runs in both processes concurrently.
|
||||
let fork_result = unsafe { nix::unistd::fork().unwrap() };
|
||||
|
||||
// In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
|
||||
if fork_result.is_parent() {
|
||||
write_range(ptr, 0xAA, 1000..2000);
|
||||
} else {
|
||||
write_range(ptr, 0xBB, 2000..3000);
|
||||
}
|
||||
barrier.wait();
|
||||
// Verify the contents. (in both processes)
|
||||
assert_range(ptr, 0xAA, 1000..2000);
|
||||
assert_range(ptr, 0xBB, 2000..3000);
|
||||
|
||||
// Grow, from the child this time
|
||||
let size = 10_000_000;
|
||||
if !fork_result.is_parent() {
|
||||
init_struct.set_size(size).unwrap();
|
||||
}
|
||||
barrier.wait();
|
||||
|
||||
// make some writes at the end
|
||||
if fork_result.is_parent() {
|
||||
write_range(ptr, 0xAA, (size - 10)..size);
|
||||
} else {
|
||||
write_range(ptr, 0xBB, (size - 20)..(size - 10));
|
||||
}
|
||||
barrier.wait();
|
||||
|
||||
// Verify the contents. (This runs in both processes)
|
||||
assert_range(ptr, 0, (size - 1000)..(size - 20));
|
||||
assert_range(ptr, 0xBB, (size - 20)..(size - 10));
|
||||
assert_range(ptr, 0xAA, (size - 10)..size);
|
||||
|
||||
if let ForkResult::Parent { child } = fork_result {
|
||||
nix::sys::wait::waitpid(child, None).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
pub mod hash;
|
||||
pub mod shmem;
|
||||
|
||||
418
libs/neon-shmem/src/shmem.rs
Normal file
418
libs/neon-shmem/src/shmem.rs
Normal file
@@ -0,0 +1,418 @@
|
||||
//! Dynamically resizable contiguous chunk of shared memory
|
||||
|
||||
use std::num::NonZeroUsize;
|
||||
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::mman::MapFlags;
|
||||
use nix::sys::mman::ProtFlags;
|
||||
use nix::sys::mman::mmap as nix_mmap;
|
||||
use nix::sys::mman::munmap as nix_munmap;
|
||||
use nix::unistd::ftruncate as nix_ftruncate;
|
||||
|
||||
/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
|
||||
/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
|
||||
/// specified at creation.
|
||||
///
|
||||
/// The area is backed by an anonymous file created with memfd_create(). The full address space for
|
||||
/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
|
||||
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
|
||||
/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
|
||||
/// future.
|
||||
pub struct ShmemHandle {
|
||||
/// memfd file descriptor
|
||||
fd: OwnedFd,
|
||||
|
||||
max_size: usize,
|
||||
|
||||
// Pointer to the beginning of the shared memory area. The header is stored there.
|
||||
shared_ptr: NonNull<SharedStruct>,
|
||||
|
||||
// Pointer to the beginning of the user data
|
||||
pub data_ptr: NonNull<u8>,
|
||||
}
|
||||
|
||||
/// This is stored at the beginning in the shared memory area.
|
||||
struct SharedStruct {
|
||||
max_size: usize,
|
||||
|
||||
/// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
|
||||
current_size: AtomicUsize,
|
||||
}
|
||||
|
||||
const RESIZE_IN_PROGRESS: usize = 1 << 63;
|
||||
|
||||
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
|
||||
|
||||
/// Error type returned by the ShmemHandle functions.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("{msg}: {errno}")]
|
||||
pub struct Error {
|
||||
pub msg: String,
|
||||
pub errno: Errno,
|
||||
}
|
||||
|
||||
impl Error {
|
||||
fn new(msg: &str, errno: Errno) -> Error {
|
||||
Error {
|
||||
msg: msg.to_string(),
|
||||
errno,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShmemHandle {
|
||||
/// Create a new shared memory area. To communicate between processes, the processes need to be
|
||||
/// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
|
||||
///
|
||||
/// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
|
||||
/// processes can continue using it, however.
|
||||
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
|
||||
// create the backing anonymous file.
|
||||
let fd = create_backing_file(name)?;
|
||||
|
||||
Self::new_with_fd(fd, initial_size, max_size)
|
||||
}
|
||||
|
||||
fn new_with_fd(
|
||||
fd: OwnedFd,
|
||||
initial_size: usize,
|
||||
max_size: usize,
|
||||
) -> Result<ShmemHandle, Error> {
|
||||
// We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
|
||||
// is a little larger than this because of the SharedStruct header. Make the upper limit
|
||||
// somewhat smaller than that, because with anything close to that, you'll run out of
|
||||
// memory anyway.
|
||||
if max_size >= 1 << 48 {
|
||||
panic!("max size {} too large", max_size);
|
||||
}
|
||||
if initial_size > max_size {
|
||||
panic!("initial size {initial_size} larger than max size {max_size}");
|
||||
}
|
||||
|
||||
// The actual initial / max size is the one given by the caller, plus the size of
|
||||
// 'SharedStruct'.
|
||||
let initial_size = HEADER_SIZE + initial_size;
|
||||
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
|
||||
|
||||
// Reserve address space for it with mmap
|
||||
//
|
||||
// TODO: Use MAP_HUGETLB if possible
|
||||
let start_ptr = unsafe {
|
||||
nix_mmap(
|
||||
None,
|
||||
max_size,
|
||||
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
|
||||
MapFlags::MAP_SHARED,
|
||||
&fd,
|
||||
0,
|
||||
)
|
||||
}
|
||||
.map_err(|e| Error::new("mmap failed: {e}", e))?;
|
||||
|
||||
// Reserve space for the initial size
|
||||
enlarge_file(fd.as_fd(), initial_size as u64)?;
|
||||
|
||||
// Initialize the header
|
||||
let shared: NonNull<SharedStruct> = start_ptr.cast();
|
||||
unsafe {
|
||||
shared.write(SharedStruct {
|
||||
max_size: max_size.into(),
|
||||
current_size: AtomicUsize::new(initial_size),
|
||||
})
|
||||
};
|
||||
|
||||
// The user data begins after the header
|
||||
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
|
||||
|
||||
Ok(ShmemHandle {
|
||||
fd,
|
||||
max_size: max_size.into(),
|
||||
shared_ptr: shared,
|
||||
data_ptr,
|
||||
})
|
||||
}
|
||||
|
||||
// return reference to the header
|
||||
fn shared(&self) -> &SharedStruct {
|
||||
unsafe { self.shared_ptr.as_ref() }
|
||||
}
|
||||
|
||||
/// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
|
||||
/// when creating the area.
|
||||
///
|
||||
/// This may only be called from one process/thread concurrently. We detect that case
|
||||
/// and return an Error.
|
||||
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
|
||||
let new_size = new_size + HEADER_SIZE;
|
||||
let shared = self.shared();
|
||||
|
||||
if new_size > self.max_size {
|
||||
panic!(
|
||||
"new size ({} is greater than max size ({})",
|
||||
new_size, self.max_size
|
||||
);
|
||||
}
|
||||
assert_eq!(self.max_size, shared.max_size);
|
||||
|
||||
// Lock the area by setting the bit in 'current_size'
|
||||
//
|
||||
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
|
||||
// and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
|
||||
// since this is not performance-critical, better safe than sorry .
|
||||
let mut old_size = shared.current_size.load(Ordering::Acquire);
|
||||
loop {
|
||||
if (old_size & RESIZE_IN_PROGRESS) != 0 {
|
||||
return Err(Error::new(
|
||||
"concurrent resize detected",
|
||||
Errno::UnknownErrno,
|
||||
));
|
||||
}
|
||||
match shared.current_size.compare_exchange(
|
||||
old_size,
|
||||
new_size,
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => break,
|
||||
Err(x) => old_size = x,
|
||||
}
|
||||
}
|
||||
|
||||
// Ok, we got the lock.
|
||||
//
|
||||
// NB: If anything goes wrong, we *must* clear the bit!
|
||||
let result = {
|
||||
use std::cmp::Ordering::{Equal, Greater, Less};
|
||||
match new_size.cmp(&old_size) {
|
||||
Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
|
||||
Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
|
||||
}),
|
||||
Equal => Ok(()),
|
||||
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
|
||||
}
|
||||
};
|
||||
|
||||
// Unlock
|
||||
shared.current_size.store(
|
||||
if result.is_ok() { new_size } else { old_size },
|
||||
Ordering::Release,
|
||||
);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Returns the current user-visible size of the shared memory segment.
|
||||
///
|
||||
/// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
|
||||
/// responsibility not to access the area beyond the current size.
|
||||
pub fn current_size(&self) -> usize {
|
||||
let total_current_size =
|
||||
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
|
||||
total_current_size - HEADER_SIZE
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ShmemHandle {
|
||||
fn drop(&mut self) {
|
||||
// SAFETY: The pointer was obtained from mmap() with the given size.
|
||||
// We unmap the entire region.
|
||||
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
|
||||
// The fd is dropped automatically by OwnedFd.
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
|
||||
/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
|
||||
/// development and testing, but in production we want the file to stay in memory.
|
||||
///
|
||||
/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
|
||||
#[allow(unused_variables)]
|
||||
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
|
||||
.map_err(|e| Error::new("memfd_create failed: {e}", e))
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
let file = tempfile::tempfile().map_err(|e| {
|
||||
Error::new(
|
||||
"could not create temporary file to back shmem area: {e}",
|
||||
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
|
||||
)
|
||||
})?;
|
||||
Ok(OwnedFd::from(file))
|
||||
}
|
||||
}
|
||||
|
||||
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
|
||||
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
|
||||
// we don't get a segfault later when trying to actually use it.
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
|
||||
Error::new(
|
||||
"could not grow shmem segment, posix_fallocate failed: {e}",
|
||||
e,
|
||||
)
|
||||
})
|
||||
}
|
||||
// As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
nix::unistd::ftruncate(fd, size as i64)
|
||||
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use nix::unistd::ForkResult;
|
||||
use std::ops::Range;
|
||||
|
||||
/// check that all bytes in given range have the expected value.
|
||||
fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
|
||||
for i in range {
|
||||
let b = unsafe { *(ptr.add(i)) };
|
||||
assert_eq!(expected, b, "unexpected byte at offset {}", i);
|
||||
}
|
||||
}
|
||||
|
||||
/// Write 'b' to all bytes in the given range
|
||||
fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
|
||||
unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
|
||||
}
|
||||
|
||||
// simple single-process test of growing and shrinking
|
||||
#[test]
|
||||
fn test_shmem_resize() -> Result<(), Error> {
|
||||
let max_size = 1024 * 1024;
|
||||
let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
|
||||
|
||||
assert_eq!(init_struct.current_size(), 0);
|
||||
|
||||
// Initial grow
|
||||
let size1 = 10000;
|
||||
init_struct.set_size(size1).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size1);
|
||||
|
||||
// Write some data
|
||||
let data_ptr = init_struct.data_ptr.as_ptr();
|
||||
write_range(data_ptr, 0xAA, 0..size1);
|
||||
assert_range(data_ptr, 0xAA, 0..size1);
|
||||
|
||||
// Shrink
|
||||
let size2 = 5000;
|
||||
init_struct.set_size(size2).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size2);
|
||||
|
||||
// Grow again
|
||||
let size3 = 20000;
|
||||
init_struct.set_size(size3).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size3);
|
||||
|
||||
// Try to read it. The area that was shrunk and grown again should read as all zeros now
|
||||
assert_range(data_ptr, 0xAA, 0..5000);
|
||||
assert_range(data_ptr, 0, 5000..size1);
|
||||
|
||||
// Try to grow beyond max_size
|
||||
//let size4 = max_size + 1;
|
||||
//assert!(init_struct.set_size(size4).is_err());
|
||||
|
||||
// Dropping init_struct should unmap the memory
|
||||
drop(init_struct);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
|
||||
/// but is stored in the shared memory area and works across processes. It's implemented by
|
||||
/// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
|
||||
struct SimpleBarrier {
|
||||
num_procs: usize,
|
||||
count: AtomicUsize,
|
||||
}
|
||||
|
||||
impl SimpleBarrier {
|
||||
unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
|
||||
unsafe {
|
||||
*ptr = SimpleBarrier {
|
||||
num_procs,
|
||||
count: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn wait(&self) {
|
||||
let old = self.count.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
let generation = old / self.num_procs;
|
||||
|
||||
let mut current = old + 1;
|
||||
while current < (generation + 1) * self.num_procs {
|
||||
std::thread::sleep(std::time::Duration::from_millis(10));
|
||||
current = self.count.load(Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multi_process() {
|
||||
// Initialize
|
||||
let max_size = 1_000_000_000_000;
|
||||
let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
|
||||
let ptr = init_struct.data_ptr.as_ptr();
|
||||
|
||||
// Store the SimpleBarrier in the first 1k of the area.
|
||||
init_struct.set_size(10000).unwrap();
|
||||
let barrier_ptr: *mut SimpleBarrier = unsafe {
|
||||
ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
|
||||
.cast()
|
||||
};
|
||||
unsafe { SimpleBarrier::init(barrier_ptr, 2) };
|
||||
let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
|
||||
|
||||
// Fork another test process. The code after this runs in both processes concurrently.
|
||||
let fork_result = unsafe { nix::unistd::fork().unwrap() };
|
||||
|
||||
// In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
|
||||
if fork_result.is_parent() {
|
||||
write_range(ptr, 0xAA, 1000..2000);
|
||||
} else {
|
||||
write_range(ptr, 0xBB, 2000..3000);
|
||||
}
|
||||
barrier.wait();
|
||||
// Verify the contents. (in both processes)
|
||||
assert_range(ptr, 0xAA, 1000..2000);
|
||||
assert_range(ptr, 0xBB, 2000..3000);
|
||||
|
||||
// Grow, from the child this time
|
||||
let size = 10_000_000;
|
||||
if !fork_result.is_parent() {
|
||||
init_struct.set_size(size).unwrap();
|
||||
}
|
||||
barrier.wait();
|
||||
|
||||
// make some writes at the end
|
||||
if fork_result.is_parent() {
|
||||
write_range(ptr, 0xAA, (size - 10)..size);
|
||||
} else {
|
||||
write_range(ptr, 0xBB, (size - 20)..(size - 10));
|
||||
}
|
||||
barrier.wait();
|
||||
|
||||
// Verify the contents. (This runs in both processes)
|
||||
assert_range(ptr, 0, (size - 1000)..(size - 20));
|
||||
assert_range(ptr, 0xBB, (size - 20)..(size - 10));
|
||||
assert_range(ptr, 0xAA, (size - 10)..size);
|
||||
|
||||
if let ForkResult::Parent { child } = fork_result {
|
||||
nix::sys::wait::waitpid(child, None).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
14
libs/neonart/Cargo.toml
Normal file
14
libs/neonart/Cargo.toml
Normal file
@@ -0,0 +1,14 @@
|
||||
[package]
|
||||
name = "neonart"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
crossbeam-utils.workspace = true
|
||||
spin.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.9.1"
|
||||
rand_distr = "0.5.1"
|
||||
594
libs/neonart/src/algorithm.rs
Normal file
594
libs/neonart/src/algorithm.rs
Normal file
@@ -0,0 +1,594 @@
|
||||
mod lock_and_version;
|
||||
pub(crate) mod node_ptr;
|
||||
mod node_ref;
|
||||
|
||||
use std::vec::Vec;
|
||||
|
||||
use crate::algorithm::lock_and_version::ConcurrentUpdateError;
|
||||
use crate::algorithm::node_ptr::MAX_PREFIX_LEN;
|
||||
use crate::algorithm::node_ref::{NewNodeRef, NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
|
||||
use crate::allocator::OutOfMemoryError;
|
||||
|
||||
use crate::TreeWriteGuard;
|
||||
use crate::UpdateAction;
|
||||
use crate::allocator::ArtAllocator;
|
||||
use crate::epoch::EpochPin;
|
||||
use crate::{Key, Value};
|
||||
|
||||
pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum ArtError {
|
||||
ConcurrentUpdate, // need to retry
|
||||
OutOfMemory,
|
||||
}
|
||||
|
||||
impl From<ConcurrentUpdateError> for ArtError {
|
||||
fn from(_: ConcurrentUpdateError) -> ArtError {
|
||||
ArtError::ConcurrentUpdate
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OutOfMemoryError> for ArtError {
|
||||
fn from(_: OutOfMemoryError) -> ArtError {
|
||||
ArtError::OutOfMemory
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_root<V: Value>(
|
||||
allocator: &impl ArtAllocator<V>,
|
||||
) -> Result<RootPtr<V>, OutOfMemoryError> {
|
||||
node_ptr::new_root(allocator)
|
||||
}
|
||||
|
||||
pub(crate) fn search<'e, K: Key, V: Value>(
|
||||
key: &K,
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Option<&'e V> {
|
||||
loop {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
if let Ok(result) = lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) {
|
||||
break result;
|
||||
}
|
||||
// retry
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn iter_next<'e, V: Value>(
|
||||
key: &[u8],
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Option<(Vec<u8>, &'e V)> {
|
||||
loop {
|
||||
let mut path = Vec::new();
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
|
||||
match next_recurse(key, &mut path, root_ref, epoch_pin) {
|
||||
Ok(Some(v)) => {
|
||||
assert_eq!(path.len(), key.len());
|
||||
break Some((path, v));
|
||||
}
|
||||
Ok(None) => break None,
|
||||
Err(ConcurrentUpdateError()) => {
|
||||
// retry
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn update_fn<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
|
||||
key: &K,
|
||||
value_fn: F,
|
||||
root: RootPtr<V>,
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let value_fn_cell = std::cell::Cell::new(Some(value_fn));
|
||||
loop {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
|
||||
let key_bytes = key.as_bytes();
|
||||
|
||||
match update_recurse(
|
||||
key_bytes,
|
||||
this_value_fn,
|
||||
root_ref,
|
||||
None,
|
||||
None,
|
||||
guard,
|
||||
0,
|
||||
key_bytes,
|
||||
) {
|
||||
Ok(()) => break Ok(()),
|
||||
Err(ArtError::ConcurrentUpdate) => {
|
||||
continue; // retry
|
||||
}
|
||||
Err(ArtError::OutOfMemory) => break Err(OutOfMemoryError()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Error means you must retry.
|
||||
//
|
||||
// This corresponds to the 'lookupOpt' function in the paper
|
||||
fn lookup_recurse<'e, V: Value>(
|
||||
key: &[u8],
|
||||
node: NodeRef<'e, V>,
|
||||
parent: Option<ReadLockedNodeRef<V>>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Result<Option<&'e V>, ConcurrentUpdateError> {
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
if let Some(parent) = parent {
|
||||
parent.read_unlock_or_restart()?;
|
||||
}
|
||||
|
||||
// check if the prefix matches, may increment level
|
||||
let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
|
||||
prefix_len
|
||||
} else {
|
||||
rnode.read_unlock_or_restart()?;
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
if rnode.is_leaf() {
|
||||
assert_eq!(key.len(), prefix_len);
|
||||
let vptr = rnode.get_leaf_value_ptr()?;
|
||||
// safety: It's OK to return a ref of the pointer because we checked the version
|
||||
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
|
||||
// as long as the epoch is pinned.
|
||||
let v = unsafe { vptr.as_ref().unwrap() };
|
||||
return Ok(Some(v));
|
||||
}
|
||||
|
||||
let key = &key[prefix_len..];
|
||||
|
||||
// find child (or leaf value)
|
||||
let next_node = rnode.find_child_or_restart(key[0])?;
|
||||
|
||||
match next_node {
|
||||
None => Ok(None), // key not found
|
||||
Some(child) => lookup_recurse(&key[1..], child, Some(rnode), epoch_pin),
|
||||
}
|
||||
}
|
||||
|
||||
fn next_recurse<'e, V: Value>(
|
||||
min_key: &[u8],
|
||||
path: &mut Vec<u8>,
|
||||
node: NodeRef<'e, V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Result<Option<&'e V>, ConcurrentUpdateError> {
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
let prefix = rnode.get_prefix();
|
||||
if prefix.len() != 0 {
|
||||
path.extend_from_slice(prefix);
|
||||
}
|
||||
|
||||
use std::cmp::Ordering;
|
||||
let comparison = path.as_slice().cmp(&min_key[0..path.len()]);
|
||||
if comparison == Ordering::Less {
|
||||
rnode.read_unlock_or_restart()?;
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if rnode.is_leaf() {
|
||||
assert_eq!(path.len(), min_key.len());
|
||||
let vptr = rnode.get_leaf_value_ptr()?;
|
||||
// safety: It's OK to return a ref of the pointer because we checked the version
|
||||
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
|
||||
// as long as the epoch is pinned.
|
||||
let v = unsafe { vptr.as_ref().unwrap() };
|
||||
return Ok(Some(v));
|
||||
}
|
||||
|
||||
let mut min_key_byte = match comparison {
|
||||
Ordering::Less => unreachable!(), // checked this above already
|
||||
Ordering::Equal => min_key[path.len()],
|
||||
Ordering::Greater => 0,
|
||||
};
|
||||
|
||||
loop {
|
||||
match rnode.find_next_child_or_restart(min_key_byte)? {
|
||||
None => {
|
||||
return Ok(None);
|
||||
}
|
||||
Some((key_byte, child_ref)) => {
|
||||
let path_len = path.len();
|
||||
path.push(key_byte);
|
||||
let result = next_recurse(min_key, path, child_ref, epoch_pin)?;
|
||||
if result.is_some() {
|
||||
return Ok(result);
|
||||
}
|
||||
if key_byte == u8::MAX {
|
||||
return Ok(None);
|
||||
}
|
||||
path.truncate(path_len);
|
||||
min_key_byte = key_byte + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This corresponds to the 'insertOpt' function in the paper
|
||||
pub(crate) fn update_recurse<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
|
||||
key: &[u8],
|
||||
value_fn: F,
|
||||
node: NodeRef<'e, V>,
|
||||
rparent: Option<(ReadLockedNodeRef<V>, u8)>,
|
||||
rgrandparent: Option<(ReadLockedNodeRef<V>, u8)>,
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
level: usize,
|
||||
orig_key: &[u8],
|
||||
) -> Result<(), ArtError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
|
||||
let prefix_match_len = rnode.prefix_matches(key);
|
||||
if prefix_match_len.is_none() {
|
||||
let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
match value_fn(None) {
|
||||
UpdateAction::Nothing => {}
|
||||
UpdateAction::Insert(new_value) => {
|
||||
insert_split_prefix(key, new_value, &mut wnode, &mut wparent, parent_key, guard)?;
|
||||
}
|
||||
UpdateAction::Remove => {
|
||||
panic!("unexpected Remove action on insertion");
|
||||
}
|
||||
}
|
||||
wnode.write_unlock();
|
||||
wparent.write_unlock();
|
||||
return Ok(());
|
||||
}
|
||||
let prefix_match_len = prefix_match_len.unwrap();
|
||||
let key = &key[prefix_match_len as usize..];
|
||||
let level = level + prefix_match_len as usize;
|
||||
|
||||
if rnode.is_leaf() {
|
||||
assert_eq!(key.len(), 0);
|
||||
let (rparent, parent_key) = rparent.expect("root cannot be leaf");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
// safety: Now that we have acquired the write lock, we have exclusive access to the
|
||||
// value. XXX: There might be concurrent reads though?
|
||||
let value_mut = wnode.get_leaf_value_mut();
|
||||
|
||||
match value_fn(Some(value_mut)) {
|
||||
UpdateAction::Nothing => {
|
||||
wparent.write_unlock();
|
||||
wnode.write_unlock();
|
||||
}
|
||||
UpdateAction::Insert(_) => panic!("cannot insert over existing value"),
|
||||
UpdateAction::Remove => {
|
||||
guard.remember_obsolete_node(wnode.as_ptr());
|
||||
wparent.delete_child(parent_key);
|
||||
wnode.write_unlock_obsolete();
|
||||
|
||||
if let Some(rgrandparent) = rgrandparent {
|
||||
// FIXME: Ignore concurrency error. It doesn't lead to
|
||||
// corruption, but it means we might leak something. Until
|
||||
// another update cleans it up.
|
||||
let _ = cleanup_parent(wparent, rgrandparent, guard);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let next_node = rnode.find_child_or_restart(key[0])?;
|
||||
|
||||
if next_node.is_none() {
|
||||
if rnode.is_full() {
|
||||
let (rparent, parent_key) = rparent.expect("root node cannot become full");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
match value_fn(None) {
|
||||
UpdateAction::Nothing => {
|
||||
wnode.write_unlock();
|
||||
wparent.write_unlock();
|
||||
}
|
||||
UpdateAction::Insert(new_value) => {
|
||||
insert_and_grow(key, new_value, wnode, &mut wparent, parent_key, guard)?;
|
||||
wparent.write_unlock();
|
||||
}
|
||||
UpdateAction::Remove => {
|
||||
panic!("unexpected Remove action on insertion");
|
||||
}
|
||||
};
|
||||
} else {
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
if let Some((rparent, _)) = rparent {
|
||||
rparent.read_unlock_or_restart()?;
|
||||
}
|
||||
match value_fn(None) {
|
||||
UpdateAction::Nothing => {}
|
||||
UpdateAction::Insert(new_value) => {
|
||||
insert_to_node(&mut wnode, key, new_value, guard)?;
|
||||
}
|
||||
UpdateAction::Remove => {
|
||||
panic!("unexpected Remove action on insertion");
|
||||
}
|
||||
};
|
||||
wnode.write_unlock();
|
||||
}
|
||||
return Ok(());
|
||||
} else {
|
||||
let next_child = next_node.unwrap(); // checked above it's not None
|
||||
if let Some((ref rparent, _)) = rparent {
|
||||
rparent.check_or_restart()?;
|
||||
}
|
||||
|
||||
// recurse to next level
|
||||
update_recurse(
|
||||
&key[1..],
|
||||
value_fn,
|
||||
next_child,
|
||||
Some((rnode, key[0])),
|
||||
rparent,
|
||||
guard,
|
||||
level + 1,
|
||||
orig_key,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
enum PathElement {
|
||||
Prefix(Vec<u8>),
|
||||
KeyByte(u8),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for PathElement {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
match self {
|
||||
PathElement::Prefix(prefix) => write!(fmt, "{:?}", prefix),
|
||||
PathElement::KeyByte(key_byte) => write!(fmt, "{}", key_byte),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn dump_tree<'e, V: Value + std::fmt::Debug>(
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
dst: &mut dyn std::io::Write,
|
||||
) {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
|
||||
let _ = dump_recurse(&[], root_ref, &epoch_pin, 0, dst);
|
||||
}
|
||||
|
||||
// TODO: return an Err if writeln!() returns error, instead of unwrapping
|
||||
fn dump_recurse<'e, V: Value + std::fmt::Debug>(
|
||||
path: &[PathElement],
|
||||
node: NodeRef<'e, V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
level: usize,
|
||||
dst: &mut dyn std::io::Write,
|
||||
) -> Result<(), ConcurrentUpdateError> {
|
||||
let indent = str::repeat(" ", level);
|
||||
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
let mut path = Vec::from(path);
|
||||
let prefix = rnode.get_prefix();
|
||||
if prefix.len() != 0 {
|
||||
path.push(PathElement::Prefix(Vec::from(prefix)));
|
||||
}
|
||||
|
||||
if rnode.is_leaf() {
|
||||
let vptr = rnode.get_leaf_value_ptr()?;
|
||||
// safety: It's OK to return a ref of the pointer because we checked the version
|
||||
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
|
||||
// as long as the epoch is pinned.
|
||||
let val = unsafe { vptr.as_ref().unwrap() };
|
||||
writeln!(dst, "{} {:?}: {:?}", indent, path, val).unwrap();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
for key_byte in 0..=u8::MAX {
|
||||
match rnode.find_child_or_restart(key_byte)? {
|
||||
None => continue,
|
||||
Some(child_ref) => {
|
||||
let rchild = child_ref.read_lock_or_restart()?;
|
||||
writeln!(
|
||||
dst,
|
||||
"{} {:?}, {}: prefix {:?}",
|
||||
indent,
|
||||
&path,
|
||||
key_byte,
|
||||
rchild.get_prefix()
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let mut child_path = path.clone();
|
||||
child_path.push(PathElement::KeyByte(key_byte));
|
||||
|
||||
dump_recurse(&child_path, child_ref, epoch_pin, level + 1, dst)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///```text
|
||||
/// [fooba]r -> value
|
||||
///
|
||||
/// [foo]b -> [a]r -> value
|
||||
/// e -> [ls]e -> value
|
||||
///```
|
||||
fn insert_split_prefix<'e, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
node: &mut WriteLockedNodeRef<V>,
|
||||
parent: &mut WriteLockedNodeRef<V>,
|
||||
parent_key: u8,
|
||||
guard: &'e TreeWriteGuard<K, V, A>,
|
||||
) -> Result<(), OutOfMemoryError> {
|
||||
let old_node = node;
|
||||
let old_prefix = old_node.get_prefix();
|
||||
let common_prefix_len = common_prefix(key, old_prefix);
|
||||
|
||||
// Allocate a node for the new value.
|
||||
let new_value_node = allocate_node_for_value(
|
||||
&key[common_prefix_len + 1..],
|
||||
value,
|
||||
guard.tree_writer.allocator,
|
||||
)?;
|
||||
|
||||
// Allocate a new internal node with the common prefix
|
||||
// FIXME: deallocate 'new_value_node' on OOM
|
||||
let mut prefix_node =
|
||||
node_ref::new_internal(&key[..common_prefix_len], guard.tree_writer.allocator)?;
|
||||
|
||||
// Add the old node and the new nodes to the new internal node
|
||||
prefix_node.insert_old_child(old_prefix[common_prefix_len], old_node);
|
||||
prefix_node.insert_new_child(key[common_prefix_len], new_value_node);
|
||||
|
||||
// Modify the prefix of the old child in place
|
||||
old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
|
||||
|
||||
// replace the pointer in the parent
|
||||
parent.replace_child(parent_key, prefix_node.into_ptr());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_to_node<'e, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
wnode: &mut WriteLockedNodeRef<V>,
|
||||
key: &[u8],
|
||||
value: V,
|
||||
guard: &'e TreeWriteGuard<K, V, A>,
|
||||
) -> Result<(), OutOfMemoryError> {
|
||||
let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
|
||||
wnode.insert_child(key[0], value_child.into_ptr());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// On entry: 'parent' and 'node' are locked
|
||||
fn insert_and_grow<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
wnode: WriteLockedNodeRef<V>,
|
||||
parent: &mut WriteLockedNodeRef<V>,
|
||||
parent_key_byte: u8,
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
) -> Result<(), ArtError> {
|
||||
let mut bigger_node = wnode.grow(guard.tree_writer.allocator)?;
|
||||
|
||||
// FIXME: deallocate 'bigger_node' on OOM
|
||||
let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
|
||||
bigger_node.insert_new_child(key[0], value_child);
|
||||
|
||||
// Replace the pointer in the parent
|
||||
parent.replace_child(parent_key_byte, bigger_node.into_ptr());
|
||||
|
||||
guard.remember_obsolete_node(wnode.as_ptr());
|
||||
wnode.write_unlock_obsolete();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cleanup_parent<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
wparent: WriteLockedNodeRef<V>,
|
||||
rgrandparent: (ReadLockedNodeRef<V>, u8),
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
) -> Result<(), ArtError> {
|
||||
let (rgrandparent, grandparent_key_byte) = rgrandparent;
|
||||
|
||||
// If the parent becomes completely empty after the deletion, remove the parent from the
|
||||
// grandparent. (This case is possible because we reserve only 8 bytes for the prefix.)
|
||||
// TODO: not implemented.
|
||||
|
||||
// If the parent has only one child, replace the parent with the remaining child. (This is not
|
||||
// possible if the child's prefix field cannot absorb the parent's)
|
||||
if wparent.num_children() == 1 {
|
||||
// Try to lock the remaining child. This can fail if the child is updated
|
||||
// concurrently.
|
||||
let (key_byte, remaining_child) = wparent.find_remaining_child();
|
||||
|
||||
let mut wremaining_child = remaining_child.write_lock_or_restart()?;
|
||||
|
||||
if 1 + wremaining_child.get_prefix().len() + wparent.get_prefix().len() <= MAX_PREFIX_LEN {
|
||||
let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
// Ok, we have locked the leaf, the parent, the grandparent, and the parent's only
|
||||
// remaining leaf. Proceed with the updates.
|
||||
|
||||
// Update the prefix on the remaining leaf
|
||||
wremaining_child.prepend_prefix(wparent.get_prefix(), key_byte);
|
||||
|
||||
// Replace the pointer in the grandparent to point directly to the remaining leaf
|
||||
wgrandparent.replace_child(grandparent_key_byte, wremaining_child.as_ptr());
|
||||
|
||||
// Mark the parent as deleted.
|
||||
guard.remember_obsolete_node(wparent.as_ptr());
|
||||
wparent.write_unlock_obsolete();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
// If the parent's children would fit on a smaller node type after the deletion, replace it with
|
||||
// a smaller node.
|
||||
if wparent.can_shrink() {
|
||||
let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
|
||||
let smaller_node = wparent.shrink(guard.tree_writer.allocator)?;
|
||||
|
||||
// Replace the pointer in the grandparent
|
||||
wgrandparent.replace_child(grandparent_key_byte, smaller_node.into_ptr());
|
||||
|
||||
guard.remember_obsolete_node(wparent.as_ptr());
|
||||
wparent.write_unlock_obsolete();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// nothing to do
|
||||
wparent.write_unlock();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Allocate a new leaf node to hold 'value'. If the key is long, we
|
||||
// may need to allocate new internal nodes to hold it too
|
||||
fn allocate_node_for_value<'a, V: Value, A: ArtAllocator<V>>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError> {
|
||||
let mut prefix_off = key.len().saturating_sub(MAX_PREFIX_LEN);
|
||||
|
||||
let leaf_node = node_ref::new_leaf(&key[prefix_off..key.len()], value, allocator)?;
|
||||
|
||||
let mut node = leaf_node;
|
||||
while prefix_off > 0 {
|
||||
// Need another internal node
|
||||
let remain_prefix = &key[0..prefix_off];
|
||||
|
||||
prefix_off = remain_prefix.len().saturating_sub(MAX_PREFIX_LEN + 1);
|
||||
let mut internal_node = node_ref::new_internal(
|
||||
&remain_prefix[prefix_off..remain_prefix.len() - 1],
|
||||
allocator,
|
||||
)?;
|
||||
internal_node.insert_new_child(*remain_prefix.last().unwrap(), node);
|
||||
node = internal_node;
|
||||
}
|
||||
|
||||
Ok(node)
|
||||
}
|
||||
|
||||
fn common_prefix(a: &[u8], b: &[u8]) -> usize {
|
||||
for i in 0..MAX_PREFIX_LEN {
|
||||
if a[i] != b[i] {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
panic!("prefixes are equal");
|
||||
}
|
||||
117
libs/neonart/src/algorithm/lock_and_version.rs
Normal file
117
libs/neonart/src/algorithm/lock_and_version.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
//! Each node in the tree has contains one atomic word that stores three things:
|
||||
//!
|
||||
//! Bit 0: set if the node is "obsolete". An obsolete node has been removed from the tree,
|
||||
//! but might still be accessed by concurrent readers until the epoch expires.
|
||||
//! Bit 1: set if the node is currently write-locked. Used as a spinlock.
|
||||
//! Bits 2-63: Version number, incremented every time the node is modified.
|
||||
//!
|
||||
//! AtomicLockAndVersion represents that.
|
||||
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
pub(crate) struct ConcurrentUpdateError();
|
||||
|
||||
pub(crate) struct AtomicLockAndVersion {
|
||||
inner: AtomicU64,
|
||||
}
|
||||
|
||||
impl AtomicLockAndVersion {
|
||||
pub(crate) fn new() -> AtomicLockAndVersion {
|
||||
AtomicLockAndVersion {
|
||||
inner: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AtomicLockAndVersion {
|
||||
pub(crate) fn read_lock_or_restart(&self) -> Result<u64, ConcurrentUpdateError> {
|
||||
let version = self.await_node_unlocked();
|
||||
if is_obsolete(version) {
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(version)
|
||||
}
|
||||
|
||||
pub(crate) fn check_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
|
||||
self.read_unlock_or_restart(version)
|
||||
}
|
||||
|
||||
pub(crate) fn read_unlock_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
|
||||
if self.inner.load(Ordering::Acquire) != version {
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn upgrade_to_write_lock_or_restart(
|
||||
&self,
|
||||
version: u64,
|
||||
) -> Result<(), ConcurrentUpdateError> {
|
||||
if self
|
||||
.inner
|
||||
.compare_exchange(
|
||||
version,
|
||||
set_locked_bit(version),
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
)
|
||||
.is_err()
|
||||
{
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn write_lock_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
|
||||
let old = self.inner.load(Ordering::Relaxed);
|
||||
if is_obsolete(old) || is_locked(old) {
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
if self
|
||||
.inner
|
||||
.compare_exchange(
|
||||
old,
|
||||
set_locked_bit(old),
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
)
|
||||
.is_err()
|
||||
{
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock(&self) {
|
||||
// reset locked bit and overflow into version
|
||||
self.inner.fetch_add(2, Ordering::Release);
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock_obsolete(&self) {
|
||||
// set obsolete, reset locked, overflow into version
|
||||
self.inner.fetch_add(3, Ordering::Release);
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
fn await_node_unlocked(&self) -> u64 {
|
||||
let mut version = self.inner.load(Ordering::Acquire);
|
||||
while is_locked(version) {
|
||||
// spinlock
|
||||
std::thread::yield_now();
|
||||
version = self.inner.load(Ordering::Acquire)
|
||||
}
|
||||
version
|
||||
}
|
||||
}
|
||||
|
||||
fn set_locked_bit(version: u64) -> u64 {
|
||||
return version + 2;
|
||||
}
|
||||
|
||||
fn is_obsolete(version: u64) -> bool {
|
||||
return (version & 1) == 1;
|
||||
}
|
||||
|
||||
fn is_locked(version: u64) -> bool {
|
||||
return (version & 2) == 2;
|
||||
}
|
||||
1102
libs/neonart/src/algorithm/node_ptr.rs
Normal file
1102
libs/neonart/src/algorithm/node_ptr.rs
Normal file
File diff suppressed because it is too large
Load Diff
349
libs/neonart/src/algorithm/node_ref.rs
Normal file
349
libs/neonart/src/algorithm/node_ref.rs
Normal file
@@ -0,0 +1,349 @@
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use super::node_ptr;
|
||||
use super::node_ptr::NodePtr;
|
||||
use crate::EpochPin;
|
||||
use crate::Value;
|
||||
use crate::algorithm::lock_and_version::AtomicLockAndVersion;
|
||||
use crate::algorithm::lock_and_version::ConcurrentUpdateError;
|
||||
use crate::allocator::ArtAllocator;
|
||||
use crate::allocator::OutOfMemoryError;
|
||||
|
||||
pub struct NodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
|
||||
phantom: PhantomData<&'e EpochPin<'e>>,
|
||||
}
|
||||
|
||||
impl<'e, V> Debug for NodeRef<'e, V> {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
write!(fmt, "{:?}", self.ptr)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'e, V: Value> NodeRef<'e, V> {
|
||||
pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
|
||||
NodeRef {
|
||||
ptr: root_ptr,
|
||||
phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn read_lock_or_restart(
|
||||
&self,
|
||||
) -> Result<ReadLockedNodeRef<'e, V>, ConcurrentUpdateError> {
|
||||
let version = self.lockword().read_lock_or_restart()?;
|
||||
Ok(ReadLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
version,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn write_lock_or_restart(
|
||||
&self,
|
||||
) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
|
||||
self.lockword().write_lock_or_restart()?;
|
||||
Ok(WriteLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
fn lockword(&self) -> &AtomicLockAndVersion {
|
||||
self.ptr.lockword()
|
||||
}
|
||||
}
|
||||
|
||||
/// A reference to a node that has been optimistically read-locked. The functions re-check
|
||||
/// the version after each read.
|
||||
pub struct ReadLockedNodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
version: u64,
|
||||
|
||||
phantom: PhantomData<&'e EpochPin<'e>>,
|
||||
}
|
||||
|
||||
impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
|
||||
pub(crate) fn is_leaf(&self) -> bool {
|
||||
self.ptr.is_leaf()
|
||||
}
|
||||
|
||||
pub(crate) fn is_full(&self) -> bool {
|
||||
self.ptr.is_full()
|
||||
}
|
||||
|
||||
pub(crate) fn get_prefix(&self) -> &[u8] {
|
||||
self.ptr.get_prefix()
|
||||
}
|
||||
|
||||
/// Note: because we're only holding a read lock, the prefix can change concurrently.
|
||||
/// You must be prepared to restart, if read_unlock() returns error later.
|
||||
///
|
||||
/// Returns the length of the prefix, or None if it's not a match
|
||||
pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
|
||||
self.ptr.prefix_matches(key)
|
||||
}
|
||||
|
||||
pub(crate) fn find_child_or_restart(
|
||||
&self,
|
||||
key_byte: u8,
|
||||
) -> Result<Option<NodeRef<'e, V>>, ConcurrentUpdateError> {
|
||||
let child_or_value = self.ptr.find_child(key_byte);
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
match child_or_value {
|
||||
None => Ok(None),
|
||||
Some(child_ptr) => Ok(Some(NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
})),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn find_next_child_or_restart(
|
||||
&self,
|
||||
min_key_byte: u8,
|
||||
) -> Result<Option<(u8, NodeRef<'e, V>)>, ConcurrentUpdateError> {
|
||||
let child_or_value = self.ptr.find_next_child(min_key_byte);
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
match child_or_value {
|
||||
None => Ok(None),
|
||||
Some((k, child_ptr)) => Ok(Some((
|
||||
k,
|
||||
NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
},
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_leaf_value_ptr(&self) -> Result<*const V, ConcurrentUpdateError> {
|
||||
let result = self.ptr.get_leaf_value();
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
// Extend the lifetime.
|
||||
let result = std::ptr::from_ref(result);
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub(crate) fn upgrade_to_write_lock_or_restart(
|
||||
self,
|
||||
) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
|
||||
self.ptr
|
||||
.lockword()
|
||||
.upgrade_to_write_lock_or_restart(self.version)?;
|
||||
|
||||
Ok(WriteLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn read_unlock_or_restart(self) -> Result<(), ConcurrentUpdateError> {
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn check_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A reference to a node that has been optimistically read-locked. The functions re-check
|
||||
/// the version after each read.
|
||||
pub struct WriteLockedNodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
phantom: PhantomData<&'e EpochPin<'e>>,
|
||||
}
|
||||
|
||||
impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
|
||||
pub(crate) fn can_shrink(&self) -> bool {
|
||||
self.ptr.can_shrink()
|
||||
}
|
||||
|
||||
pub(crate) fn num_children(&self) -> usize {
|
||||
self.ptr.num_children()
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock(mut self) {
|
||||
self.ptr.lockword().write_unlock();
|
||||
self.ptr = NodePtr::null();
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock_obsolete(mut self) {
|
||||
self.ptr.lockword().write_unlock_obsolete();
|
||||
self.ptr = NodePtr::null();
|
||||
}
|
||||
|
||||
pub(crate) fn get_prefix(&self) -> &[u8] {
|
||||
self.ptr.get_prefix()
|
||||
}
|
||||
|
||||
pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
self.ptr.truncate_prefix(new_prefix_len)
|
||||
}
|
||||
|
||||
pub(crate) fn prepend_prefix(&mut self, prefix: &[u8], prefix_byte: u8) {
|
||||
self.ptr.prepend_prefix(prefix, prefix_byte)
|
||||
}
|
||||
|
||||
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
self.ptr.insert_child(key_byte, child)
|
||||
}
|
||||
|
||||
pub(crate) fn get_leaf_value_mut(&mut self) -> &mut V {
|
||||
self.ptr.get_leaf_value_mut()
|
||||
}
|
||||
|
||||
pub(crate) fn grow<'a, A>(
|
||||
&self,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
let new_node = self.ptr.grow(allocator)?;
|
||||
Ok(NewNodeRef {
|
||||
ptr: new_node,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn shrink<'a, A>(
|
||||
&self,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
let new_node = self.ptr.shrink(allocator)?;
|
||||
Ok(NewNodeRef {
|
||||
ptr: new_node,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn as_ptr(&self) -> NodePtr<V> {
|
||||
self.ptr
|
||||
}
|
||||
|
||||
pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
self.ptr.replace_child(key_byte, replacement);
|
||||
}
|
||||
|
||||
pub(crate) fn delete_child(&mut self, key_byte: u8) {
|
||||
self.ptr.delete_child(key_byte);
|
||||
}
|
||||
|
||||
pub(crate) fn find_remaining_child(&self) -> (u8, NodeRef<'e, V>) {
|
||||
assert_eq!(self.num_children(), 1);
|
||||
let child_or_value = self.ptr.find_next_child(0);
|
||||
|
||||
match child_or_value {
|
||||
None => panic!("could not find only child in node"),
|
||||
Some((k, child_ptr)) => (
|
||||
k,
|
||||
NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
self.ptr.lockword().write_unlock();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct NewNodeRef<'a, V, A>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
ptr: NodePtr<V>,
|
||||
allocator: &'a A,
|
||||
|
||||
extra_nodes: Vec<NodePtr<V>>,
|
||||
}
|
||||
|
||||
impl<'a, V, A> NewNodeRef<'a, V, A>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
pub(crate) fn insert_old_child(&mut self, key_byte: u8, child: &WriteLockedNodeRef<V>) {
|
||||
self.ptr.insert_child(key_byte, child.as_ptr())
|
||||
}
|
||||
|
||||
pub(crate) fn into_ptr(mut self) -> NodePtr<V> {
|
||||
let ptr = self.ptr;
|
||||
self.ptr = NodePtr::null();
|
||||
ptr
|
||||
}
|
||||
|
||||
pub(crate) fn insert_new_child(&mut self, key_byte: u8, child: NewNodeRef<'a, V, A>) {
|
||||
let child_ptr = child.into_ptr();
|
||||
self.ptr.insert_child(key_byte, child_ptr);
|
||||
self.extra_nodes.push(child_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, V, A> Drop for NewNodeRef<'a, V, A>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
/// This drop implementation deallocates the newly allocated node, if into_ptr() was not called.
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
self.ptr.deallocate(self.allocator);
|
||||
for p in self.extra_nodes.iter() {
|
||||
p.deallocate(self.allocator);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new_internal<'a, V, A>(
|
||||
prefix: &[u8],
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
Ok(NewNodeRef {
|
||||
ptr: node_ptr::new_internal(prefix, allocator)?,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn new_leaf<'a, V, A>(
|
||||
prefix: &[u8],
|
||||
value: V,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
Ok(NewNodeRef {
|
||||
ptr: node_ptr::new_leaf(prefix, value, allocator)?,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
158
libs/neonart/src/allocator.rs
Normal file
158
libs/neonart/src/allocator.rs
Normal file
@@ -0,0 +1,158 @@
|
||||
pub mod block;
|
||||
mod multislab;
|
||||
mod slab;
|
||||
pub mod r#static;
|
||||
|
||||
use std::alloc::Layout;
|
||||
use std::marker::PhantomData;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use crate::allocator::multislab::MultiSlabAllocator;
|
||||
use crate::allocator::r#static::alloc_from_slice;
|
||||
|
||||
use spin;
|
||||
|
||||
use crate::Tree;
|
||||
pub use crate::algorithm::node_ptr::{
|
||||
NodeInternal4, NodeInternal16, NodeInternal48, NodeInternal256, NodeLeaf,
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct OutOfMemoryError();
|
||||
|
||||
pub trait ArtAllocator<V: crate::Value> {
|
||||
fn alloc_tree(&self) -> *mut Tree<V>;
|
||||
|
||||
fn alloc_node_internal4(&self) -> *mut NodeInternal4<V>;
|
||||
fn alloc_node_internal16(&self) -> *mut NodeInternal16<V>;
|
||||
fn alloc_node_internal48(&self) -> *mut NodeInternal48<V>;
|
||||
fn alloc_node_internal256(&self) -> *mut NodeInternal256<V>;
|
||||
fn alloc_node_leaf(&self) -> *mut NodeLeaf<V>;
|
||||
|
||||
fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>);
|
||||
fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>);
|
||||
fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>);
|
||||
fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>);
|
||||
fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>);
|
||||
}
|
||||
|
||||
pub struct ArtMultiSlabAllocator<'t, V>
|
||||
where
|
||||
V: crate::Value,
|
||||
{
|
||||
tree_area: spin::Mutex<Option<&'t mut MaybeUninit<Tree<V>>>>,
|
||||
|
||||
pub(crate) inner: MultiSlabAllocator<'t, 5>,
|
||||
|
||||
phantom_val: PhantomData<V>,
|
||||
}
|
||||
|
||||
impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
|
||||
const LAYOUTS: [Layout; 5] = [
|
||||
Layout::new::<NodeInternal4<V>>(),
|
||||
Layout::new::<NodeInternal16<V>>(),
|
||||
Layout::new::<NodeInternal48<V>>(),
|
||||
Layout::new::<NodeInternal256<V>>(),
|
||||
Layout::new::<NodeLeaf<V>>(),
|
||||
];
|
||||
|
||||
pub fn new(area: &'t mut [MaybeUninit<u8>]) -> &'t mut ArtMultiSlabAllocator<'t, V> {
|
||||
let (allocator_area, remain) = alloc_from_slice::<ArtMultiSlabAllocator<V>>(area);
|
||||
let (tree_area, remain) = alloc_from_slice::<Tree<V>>(remain);
|
||||
|
||||
let allocator = allocator_area.write(ArtMultiSlabAllocator {
|
||||
tree_area: spin::Mutex::new(Some(tree_area)),
|
||||
inner: MultiSlabAllocator::new(remain, &Self::LAYOUTS),
|
||||
phantom_val: PhantomData,
|
||||
});
|
||||
|
||||
allocator
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, V: crate::Value> ArtAllocator<V> for ArtMultiSlabAllocator<'t, V> {
|
||||
fn alloc_tree(&self) -> *mut Tree<V> {
|
||||
let mut t = self.tree_area.lock();
|
||||
if let Some(tree_area) = t.take() {
|
||||
return tree_area.as_mut_ptr().cast();
|
||||
}
|
||||
panic!("cannot allocate more than one tree");
|
||||
}
|
||||
|
||||
fn alloc_node_internal4(&self) -> *mut NodeInternal4<V> {
|
||||
self.inner.alloc_slab(0).cast()
|
||||
}
|
||||
fn alloc_node_internal16(&self) -> *mut NodeInternal16<V> {
|
||||
self.inner.alloc_slab(1).cast()
|
||||
}
|
||||
fn alloc_node_internal48(&self) -> *mut NodeInternal48<V> {
|
||||
self.inner.alloc_slab(2).cast()
|
||||
}
|
||||
fn alloc_node_internal256(&self) -> *mut NodeInternal256<V> {
|
||||
self.inner.alloc_slab(3).cast()
|
||||
}
|
||||
fn alloc_node_leaf(&self) -> *mut NodeLeaf<V> {
|
||||
self.inner.alloc_slab(4).cast()
|
||||
}
|
||||
|
||||
fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>) {
|
||||
self.inner.dealloc_slab(0, ptr.cast())
|
||||
}
|
||||
|
||||
fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>) {
|
||||
self.inner.dealloc_slab(1, ptr.cast())
|
||||
}
|
||||
fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>) {
|
||||
self.inner.dealloc_slab(2, ptr.cast())
|
||||
}
|
||||
fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>) {
|
||||
self.inner.dealloc_slab(3, ptr.cast())
|
||||
}
|
||||
fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>) {
|
||||
self.inner.dealloc_slab(4, ptr.cast())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
|
||||
pub(crate) fn get_statistics(&self) -> ArtMultiSlabStats {
|
||||
ArtMultiSlabStats {
|
||||
num_internal4: self.inner.slab_descs[0]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_internal16: self.inner.slab_descs[1]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_internal48: self.inner.slab_descs[2]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_internal256: self.inner.slab_descs[3]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_leaf: self.inner.slab_descs[4]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
|
||||
num_blocks_internal4: self.inner.slab_descs[0].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_internal16: self.inner.slab_descs[1].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_internal48: self.inner.slab_descs[2].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_internal256: self.inner.slab_descs[3].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_leaf: self.inner.slab_descs[4].num_blocks.load(Ordering::Relaxed),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ArtMultiSlabStats {
|
||||
pub num_internal4: u64,
|
||||
pub num_internal16: u64,
|
||||
pub num_internal48: u64,
|
||||
pub num_internal256: u64,
|
||||
pub num_leaf: u64,
|
||||
|
||||
pub num_blocks_internal4: u64,
|
||||
pub num_blocks_internal16: u64,
|
||||
pub num_blocks_internal48: u64,
|
||||
pub num_blocks_internal256: u64,
|
||||
pub num_blocks_leaf: u64,
|
||||
}
|
||||
191
libs/neonart/src/allocator/block.rs
Normal file
191
libs/neonart/src/allocator/block.rs
Normal file
@@ -0,0 +1,191 @@
|
||||
//! Simple allocator of fixed-size blocks
|
||||
|
||||
use std::mem::MaybeUninit;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use spin;
|
||||
|
||||
pub const BLOCK_SIZE: usize = 16 * 1024;
|
||||
|
||||
const INVALID_BLOCK: u64 = u64::MAX;
|
||||
|
||||
pub(crate) struct BlockAllocator<'t> {
|
||||
blocks_ptr: &'t [MaybeUninit<u8>],
|
||||
num_blocks: u64,
|
||||
num_initialized: AtomicU64,
|
||||
|
||||
freelist_head: spin::Mutex<u64>,
|
||||
}
|
||||
|
||||
struct FreeListBlock {
|
||||
inner: spin::Mutex<FreeListBlockInner>,
|
||||
}
|
||||
|
||||
struct FreeListBlockInner {
|
||||
next: u64,
|
||||
|
||||
num_free_blocks: u64,
|
||||
free_blocks: [u64; 100], // FIXME: fill the rest of the block
|
||||
}
|
||||
|
||||
impl<'t> BlockAllocator<'t> {
|
||||
pub(crate) fn new(area: &'t mut [MaybeUninit<u8>]) -> Self {
|
||||
// Use all the space for the blocks
|
||||
let padding = area.as_ptr().align_offset(BLOCK_SIZE);
|
||||
let remain = &mut area[padding..];
|
||||
|
||||
let num_blocks = (remain.len() / BLOCK_SIZE) as u64;
|
||||
|
||||
BlockAllocator {
|
||||
blocks_ptr: remain,
|
||||
num_blocks,
|
||||
num_initialized: AtomicU64::new(0),
|
||||
freelist_head: spin::Mutex::new(INVALID_BLOCK),
|
||||
}
|
||||
}
|
||||
|
||||
/// safety: you must hold a lock on the pointer to this block, otherwise it might get
|
||||
/// reused for another kind of block
|
||||
fn read_freelist_block(&self, blkno: u64) -> &FreeListBlock {
|
||||
let ptr: *const FreeListBlock = self.get_block_ptr(blkno).cast();
|
||||
unsafe { ptr.as_ref().unwrap() }
|
||||
}
|
||||
|
||||
fn get_block_ptr(&self, blkno: u64) -> *mut u8 {
|
||||
assert!(blkno < self.num_blocks);
|
||||
unsafe {
|
||||
self.blocks_ptr
|
||||
.as_ptr()
|
||||
.byte_offset(blkno as isize * BLOCK_SIZE as isize)
|
||||
}
|
||||
.cast_mut()
|
||||
.cast()
|
||||
}
|
||||
|
||||
#[allow(clippy::mut_from_ref)]
|
||||
pub(crate) fn alloc_block(&self) -> &mut [MaybeUninit<u8>] {
|
||||
// FIXME: handle OOM
|
||||
let blkno = self.alloc_block_internal();
|
||||
if blkno == INVALID_BLOCK {
|
||||
panic!("out of memory");
|
||||
}
|
||||
|
||||
let ptr: *mut MaybeUninit<u8> = self.get_block_ptr(blkno).cast();
|
||||
unsafe { std::slice::from_raw_parts_mut(ptr, BLOCK_SIZE) }
|
||||
}
|
||||
|
||||
fn alloc_block_internal(&self) -> u64 {
|
||||
// check the free list.
|
||||
{
|
||||
let mut freelist_head = self.freelist_head.lock();
|
||||
if *freelist_head != INVALID_BLOCK {
|
||||
let freelist_block = self.read_freelist_block(*freelist_head);
|
||||
|
||||
// acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
|
||||
let mut g = freelist_block.inner.lock();
|
||||
|
||||
if g.num_free_blocks > 0 {
|
||||
g.num_free_blocks -= 1;
|
||||
let result = g.free_blocks[g.num_free_blocks as usize];
|
||||
return result;
|
||||
} else {
|
||||
// consume the freelist block itself
|
||||
let result = *freelist_head;
|
||||
*freelist_head = g.next;
|
||||
// This freelist block is now unlinked and can be repurposed
|
||||
drop(g);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If there are some blocks left that we've never used, pick next such block
|
||||
let mut next_uninitialized = self.num_initialized.load(Ordering::Relaxed);
|
||||
while next_uninitialized < self.num_blocks {
|
||||
match self.num_initialized.compare_exchange(
|
||||
next_uninitialized,
|
||||
next_uninitialized + 1,
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => {
|
||||
return next_uninitialized;
|
||||
}
|
||||
Err(old) => {
|
||||
next_uninitialized = old;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// out of blocks
|
||||
return INVALID_BLOCK;
|
||||
}
|
||||
|
||||
// TODO: this is currently unused. The slab allocator never releases blocks
|
||||
#[allow(dead_code)]
|
||||
pub(crate) fn release_block(&self, block_ptr: *mut u8) {
|
||||
let blockno = unsafe { block_ptr.byte_offset_from(self.blocks_ptr) / BLOCK_SIZE as isize };
|
||||
self.release_block_internal(blockno as u64);
|
||||
}
|
||||
|
||||
fn release_block_internal(&self, blockno: u64) {
|
||||
let mut freelist_head = self.freelist_head.lock();
|
||||
if *freelist_head != INVALID_BLOCK {
|
||||
let freelist_block = self.read_freelist_block(*freelist_head);
|
||||
|
||||
// acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
|
||||
let mut g = freelist_block.inner.lock();
|
||||
|
||||
let num_free_blocks = g.num_free_blocks;
|
||||
if num_free_blocks < g.free_blocks.len() as u64 {
|
||||
g.free_blocks[num_free_blocks as usize] = blockno;
|
||||
g.num_free_blocks += 1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Convert the block into a new freelist block
|
||||
let block_ptr: *mut FreeListBlock = self.get_block_ptr(blockno).cast();
|
||||
let init = FreeListBlock {
|
||||
inner: spin::Mutex::new(FreeListBlockInner {
|
||||
next: *freelist_head,
|
||||
num_free_blocks: 0,
|
||||
free_blocks: [INVALID_BLOCK; 100],
|
||||
}),
|
||||
};
|
||||
unsafe { (*block_ptr) = init };
|
||||
*freelist_head = blockno;
|
||||
}
|
||||
|
||||
// for debugging
|
||||
pub(crate) fn get_statistics(&self) -> BlockAllocatorStats {
|
||||
let mut num_free_blocks = 0;
|
||||
|
||||
let mut _prev_lock = None;
|
||||
let head_lock = self.freelist_head.lock();
|
||||
let mut next_blk = *head_lock;
|
||||
let mut _head_lock = Some(head_lock);
|
||||
while next_blk != INVALID_BLOCK {
|
||||
let freelist_block = self.read_freelist_block(next_blk);
|
||||
let lock = freelist_block.inner.lock();
|
||||
num_free_blocks += lock.num_free_blocks;
|
||||
next_blk = lock.next;
|
||||
_prev_lock = Some(lock); // hold the lock until we've read the next block
|
||||
_head_lock = None;
|
||||
}
|
||||
|
||||
BlockAllocatorStats {
|
||||
num_blocks: self.num_blocks,
|
||||
num_initialized: self.num_initialized.load(Ordering::Relaxed),
|
||||
num_free_blocks,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct BlockAllocatorStats {
|
||||
pub num_blocks: u64,
|
||||
pub num_initialized: u64,
|
||||
pub num_free_blocks: u64,
|
||||
}
|
||||
33
libs/neonart/src/allocator/multislab.rs
Normal file
33
libs/neonart/src/allocator/multislab.rs
Normal file
@@ -0,0 +1,33 @@
|
||||
use std::alloc::Layout;
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
use crate::allocator::block::BlockAllocator;
|
||||
use crate::allocator::slab::SlabDesc;
|
||||
|
||||
pub struct MultiSlabAllocator<'t, const N: usize> {
|
||||
pub(crate) block_allocator: BlockAllocator<'t>,
|
||||
|
||||
pub(crate) slab_descs: [SlabDesc; N],
|
||||
}
|
||||
|
||||
impl<'t, const N: usize> MultiSlabAllocator<'t, N> {
|
||||
pub(crate) fn new(
|
||||
area: &'t mut [MaybeUninit<u8>],
|
||||
layouts: &[Layout; N],
|
||||
) -> MultiSlabAllocator<'t, N> {
|
||||
let block_allocator = BlockAllocator::new(area);
|
||||
MultiSlabAllocator {
|
||||
block_allocator,
|
||||
|
||||
slab_descs: std::array::from_fn(|i| SlabDesc::new(&layouts[i])),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn alloc_slab(&self, slab_idx: usize) -> *mut u8 {
|
||||
self.slab_descs[slab_idx].alloc_chunk(&self.block_allocator)
|
||||
}
|
||||
|
||||
pub(crate) fn dealloc_slab(&self, slab_idx: usize, ptr: *mut u8) {
|
||||
self.slab_descs[slab_idx].dealloc_chunk(ptr, &self.block_allocator)
|
||||
}
|
||||
}
|
||||
432
libs/neonart/src/allocator/slab.rs
Normal file
432
libs/neonart/src/allocator/slab.rs
Normal file
@@ -0,0 +1,432 @@
|
||||
//! A slab allocator that carves out fixed-size chunks from larger blocks.
|
||||
//!
|
||||
//!
|
||||
|
||||
use std::alloc::Layout;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::ops::Deref;
|
||||
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
|
||||
|
||||
use spin;
|
||||
|
||||
use super::alloc_from_slice;
|
||||
use super::block::BlockAllocator;
|
||||
|
||||
use crate::allocator::block::BLOCK_SIZE;
|
||||
|
||||
pub(crate) struct SlabDesc {
|
||||
pub(crate) layout: Layout,
|
||||
|
||||
block_lists: spin::RwLock<BlockLists>,
|
||||
|
||||
pub(crate) num_blocks: AtomicU64,
|
||||
pub(crate) num_allocated: AtomicU64,
|
||||
}
|
||||
|
||||
// FIXME: Not sure if SlabDesc is really Sync or Send. It probably is when it's empty, but
|
||||
// 'block_lists' contains pointers when it's not empty. In the current use as part of the
|
||||
// the art tree, SlabDescs are only moved during initialization.
|
||||
unsafe impl Sync for SlabDesc {}
|
||||
unsafe impl Send for SlabDesc {}
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
struct BlockLists {
|
||||
full_blocks: BlockList,
|
||||
nonfull_blocks: BlockList,
|
||||
}
|
||||
|
||||
impl BlockLists {
|
||||
// Unlink a node. It must be in either one of the two lists.
|
||||
unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
|
||||
let list = unsafe {
|
||||
if (*elem).next.is_null() {
|
||||
if self.full_blocks.tail == elem {
|
||||
Some(&mut self.full_blocks)
|
||||
} else {
|
||||
Some(&mut self.nonfull_blocks)
|
||||
}
|
||||
} else if (*elem).prev.is_null() {
|
||||
if self.full_blocks.head == elem {
|
||||
Some(&mut self.full_blocks)
|
||||
} else {
|
||||
Some(&mut self.nonfull_blocks)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
unsafe { unlink_slab_block(list, elem) };
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn unlink_slab_block(mut list: Option<&mut BlockList>, elem: *mut SlabBlockHeader) {
|
||||
unsafe {
|
||||
if (*elem).next.is_null() {
|
||||
assert_eq!(list.as_ref().unwrap().tail, elem);
|
||||
list.as_mut().unwrap().tail = (*elem).prev;
|
||||
} else {
|
||||
assert_eq!((*(*elem).next).prev, elem);
|
||||
(*(*elem).next).prev = (*elem).prev;
|
||||
}
|
||||
if (*elem).prev.is_null() {
|
||||
assert_eq!(list.as_ref().unwrap().head, elem);
|
||||
list.as_mut().unwrap().head = (*elem).next;
|
||||
} else {
|
||||
assert_eq!((*(*elem).prev).next, elem);
|
||||
(*(*elem).prev).next = (*elem).next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct BlockList {
|
||||
head: *mut SlabBlockHeader,
|
||||
tail: *mut SlabBlockHeader,
|
||||
}
|
||||
|
||||
impl Default for BlockList {
|
||||
fn default() -> Self {
|
||||
BlockList {
|
||||
head: std::ptr::null_mut(),
|
||||
tail: std::ptr::null_mut(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockList {
|
||||
unsafe fn push_head(&mut self, elem: *mut SlabBlockHeader) {
|
||||
unsafe {
|
||||
if self.is_empty() {
|
||||
self.tail = elem;
|
||||
(*elem).next = std::ptr::null_mut();
|
||||
} else {
|
||||
(*elem).next = self.head;
|
||||
(*self.head).prev = elem;
|
||||
}
|
||||
(*elem).prev = std::ptr::null_mut();
|
||||
self.head = elem;
|
||||
}
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.head.is_null()
|
||||
}
|
||||
|
||||
unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
|
||||
unsafe { unlink_slab_block(Some(self), elem) }
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn dump(&self) {
|
||||
let mut next = self.head;
|
||||
|
||||
while !next.is_null() {
|
||||
let n = unsafe { next.as_ref() }.unwrap();
|
||||
eprintln!(
|
||||
" blk {:?} (free {}/{})",
|
||||
next,
|
||||
n.num_free_chunks.load(Ordering::Relaxed),
|
||||
n.num_chunks
|
||||
);
|
||||
next = n.next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SlabDesc {
|
||||
pub(crate) fn new(layout: &Layout) -> SlabDesc {
|
||||
SlabDesc {
|
||||
layout: *layout,
|
||||
block_lists: spin::RwLock::new(BlockLists::default()),
|
||||
num_allocated: AtomicU64::new(0),
|
||||
num_blocks: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct SlabBlockHeader {
|
||||
free_chunks_head: spin::Mutex<*mut FreeChunk>,
|
||||
num_free_chunks: AtomicU32,
|
||||
num_chunks: u32, // this is really a constant for a given Layout
|
||||
|
||||
// these fields are protected by the lock on the BlockLists
|
||||
prev: *mut SlabBlockHeader,
|
||||
next: *mut SlabBlockHeader,
|
||||
}
|
||||
|
||||
struct FreeChunk {
|
||||
next: *mut FreeChunk,
|
||||
}
|
||||
|
||||
enum ReadOrWriteGuard<'a, T> {
|
||||
Read(spin::RwLockReadGuard<'a, T>),
|
||||
Write(spin::RwLockWriteGuard<'a, T>),
|
||||
}
|
||||
|
||||
impl<'a, T> Deref for ReadOrWriteGuard<'a, T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &<Self as Deref>::Target {
|
||||
match self {
|
||||
ReadOrWriteGuard::Read(g) => g.deref(),
|
||||
ReadOrWriteGuard::Write(g) => g.deref(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SlabDesc {
|
||||
pub fn alloc_chunk(&self, block_allocator: &BlockAllocator) -> *mut u8 {
|
||||
// Are there any free chunks?
|
||||
let mut acquire_write = false;
|
||||
'outer: loop {
|
||||
let mut block_lists_guard = if acquire_write {
|
||||
ReadOrWriteGuard::Write(self.block_lists.write())
|
||||
} else {
|
||||
ReadOrWriteGuard::Read(self.block_lists.read())
|
||||
};
|
||||
'inner: loop {
|
||||
let block_ptr = block_lists_guard.nonfull_blocks.head;
|
||||
if block_ptr.is_null() {
|
||||
break 'outer;
|
||||
}
|
||||
unsafe {
|
||||
let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
|
||||
if !(*free_chunks_head).is_null() {
|
||||
let result = *free_chunks_head;
|
||||
(*free_chunks_head) = (*result).next;
|
||||
let _old = (*block_ptr).num_free_chunks.fetch_sub(1, Ordering::Relaxed);
|
||||
|
||||
self.num_allocated.fetch_add(1, Ordering::Relaxed);
|
||||
return result.cast();
|
||||
}
|
||||
}
|
||||
|
||||
// The block at the head of the list was full. Grab write lock and retry
|
||||
match block_lists_guard {
|
||||
ReadOrWriteGuard::Read(_) => {
|
||||
acquire_write = true;
|
||||
continue 'outer;
|
||||
}
|
||||
ReadOrWriteGuard::Write(ref mut g) => {
|
||||
// move the node to the list of full blocks
|
||||
unsafe {
|
||||
g.nonfull_blocks.unlink(block_ptr);
|
||||
g.full_blocks.push_head(block_ptr);
|
||||
};
|
||||
continue 'inner;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// no free chunks. Allocate a new block (and the chunk from that)
|
||||
let (new_block, new_chunk) = self.alloc_block_and_chunk(block_allocator);
|
||||
self.num_blocks.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// Add the block to the list in the SlabDesc
|
||||
unsafe {
|
||||
let mut block_lists_guard = self.block_lists.write();
|
||||
block_lists_guard.nonfull_blocks.push_head(new_block);
|
||||
}
|
||||
self.num_allocated.fetch_add(1, Ordering::Relaxed);
|
||||
new_chunk
|
||||
}
|
||||
|
||||
pub fn dealloc_chunk(&self, chunk_ptr: *mut u8, _block_allocator: &BlockAllocator) {
|
||||
// Find the block it belongs to. You can find the block from the address. (And knowing the
|
||||
// layout, you could calculate the chunk number too.)
|
||||
let block_ptr: *mut SlabBlockHeader = {
|
||||
let block_addr = (chunk_ptr.addr() / BLOCK_SIZE) * BLOCK_SIZE;
|
||||
chunk_ptr.with_addr(block_addr).cast()
|
||||
};
|
||||
let chunk_ptr: *mut FreeChunk = chunk_ptr.cast();
|
||||
|
||||
// Mark the chunk as free in 'freechunks' list
|
||||
let num_chunks;
|
||||
let num_free_chunks;
|
||||
unsafe {
|
||||
let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
|
||||
(*chunk_ptr).next = *free_chunks_head;
|
||||
*free_chunks_head = chunk_ptr;
|
||||
|
||||
num_free_chunks = (*block_ptr).num_free_chunks.fetch_add(1, Ordering::Relaxed) + 1;
|
||||
num_chunks = (*block_ptr).num_chunks;
|
||||
}
|
||||
|
||||
if num_free_chunks == 1 {
|
||||
// If the block was full previously, add it to the nonfull blocks list. Note that
|
||||
// we're not holding the lock anymore, so it can immediately become full again.
|
||||
// That's harmless, it will be moved back to the full list again when a call
|
||||
// to alloc_chunk() sees it.
|
||||
let mut block_lists = self.block_lists.write();
|
||||
unsafe {
|
||||
block_lists.unlink(block_ptr);
|
||||
block_lists.nonfull_blocks.push_head(block_ptr);
|
||||
};
|
||||
} else if num_free_chunks == num_chunks {
|
||||
// If the block became completely empty, move it to the free list
|
||||
// TODO
|
||||
// FIXME: we're still holding the spinlock. It's not exactly safe to return it to
|
||||
// the free blocks list, is it? Defer it as garbage to wait out concurrent updates?
|
||||
//block_allocator.release_block()
|
||||
}
|
||||
|
||||
// update stats
|
||||
self.num_allocated.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn alloc_block_and_chunk(
|
||||
&self,
|
||||
block_allocator: &BlockAllocator,
|
||||
) -> (*mut SlabBlockHeader, *mut u8) {
|
||||
// fixme: handle OOM
|
||||
let block_slice: &mut [MaybeUninit<u8>] = block_allocator.alloc_block();
|
||||
let (block_header, remain) = alloc_from_slice::<SlabBlockHeader>(block_slice);
|
||||
|
||||
let padding = remain.as_ptr().align_offset(self.layout.align());
|
||||
|
||||
let num_chunks = (remain.len() - padding) / self.layout.size();
|
||||
|
||||
let first_chunk_ptr: *mut FreeChunk = remain[padding..].as_mut_ptr().cast();
|
||||
|
||||
unsafe {
|
||||
let mut chunk_ptr = first_chunk_ptr;
|
||||
for _ in 0..num_chunks - 1 {
|
||||
let next_chunk_ptr = chunk_ptr.byte_add(self.layout.size());
|
||||
(*chunk_ptr).next = next_chunk_ptr;
|
||||
chunk_ptr = next_chunk_ptr;
|
||||
}
|
||||
(*chunk_ptr).next = std::ptr::null_mut();
|
||||
|
||||
let result_chunk = first_chunk_ptr;
|
||||
|
||||
let block_header = block_header.write(SlabBlockHeader {
|
||||
free_chunks_head: spin::Mutex::new((*first_chunk_ptr).next),
|
||||
prev: std::ptr::null_mut(),
|
||||
next: std::ptr::null_mut(),
|
||||
num_chunks: num_chunks as u32,
|
||||
num_free_chunks: AtomicU32::new(num_chunks as u32 - 1),
|
||||
});
|
||||
|
||||
(block_header, result_chunk.cast())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn dump(&self) {
|
||||
eprintln!(
|
||||
"slab dump ({} blocks, {} allocated chunks)",
|
||||
self.num_blocks.load(Ordering::Relaxed),
|
||||
self.num_allocated.load(Ordering::Relaxed)
|
||||
);
|
||||
let lists = self.block_lists.read();
|
||||
|
||||
eprintln!("nonfull blocks:");
|
||||
lists.nonfull_blocks.dump();
|
||||
eprintln!("full blocks:");
|
||||
lists.full_blocks.dump();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use rand::Rng;
|
||||
use rand_distr::Zipf;
|
||||
|
||||
struct TestObject {
|
||||
val: usize,
|
||||
_dummy: [u8; BLOCK_SIZE / 4],
|
||||
}
|
||||
|
||||
struct TestObjectSlab<'a>(SlabDesc, BlockAllocator<'a>);
|
||||
impl<'a> TestObjectSlab<'a> {
|
||||
fn new(block_allocator: BlockAllocator) -> TestObjectSlab {
|
||||
TestObjectSlab(SlabDesc::new(&Layout::new::<TestObject>()), block_allocator)
|
||||
}
|
||||
|
||||
fn alloc(&self, val: usize) -> *mut TestObject {
|
||||
let obj: *mut TestObject = self.0.alloc_chunk(&self.1).cast();
|
||||
unsafe { (*obj).val = val };
|
||||
obj
|
||||
}
|
||||
|
||||
fn dealloc(&self, obj: *mut TestObject) {
|
||||
self.0.dealloc_chunk(obj.cast(), &self.1)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_slab_alloc() {
|
||||
const MEM_SIZE: usize = 100000000;
|
||||
let mut area = Box::new_uninit_slice(MEM_SIZE);
|
||||
let block_allocator = BlockAllocator::new(&mut area);
|
||||
|
||||
let slab = TestObjectSlab::new(block_allocator);
|
||||
|
||||
let mut all: Vec<*mut TestObject> = Vec::new();
|
||||
for i in 0..11 {
|
||||
all.push(slab.alloc(i));
|
||||
}
|
||||
for i in 0..11 {
|
||||
assert!(unsafe { (*all[i]).val == i });
|
||||
}
|
||||
|
||||
let distribution = Zipf::new(10 as f64, 1.1).unwrap();
|
||||
let mut rng = rand::rng();
|
||||
for _ in 0..100000 {
|
||||
slab.0.dump();
|
||||
let idx = (rng.sample(distribution) as usize).into();
|
||||
let ptr: *mut TestObject = all[idx];
|
||||
if !ptr.is_null() {
|
||||
assert_eq!(unsafe { (*ptr).val }, idx);
|
||||
slab.dealloc(ptr);
|
||||
all[idx] = std::ptr::null_mut();
|
||||
} else {
|
||||
all[idx] = slab.alloc(idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn new_test_blk(i: u32) -> *mut SlabBlockHeader {
|
||||
Box::into_raw(Box::new(SlabBlockHeader {
|
||||
free_chunks_head: spin::Mutex::new(std::ptr::null_mut()),
|
||||
num_free_chunks: AtomicU32::new(0),
|
||||
num_chunks: i,
|
||||
prev: std::ptr::null_mut(),
|
||||
next: std::ptr::null_mut(),
|
||||
}))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_linked_list() {
|
||||
// note: these are leaked, but that's OK for tests
|
||||
let a = new_test_blk(0);
|
||||
let b = new_test_blk(1);
|
||||
|
||||
let mut list = BlockList::default();
|
||||
assert!(list.is_empty());
|
||||
|
||||
unsafe {
|
||||
list.push_head(a);
|
||||
assert!(!list.is_empty());
|
||||
list.unlink(a);
|
||||
}
|
||||
assert!(list.is_empty());
|
||||
|
||||
unsafe {
|
||||
list.push_head(b);
|
||||
list.push_head(a);
|
||||
assert_eq!(list.head, a);
|
||||
assert_eq!((*a).next, b);
|
||||
assert_eq!((*b).prev, a);
|
||||
assert_eq!(list.tail, b);
|
||||
|
||||
list.unlink(a);
|
||||
list.unlink(b);
|
||||
assert!(list.is_empty());
|
||||
}
|
||||
}
|
||||
}
|
||||
44
libs/neonart/src/allocator/static.rs
Normal file
44
libs/neonart/src/allocator/static.rs
Normal file
@@ -0,0 +1,44 @@
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
pub fn alloc_from_slice<T>(
|
||||
area: &mut [MaybeUninit<u8>],
|
||||
) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
|
||||
let layout = std::alloc::Layout::new::<T>();
|
||||
|
||||
let area_start = area.as_mut_ptr();
|
||||
|
||||
// pad to satisfy alignment requirements
|
||||
let padding = area_start.align_offset(layout.align());
|
||||
if padding + layout.size() > area.len() {
|
||||
panic!("out of memory");
|
||||
}
|
||||
let area = &mut area[padding..];
|
||||
let (result_area, remain) = area.split_at_mut(layout.size());
|
||||
|
||||
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
|
||||
let result = unsafe { result_ptr.as_mut().unwrap() };
|
||||
|
||||
(result, remain)
|
||||
}
|
||||
|
||||
pub fn alloc_array_from_slice<T>(
|
||||
area: &mut [MaybeUninit<u8>],
|
||||
len: usize,
|
||||
) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
|
||||
let layout = std::alloc::Layout::new::<T>();
|
||||
|
||||
let area_start = area.as_mut_ptr();
|
||||
|
||||
// pad to satisfy alignment requirements
|
||||
let padding = area_start.align_offset(layout.align());
|
||||
if padding + layout.size() * len > area.len() {
|
||||
panic!("out of memory");
|
||||
}
|
||||
let area = &mut area[padding..];
|
||||
let (result_area, remain) = area.split_at_mut(layout.size() * len);
|
||||
|
||||
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
|
||||
let result = unsafe { std::slice::from_raw_parts_mut(result_ptr.as_mut().unwrap(), len) };
|
||||
|
||||
(result, remain)
|
||||
}
|
||||
147
libs/neonart/src/epoch.rs
Normal file
147
libs/neonart/src/epoch.rs
Normal file
@@ -0,0 +1,147 @@
|
||||
//! This is similar to crossbeam_epoch crate, but works in shared memory
|
||||
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
|
||||
|
||||
use crossbeam_utils::CachePadded;
|
||||
use spin;
|
||||
|
||||
const NUM_SLOTS: usize = 1000;
|
||||
|
||||
/// This is the struct that is stored in shmem
|
||||
///
|
||||
/// bit 0: is it pinned or not?
|
||||
/// rest of the bits are the epoch counter.
|
||||
pub struct EpochShared {
|
||||
global_epoch: AtomicU64,
|
||||
participants: [CachePadded<AtomicU64>; NUM_SLOTS],
|
||||
|
||||
broadcast_lock: spin::Mutex<()>,
|
||||
}
|
||||
|
||||
impl EpochShared {
|
||||
pub fn new() -> EpochShared {
|
||||
EpochShared {
|
||||
global_epoch: AtomicU64::new(2),
|
||||
participants: [const { CachePadded::new(AtomicU64::new(2)) }; NUM_SLOTS],
|
||||
broadcast_lock: spin::Mutex::new(()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn register(&self) -> LocalHandle {
|
||||
LocalHandle {
|
||||
global: self,
|
||||
last_slot: AtomicUsize::new(0), // todo: choose more intelligently
|
||||
}
|
||||
}
|
||||
|
||||
fn release_pin(&self, slot: usize, _epoch: u64) {
|
||||
let global_epoch = self.global_epoch.load(Ordering::Relaxed);
|
||||
self.participants[slot].store(global_epoch, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn pin_internal(&self, slot_hint: usize) -> (usize, u64) {
|
||||
// pick a slot
|
||||
let mut slot = slot_hint;
|
||||
let epoch = loop {
|
||||
let old = self.participants[slot].fetch_or(1, Ordering::Relaxed);
|
||||
if old & 1 == 0 {
|
||||
// Got this slot
|
||||
break old;
|
||||
}
|
||||
|
||||
// the slot was busy by another thread / process. try a different slot
|
||||
slot += 1;
|
||||
if slot == NUM_SLOTS {
|
||||
slot = 0;
|
||||
}
|
||||
continue;
|
||||
};
|
||||
(slot, epoch)
|
||||
}
|
||||
|
||||
pub(crate) fn advance(&self) -> u64 {
|
||||
// Advance the global epoch
|
||||
let old_epoch = self.global_epoch.fetch_add(2, Ordering::Relaxed);
|
||||
let new_epoch = old_epoch + 2;
|
||||
|
||||
// Anyone that release their pin after this will update their slot.
|
||||
new_epoch
|
||||
}
|
||||
|
||||
pub(crate) fn broadcast(&self) {
|
||||
let Some(_guard) = self.broadcast_lock.try_lock() else {
|
||||
return;
|
||||
};
|
||||
|
||||
let epoch = self.global_epoch.load(Ordering::Relaxed);
|
||||
let old_epoch = epoch.wrapping_sub(2);
|
||||
|
||||
// Update all free slots.
|
||||
for i in 0..NUM_SLOTS {
|
||||
// TODO: check result, as a sanity check. It should either be the old epoch, or pinned
|
||||
let _ = self.participants[i].compare_exchange(
|
||||
old_epoch,
|
||||
epoch,
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
}
|
||||
|
||||
// FIXME: memory fence here, since we used Relaxed?
|
||||
}
|
||||
|
||||
pub(crate) fn get_oldest(&self) -> u64 {
|
||||
// Read all slots.
|
||||
let now = self.global_epoch.load(Ordering::Relaxed);
|
||||
let mut oldest = now;
|
||||
for i in 0..NUM_SLOTS {
|
||||
let this_epoch = self.participants[i].load(Ordering::Relaxed);
|
||||
let delta = now.wrapping_sub(this_epoch);
|
||||
if delta > u64::MAX / 2 {
|
||||
// this is very recent
|
||||
} else {
|
||||
if delta > now.wrapping_sub(oldest) {
|
||||
oldest = this_epoch;
|
||||
}
|
||||
}
|
||||
}
|
||||
oldest
|
||||
}
|
||||
|
||||
pub(crate) fn get_current(&self) -> u64 {
|
||||
self.global_epoch.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct EpochPin<'e> {
|
||||
slot: usize,
|
||||
pub(crate) epoch: u64,
|
||||
|
||||
handle: &'e LocalHandle<'e>,
|
||||
}
|
||||
|
||||
impl<'e> Drop for EpochPin<'e> {
|
||||
fn drop(&mut self) {
|
||||
self.handle.global.release_pin(self.slot, self.epoch);
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LocalHandle<'g> {
|
||||
global: &'g EpochShared,
|
||||
|
||||
last_slot: AtomicUsize,
|
||||
}
|
||||
|
||||
impl<'g> LocalHandle<'g> {
|
||||
pub fn pin(&self) -> EpochPin {
|
||||
let (slot, epoch) = self
|
||||
.global
|
||||
.pin_internal(self.last_slot.load(Ordering::Relaxed));
|
||||
self.last_slot.store(slot, Ordering::Relaxed);
|
||||
EpochPin {
|
||||
handle: self,
|
||||
epoch,
|
||||
slot,
|
||||
}
|
||||
}
|
||||
}
|
||||
587
libs/neonart/src/lib.rs
Normal file
587
libs/neonart/src/lib.rs
Normal file
@@ -0,0 +1,587 @@
|
||||
//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
|
||||
//!
|
||||
//! The data structure is described in these two papers:
|
||||
//!
|
||||
//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
|
||||
//! The adaptive radix tree: ARTful indexing for main-memory databases.
|
||||
//! Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
|
||||
//! https://db.in.tum.de/~leis/papers/ART.pdf
|
||||
//!
|
||||
//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
|
||||
//! The ART of practical synchronization.
|
||||
//! 1-8. 10.1145/2933349.2933352.
|
||||
//! https://db.in.tum.de/~leis/papers/artsync.pdf
|
||||
//!
|
||||
//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
|
||||
//! use.
|
||||
//!
|
||||
//! The papers mention a few different variants. We have made the following choices in this
|
||||
//! implementation:
|
||||
//!
|
||||
//! - All keys have the same length
|
||||
//!
|
||||
//! - Single-value leaves.
|
||||
//!
|
||||
//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
|
||||
//! variable length "prefix", which stores the keys of all the one-way nodes which have been
|
||||
//! removed. However, similar to the "hybrid" approach described in the paper, each node only has
|
||||
//! space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
|
||||
//! create create one-way nodes to store them. (There was no particular reason for this choice,
|
||||
//! the "hybrid" approach described in the paper might be better.)
|
||||
//!
|
||||
//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
|
||||
//! ROWEX, which generally performs better when there is contention, but that is not important
|
||||
//! for use and Optimisic Lock Coupling is simpler to implement.
|
||||
//!
|
||||
//! ## Requirements
|
||||
//!
|
||||
//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
|
||||
//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
|
||||
//! requirements, which is why we had to write our own. Namely:
|
||||
//!
|
||||
//! - The data structure has to live in fixed-sized shared memory segment. That rules out any
|
||||
//! built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust
|
||||
//! feature, which still nightly-only experimental as of this writing).
|
||||
//!
|
||||
//! - The data structure is accessed from multiple processes. Only one process updates the data
|
||||
//! structure, but other processes perform reads. That rules out using built-in Rust locking
|
||||
//! primitives like Mutex and RwLock, and most crates too.
|
||||
//!
|
||||
//! - Within the one process with write-access, multiple threads can perform updates concurrently.
|
||||
//! That rules out using PostgreSQL LWLocks for the locking.
|
||||
//!
|
||||
//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
|
||||
//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
|
||||
//!
|
||||
//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
|
||||
//! locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
|
||||
//! read / write the same page at the same time. (Prefetching can conflict with actual reads,
|
||||
//! however.)
|
||||
//!
|
||||
//! - The keys in the integrated cache are 17 bytes long.
|
||||
//!
|
||||
//! ## Usage
|
||||
//!
|
||||
//! Because this is designed to be used as a Postgres shared memory data structure, initialization
|
||||
//! happens in three stages:
|
||||
//!
|
||||
//! 0. A fixed area of shared memory is allocated at postmaster startup.
|
||||
//!
|
||||
//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
|
||||
//! other process or thread is running. It returns a TreeInitStruct, which is inherited by all
|
||||
//! the processes through fork().
|
||||
//!
|
||||
//! 2. One process may have write-access to the struct, by calling
|
||||
//! [TreeInitStruct::attach_writer]. (That process is the communicator process.)
|
||||
//!
|
||||
//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
|
||||
//!
|
||||
//! "Write access" means that you can insert / update / delete values in the tree.
|
||||
//!
|
||||
//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
|
||||
//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
|
||||
//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
|
||||
//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
|
||||
//! problem, the version check could be passed up to the caller, so that the caller could detect the
|
||||
//! lost updates and retry the operation.
|
||||
//!
|
||||
//! ## Implementation
|
||||
//!
|
||||
//! node_ptr: Provides low-level implementations of the four different node types (eight actually,
|
||||
//! since there is an Internal and Leaf variant of each)
|
||||
//!
|
||||
//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
|
||||
//! node.
|
||||
//!
|
||||
//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
|
||||
//! abstractions on top.
|
||||
//!
|
||||
//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
|
||||
//!
|
||||
//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
|
||||
//! own abstraction for that because we need the data structure to live in a pre-allocated shared
|
||||
//! memory segment).
|
||||
//!
|
||||
//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
|
||||
//! immediately deallocated, but stays around for as long as concurrent readers might still have
|
||||
//! pointers to them. This is enforced by an epoch system. This is similar to
|
||||
//! e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
|
||||
//! communicating over the shared memory segment.
|
||||
//!
|
||||
//! ## See also
|
||||
//!
|
||||
//! There are some existing Rust ART implementations out there, but none of them filled all
|
||||
//! the requirements:
|
||||
//!
|
||||
//! - https://github.com/XiangpengHao/congee
|
||||
//! - https://github.com/declanvk/blart
|
||||
//!
|
||||
//! ## TODO
|
||||
//!
|
||||
//! - Removing values has not been implemented
|
||||
|
||||
mod algorithm;
|
||||
pub mod allocator;
|
||||
mod epoch;
|
||||
|
||||
use algorithm::RootPtr;
|
||||
use algorithm::node_ptr::NodePtr;
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
|
||||
use crate::epoch::EpochPin;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use allocator::ArtAllocator;
|
||||
pub use allocator::ArtMultiSlabAllocator;
|
||||
pub use allocator::OutOfMemoryError;
|
||||
|
||||
/// Fixed-length key type.
|
||||
///
|
||||
pub trait Key: Debug {
|
||||
const KEY_LEN: usize;
|
||||
|
||||
fn as_bytes(&self) -> &[u8];
|
||||
}
|
||||
|
||||
/// Values stored in the tree
|
||||
///
|
||||
/// Values need to be Cloneable, because when a node "grows", the value is copied to a new node and
|
||||
/// the old sticks around until all readers that might see the old value are gone.
|
||||
// fixme obsolete, no longer needs Clone
|
||||
pub trait Value {}
|
||||
|
||||
const MAX_GARBAGE: usize = 1024;
|
||||
|
||||
/// The root of the tree, plus other tree-wide data. This is stored in the shared memory.
|
||||
pub struct Tree<V: Value> {
|
||||
/// For simplicity, so that we never need to grow or shrink the root, the root node is always an
|
||||
/// Internal256 node. Also, it never has a prefix (that's actually a bit wasteful, incurring one
|
||||
/// indirection to every lookup)
|
||||
root: RootPtr<V>,
|
||||
|
||||
writer_attached: AtomicBool,
|
||||
|
||||
epoch: epoch::EpochShared,
|
||||
}
|
||||
|
||||
unsafe impl<V: Value + Sync> Sync for Tree<V> {}
|
||||
unsafe impl<V: Value + Send> Send for Tree<V> {}
|
||||
|
||||
struct GarbageQueue<V>(VecDeque<(NodePtr<V>, u64)>);
|
||||
|
||||
unsafe impl<V: Value + Sync> Sync for GarbageQueue<V> {}
|
||||
unsafe impl<V: Value + Send> Send for GarbageQueue<V> {}
|
||||
|
||||
impl<V> GarbageQueue<V> {
|
||||
fn new() -> GarbageQueue<V> {
|
||||
GarbageQueue(VecDeque::with_capacity(MAX_GARBAGE))
|
||||
}
|
||||
|
||||
fn remember_obsolete_node(&mut self, ptr: NodePtr<V>, epoch: u64) {
|
||||
self.0.push_front((ptr, epoch));
|
||||
}
|
||||
|
||||
fn next_obsolete(&mut self, cutoff_epoch: u64) -> Option<NodePtr<V>> {
|
||||
if let Some(back) = self.0.back() {
|
||||
if back.1 < cutoff_epoch {
|
||||
return Some(self.0.pop_back().unwrap().0);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Struct created at postmaster startup
|
||||
pub struct TreeInitStruct<'t, K: Key, V: Value, A: ArtAllocator<V>> {
|
||||
tree: &'t Tree<V>,
|
||||
|
||||
allocator: &'t A,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
/// The worker process has a reference to this. The write operations are only safe
|
||||
/// from the worker process
|
||||
pub struct TreeWriteAccess<'t, K: Key, V: Value, A: ArtAllocator<V>>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'t Tree<V>,
|
||||
|
||||
pub allocator: &'t A,
|
||||
|
||||
epoch_handle: epoch::LocalHandle<'t>,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
|
||||
/// Obsolete nodes that cannot be recycled until their epoch expires.
|
||||
garbage: spin::Mutex<GarbageQueue<V>>,
|
||||
}
|
||||
|
||||
/// The backends have a reference to this. It cannot be used to modify the tree
|
||||
pub struct TreeReadAccess<'t, K: Key, V: Value>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'t Tree<V>,
|
||||
|
||||
epoch_handle: epoch::LocalHandle<'t>,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
impl<'a, 't: 'a, K: Key, V: Value, A: ArtAllocator<V>> TreeInitStruct<'t, K, V, A> {
|
||||
pub fn new(allocator: &'t A) -> TreeInitStruct<'t, K, V, A> {
|
||||
let tree_ptr = allocator.alloc_tree();
|
||||
let tree_ptr = NonNull::new(tree_ptr).expect("out of memory");
|
||||
let init = Tree {
|
||||
root: algorithm::new_root(allocator).expect("out of memory"),
|
||||
writer_attached: AtomicBool::new(false),
|
||||
epoch: epoch::EpochShared::new(),
|
||||
};
|
||||
unsafe { tree_ptr.write(init) };
|
||||
|
||||
TreeInitStruct {
|
||||
tree: unsafe { tree_ptr.as_ref() },
|
||||
allocator,
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V, A> {
|
||||
let previously_attached = self.tree.writer_attached.swap(true, Ordering::Relaxed);
|
||||
if previously_attached {
|
||||
panic!("writer already attached");
|
||||
}
|
||||
TreeWriteAccess {
|
||||
tree: self.tree,
|
||||
allocator: self.allocator,
|
||||
phantom_key: PhantomData,
|
||||
epoch_handle: self.tree.epoch.register(),
|
||||
garbage: spin::Mutex::new(GarbageQueue::new()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
|
||||
TreeReadAccess {
|
||||
tree: self.tree,
|
||||
phantom_key: PhantomData,
|
||||
epoch_handle: self.tree.epoch.register(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteAccess<'t, K, V, A> {
|
||||
pub fn start_write<'g>(&'t self) -> TreeWriteGuard<'g, K, V, A>
|
||||
where
|
||||
't: 'g,
|
||||
{
|
||||
TreeWriteGuard {
|
||||
tree_writer: self,
|
||||
epoch_pin: self.epoch_handle.pin(),
|
||||
phantom_key: PhantomData,
|
||||
created_garbage: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
|
||||
TreeReadGuard {
|
||||
tree: &self.tree,
|
||||
epoch_pin: self.epoch_handle.pin(),
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, K: Key, V: Value> TreeReadAccess<'t, K, V> {
|
||||
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
|
||||
TreeReadGuard {
|
||||
tree: &self.tree,
|
||||
epoch_pin: self.epoch_handle.pin(),
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeReadGuard<'e, K, V>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'e Tree<V>,
|
||||
|
||||
epoch_pin: EpochPin<'e>,
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
impl<'e, K: Key, V: Value> TreeReadGuard<'e, K, V> {
|
||||
pub fn get(&'e self, key: &K) -> Option<&'e V> {
|
||||
algorithm::search(key, self.tree.root, &self.epoch_pin)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeWriteGuard<'e, K, V, A>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
tree_writer: &'e TreeWriteAccess<'e, K, V, A>,
|
||||
|
||||
epoch_pin: EpochPin<'e>,
|
||||
phantom_key: PhantomData<K>,
|
||||
|
||||
created_garbage: bool,
|
||||
}
|
||||
|
||||
pub enum UpdateAction<V> {
|
||||
Nothing,
|
||||
Insert(V),
|
||||
Remove,
|
||||
}
|
||||
|
||||
impl<'e, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
|
||||
/// Get a value
|
||||
pub fn get(&'e mut self, key: &K) -> Option<&'e V> {
|
||||
algorithm::search(key, self.tree_writer.tree.root, &self.epoch_pin)
|
||||
}
|
||||
|
||||
/// Insert a value
|
||||
pub fn insert(self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
|
||||
let mut success = None;
|
||||
|
||||
self.update_with_fn(key, |existing| {
|
||||
if let Some(_) = existing {
|
||||
success = Some(false);
|
||||
UpdateAction::Nothing
|
||||
} else {
|
||||
success = Some(true);
|
||||
UpdateAction::Insert(value)
|
||||
}
|
||||
})?;
|
||||
Ok(success.expect("value_fn not called"))
|
||||
}
|
||||
|
||||
/// Remove value. Returns true if it existed
|
||||
pub fn remove(self, key: &K) -> bool {
|
||||
let mut result = false;
|
||||
// FIXME: It's not clear if OOM is expected while removing. It seems
|
||||
// not nice, but shrinking a node can OOM. Then again, we could opt
|
||||
// to not shrink a node if we cannot allocate, to live a little longer.
|
||||
self.update_with_fn(key, |existing| match existing {
|
||||
Some(_) => {
|
||||
result = true;
|
||||
UpdateAction::Remove
|
||||
}
|
||||
None => UpdateAction::Nothing,
|
||||
})
|
||||
.expect("out of memory while removing");
|
||||
result
|
||||
}
|
||||
|
||||
/// Try to remove value and return the old value.
|
||||
pub fn remove_and_return(self, key: &K) -> Option<V>
|
||||
where
|
||||
V: Clone,
|
||||
{
|
||||
let mut old = None;
|
||||
self.update_with_fn(key, |existing| {
|
||||
old = existing.cloned();
|
||||
UpdateAction::Remove
|
||||
})
|
||||
.expect("out of memory while removing");
|
||||
old
|
||||
}
|
||||
|
||||
/// Update key using the given function. All the other modifying operations are based on this.
|
||||
///
|
||||
/// The function is passed a reference to the existing value, if any. If the function
|
||||
/// returns None, the value is removed from the tree (or if there was no existing value,
|
||||
/// does nothing). If the function returns Some, the existing value is replaced, of if there
|
||||
/// was no existing value, it is inserted. FIXME: update comment
|
||||
pub fn update_with_fn<F>(mut self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
algorithm::update_fn(key, value_fn, self.tree_writer.tree.root, &mut self)?;
|
||||
|
||||
if self.created_garbage {
|
||||
let _ = self.collect_garbage();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn remember_obsolete_node(&mut self, ptr: NodePtr<V>) {
|
||||
self.tree_writer
|
||||
.garbage
|
||||
.lock()
|
||||
.remember_obsolete_node(ptr, self.epoch_pin.epoch);
|
||||
self.created_garbage = true;
|
||||
}
|
||||
|
||||
// returns number of nodes recycled
|
||||
fn collect_garbage(&self) -> usize {
|
||||
self.tree_writer.tree.epoch.advance();
|
||||
self.tree_writer.tree.epoch.broadcast();
|
||||
|
||||
let cutoff_epoch = self.tree_writer.tree.epoch.get_oldest();
|
||||
|
||||
let mut result = 0;
|
||||
let mut garbage_queue = self.tree_writer.garbage.lock();
|
||||
while let Some(ptr) = garbage_queue.next_obsolete(cutoff_epoch) {
|
||||
ptr.deallocate(self.tree_writer.allocator);
|
||||
result += 1;
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeIterator<K>
|
||||
where
|
||||
K: Key + for<'a> From<&'a [u8]>,
|
||||
{
|
||||
done: bool,
|
||||
pub next_key: Vec<u8>,
|
||||
max_key: Option<Vec<u8>>,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
impl<K> TreeIterator<K>
|
||||
where
|
||||
K: Key + for<'a> From<&'a [u8]>,
|
||||
{
|
||||
pub fn new_wrapping() -> TreeIterator<K> {
|
||||
let mut next_key = Vec::new();
|
||||
next_key.resize(K::KEY_LEN, 0);
|
||||
TreeIterator {
|
||||
done: false,
|
||||
next_key,
|
||||
max_key: None,
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(range: &std::ops::Range<K>) -> TreeIterator<K> {
|
||||
let result = TreeIterator {
|
||||
done: false,
|
||||
next_key: Vec::from(range.start.as_bytes()),
|
||||
max_key: Some(Vec::from(range.end.as_bytes())),
|
||||
phantom_key: PhantomData,
|
||||
};
|
||||
assert_eq!(result.next_key.len(), K::KEY_LEN);
|
||||
assert_eq!(result.max_key.as_ref().unwrap().len(), K::KEY_LEN);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
pub fn next<'g, V>(&mut self, read_guard: &'g TreeReadGuard<'g, K, V>) -> Option<(K, &'g V)>
|
||||
where
|
||||
V: Value,
|
||||
{
|
||||
if self.done {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut wrapped_around = false;
|
||||
loop {
|
||||
assert_eq!(self.next_key.len(), K::KEY_LEN);
|
||||
if let Some((k, v)) = algorithm::iter_next(
|
||||
&mut self.next_key,
|
||||
read_guard.tree.root,
|
||||
&read_guard.epoch_pin,
|
||||
) {
|
||||
assert_eq!(k.len(), K::KEY_LEN);
|
||||
assert_eq!(self.next_key.len(), K::KEY_LEN);
|
||||
|
||||
// Check if we reached the end of the range
|
||||
if let Some(max_key) = &self.max_key {
|
||||
if k.as_slice() >= max_key.as_slice() {
|
||||
self.done = true;
|
||||
break None;
|
||||
}
|
||||
}
|
||||
|
||||
// increment the key
|
||||
self.next_key = k.clone();
|
||||
increment_key(self.next_key.as_mut_slice());
|
||||
let k = k.as_slice().into();
|
||||
|
||||
break Some((k, v));
|
||||
} else {
|
||||
if self.max_key.is_some() {
|
||||
self.done = true;
|
||||
} else {
|
||||
// Start from beginning
|
||||
if !wrapped_around {
|
||||
for i in 0..K::KEY_LEN {
|
||||
self.next_key[i] = 0;
|
||||
}
|
||||
wrapped_around = true;
|
||||
continue;
|
||||
} else {
|
||||
// The tree is completely empty
|
||||
// FIXME: perhaps we should remember the starting point instead.
|
||||
// Currently this will scan some ranges twice.
|
||||
break None;
|
||||
}
|
||||
}
|
||||
break None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn increment_key(key: &mut [u8]) -> bool {
|
||||
for i in (0..key.len()).rev() {
|
||||
let (byte, overflow) = key[i].overflowing_add(1);
|
||||
key[i] = byte;
|
||||
if !overflow {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
// Debugging functions
|
||||
impl<'e, K: Key, V: Value + Debug, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
|
||||
pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
|
||||
algorithm::dump_tree(self.tree_writer.tree.root, &self.epoch_pin, dst)
|
||||
}
|
||||
}
|
||||
impl<'e, K: Key, V: Value + Debug> TreeReadGuard<'e, K, V> {
|
||||
pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
|
||||
algorithm::dump_tree(self.tree.root, &self.epoch_pin, dst)
|
||||
}
|
||||
}
|
||||
impl<'e, K: Key, V: Value> TreeWriteAccess<'e, K, V, ArtMultiSlabAllocator<'e, V>> {
|
||||
pub fn get_statistics(&self) -> ArtTreeStatistics {
|
||||
self.allocator.get_statistics();
|
||||
ArtTreeStatistics {
|
||||
blocks: self.allocator.inner.block_allocator.get_statistics(),
|
||||
slabs: self.allocator.get_statistics(),
|
||||
epoch: self.tree.epoch.get_current(),
|
||||
oldest_epoch: self.tree.epoch.get_oldest(),
|
||||
num_garbage: self.garbage.lock().0.len() as u64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ArtTreeStatistics {
|
||||
pub blocks: allocator::block::BlockAllocatorStats,
|
||||
pub slabs: allocator::ArtMultiSlabStats,
|
||||
|
||||
pub epoch: u64,
|
||||
pub oldest_epoch: u64,
|
||||
pub num_garbage: u64,
|
||||
}
|
||||
243
libs/neonart/src/tests.rs
Normal file
243
libs/neonart/src/tests.rs
Normal file
@@ -0,0 +1,243 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use crate::ArtAllocator;
|
||||
use crate::ArtMultiSlabAllocator;
|
||||
use crate::TreeInitStruct;
|
||||
use crate::TreeIterator;
|
||||
use crate::TreeWriteAccess;
|
||||
use crate::UpdateAction;
|
||||
|
||||
use crate::{Key, Value};
|
||||
|
||||
use rand::Rng;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand_distr::Zipf;
|
||||
|
||||
const TEST_KEY_LEN: usize = 16;
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
struct TestKey([u8; TEST_KEY_LEN]);
|
||||
|
||||
impl TestKey {
|
||||
const MIN: TestKey = TestKey([0; TEST_KEY_LEN]);
|
||||
const MAX: TestKey = TestKey([u8::MAX; TEST_KEY_LEN]);
|
||||
}
|
||||
|
||||
impl Key for TestKey {
|
||||
const KEY_LEN: usize = TEST_KEY_LEN;
|
||||
fn as_bytes(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&TestKey> for u128 {
|
||||
fn from(val: &TestKey) -> u128 {
|
||||
u128::from_be_bytes(val.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u128> for TestKey {
|
||||
fn from(val: u128) -> TestKey {
|
||||
TestKey(val.to_be_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u8]> for TestKey {
|
||||
fn from(bytes: &'a [u8]) -> TestKey {
|
||||
TestKey(bytes.try_into().unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
impl Value for usize {}
|
||||
|
||||
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
|
||||
const MEM_SIZE: usize = 10000000;
|
||||
let mut area = Box::new_uninit_slice(MEM_SIZE);
|
||||
|
||||
let allocator = ArtMultiSlabAllocator::new(&mut area);
|
||||
|
||||
let init_struct = TreeInitStruct::<TestKey, usize, _>::new(allocator);
|
||||
let tree_writer = init_struct.attach_writer();
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let w = tree_writer.start_write();
|
||||
let res = w.insert(&(*k).into(), idx);
|
||||
assert!(res.is_ok());
|
||||
}
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let r = tree_writer.start_read();
|
||||
let value = r.get(&(*k).into());
|
||||
assert_eq!(value, Some(idx).as_ref());
|
||||
}
|
||||
|
||||
eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dense() {
|
||||
// This exercises splitting a node with prefix
|
||||
let keys: &[u128] = &[0, 1, 2, 3, 256];
|
||||
test_inserts(keys);
|
||||
|
||||
// Dense keys
|
||||
let mut keys: Vec<u128> = (0..10000).collect();
|
||||
test_inserts(&keys);
|
||||
|
||||
// Do the same in random orders
|
||||
for _ in 1..10 {
|
||||
keys.shuffle(&mut rand::rng());
|
||||
test_inserts(&keys);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sparse() {
|
||||
// sparse keys
|
||||
let mut keys: Vec<TestKey> = Vec::new();
|
||||
let mut used_keys = HashSet::new();
|
||||
for _ in 0..10000 {
|
||||
loop {
|
||||
let key = rand::random::<u128>();
|
||||
if used_keys.get(&key).is_some() {
|
||||
continue;
|
||||
}
|
||||
used_keys.insert(key);
|
||||
keys.push(key.into());
|
||||
break;
|
||||
}
|
||||
}
|
||||
test_inserts(&keys);
|
||||
}
|
||||
|
||||
struct TestValue(AtomicUsize);
|
||||
|
||||
impl TestValue {
|
||||
fn new(val: usize) -> TestValue {
|
||||
TestValue(AtomicUsize::new(val))
|
||||
}
|
||||
|
||||
fn load(&self) -> usize {
|
||||
self.0.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
impl Value for TestValue {}
|
||||
|
||||
impl Clone for TestValue {
|
||||
fn clone(&self) -> TestValue {
|
||||
TestValue::new(self.load())
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for TestValue {
|
||||
fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
write!(fmt, "{:?}", self.load())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct TestOp(TestKey, Option<usize>);
|
||||
|
||||
fn apply_op<A: ArtAllocator<TestValue>>(
|
||||
op: &TestOp,
|
||||
tree: &TreeWriteAccess<TestKey, TestValue, A>,
|
||||
shadow: &mut BTreeMap<TestKey, usize>,
|
||||
) {
|
||||
eprintln!("applying op: {op:?}");
|
||||
|
||||
// apply the change to the shadow tree first
|
||||
let shadow_existing = if let Some(v) = op.1 {
|
||||
shadow.insert(op.0, v)
|
||||
} else {
|
||||
shadow.remove(&op.0)
|
||||
};
|
||||
|
||||
// apply to Art tree
|
||||
let w = tree.start_write();
|
||||
w.update_with_fn(&op.0, |existing| {
|
||||
assert_eq!(existing.map(TestValue::load), shadow_existing);
|
||||
|
||||
match (existing, op.1) {
|
||||
(None, None) => UpdateAction::Nothing,
|
||||
(None, Some(new_val)) => UpdateAction::Insert(TestValue::new(new_val)),
|
||||
(Some(_old_val), None) => UpdateAction::Remove,
|
||||
(Some(old_val), Some(new_val)) => {
|
||||
old_val.0.store(new_val, Ordering::Relaxed);
|
||||
UpdateAction::Nothing
|
||||
}
|
||||
}
|
||||
})
|
||||
.expect("out of memory");
|
||||
}
|
||||
|
||||
fn test_iter<A: ArtAllocator<TestValue>>(
|
||||
tree: &TreeWriteAccess<TestKey, TestValue, A>,
|
||||
shadow: &BTreeMap<TestKey, usize>,
|
||||
) {
|
||||
let mut shadow_iter = shadow.iter();
|
||||
let mut iter = TreeIterator::new(&(TestKey::MIN..TestKey::MAX));
|
||||
|
||||
loop {
|
||||
let shadow_item = shadow_iter.next().map(|(k, v)| (k.clone(), v.clone()));
|
||||
let r = tree.start_read();
|
||||
let item = iter.next(&r);
|
||||
|
||||
if shadow_item != item.map(|(k, v)| (k, v.load())) {
|
||||
eprintln!(
|
||||
"FAIL: iterator returned {:?}, expected {:?}",
|
||||
item, shadow_item
|
||||
);
|
||||
tree.start_read().dump(&mut std::io::stderr());
|
||||
|
||||
eprintln!("SHADOW:");
|
||||
let mut si = shadow.iter();
|
||||
while let Some(si) = si.next() {
|
||||
eprintln!("key: {:?}, val: {}", si.0, si.1);
|
||||
}
|
||||
panic!(
|
||||
"FAIL: iterator returned {:?}, expected {:?}",
|
||||
item, shadow_item
|
||||
);
|
||||
}
|
||||
if item.is_none() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn random_ops() {
|
||||
const MEM_SIZE: usize = 10000000;
|
||||
let mut area = Box::new_uninit_slice(MEM_SIZE);
|
||||
|
||||
let allocator = ArtMultiSlabAllocator::new(&mut area);
|
||||
|
||||
let init_struct = TreeInitStruct::<TestKey, TestValue, _>::new(allocator);
|
||||
let tree_writer = init_struct.attach_writer();
|
||||
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
|
||||
let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
|
||||
let mut rng = rand::rng();
|
||||
for i in 0..100000 {
|
||||
let mut key: TestKey = (rng.sample(distribution) as u128).into();
|
||||
|
||||
if rng.random_bool(0.10) {
|
||||
key = TestKey::from(u128::from(&key) | 0xffffffff);
|
||||
}
|
||||
|
||||
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
|
||||
|
||||
apply_op(&op, &tree_writer, &mut shadow);
|
||||
|
||||
if i % 1000 == 0 {
|
||||
eprintln!("{i} ops processed");
|
||||
eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
test_iter(&tree_writer, &shadow);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -17,7 +17,7 @@ anyhow.workspace = true
|
||||
bytes.workspace = true
|
||||
byteorder.workspace = true
|
||||
utils.workspace = true
|
||||
postgres_ffi_types.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
enum-map.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
|
||||
@@ -12,7 +12,6 @@ pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LI
|
||||
pub const DEFAULT_GRPC_LISTEN_PORT: u16 = 51051; // storage-broker already uses 50051
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
use std::num::{NonZeroU64, NonZeroUsize};
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
@@ -25,9 +24,10 @@ use utils::logging::LogFormat;
|
||||
use crate::models::{ImageCompressionAlgorithm, LsnLease};
|
||||
|
||||
// Certain metadata (e.g. externally-addressable name, AZ) is delivered
|
||||
// as a separate structure. This information is not needed by the pageserver
|
||||
// as a separate structure. This information is not neeed by the pageserver
|
||||
// itself, it is only used for registering the pageserver with the control
|
||||
// plane and/or storage controller.
|
||||
//
|
||||
#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
|
||||
pub struct NodeMetadata {
|
||||
#[serde(rename = "host")]
|
||||
@@ -46,23 +46,6 @@ pub struct NodeMetadata {
|
||||
pub other: HashMap<String, serde_json::Value>,
|
||||
}
|
||||
|
||||
impl Display for NodeMetadata {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"postgresql://{}:{} ",
|
||||
self.postgres_host, self.postgres_port
|
||||
)?;
|
||||
if let Some(grpc_host) = &self.grpc_host {
|
||||
let grpc_port = self.grpc_port.unwrap_or_default();
|
||||
write!(f, "grpc://{grpc_host}:{grpc_port} ")?;
|
||||
}
|
||||
write!(f, "http://{}:{} ", self.http_host, self.http_port)?;
|
||||
write!(f, "other:{:?}", self.other)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// PostHog integration config.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct PostHogConfig {
|
||||
@@ -356,21 +339,16 @@ pub struct TimelineImportConfig {
|
||||
pub struct BasebackupCacheConfig {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub cleanup_period: Duration,
|
||||
/// Maximum total size of basebackup cache entries on disk in bytes.
|
||||
/// The cache may slightly exceed this limit because we do not know
|
||||
/// the exact size of the cache entry untill it's written to disk.
|
||||
pub max_total_size_bytes: u64,
|
||||
// TODO(diko): support max_entry_size_bytes.
|
||||
// pub max_entry_size_bytes: u64,
|
||||
pub max_size_entries: usize,
|
||||
// FIXME: Support max_size_bytes.
|
||||
// pub max_size_bytes: usize,
|
||||
pub max_size_entries: i64,
|
||||
}
|
||||
|
||||
impl Default for BasebackupCacheConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
cleanup_period: Duration::from_secs(60),
|
||||
max_total_size_bytes: 1024 * 1024 * 1024, // 1 GiB
|
||||
// max_entry_size_bytes: 16 * 1024 * 1024, // 16 MiB
|
||||
// max_size_bytes: 1024 * 1024 * 1024, // 1 GiB
|
||||
max_size_entries: 1000,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -52,6 +52,7 @@ pub struct NodeRegisterRequest {
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
|
||||
pub listen_grpc_addr: Option<String>,
|
||||
pub listen_grpc_port: Option<u16>,
|
||||
|
||||
@@ -103,6 +104,7 @@ pub struct TenantLocateResponseShard {
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
|
||||
pub listen_grpc_addr: Option<String>,
|
||||
pub listen_grpc_port: Option<u16>,
|
||||
|
||||
@@ -350,35 +352,6 @@ impl Default for ShardSchedulingPolicy {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||
pub enum NodeLifecycle {
|
||||
Active,
|
||||
Deleted,
|
||||
}
|
||||
|
||||
impl FromStr for NodeLifecycle {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self::Active),
|
||||
"deleted" => Ok(Self::Deleted),
|
||||
_ => Err(anyhow::anyhow!("Unknown node lifecycle '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<NodeLifecycle> for String {
|
||||
fn from(value: NodeLifecycle) -> String {
|
||||
use NodeLifecycle::*;
|
||||
match value {
|
||||
Active => "active",
|
||||
Deleted => "deleted",
|
||||
}
|
||||
.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||
pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
|
||||
@@ -4,8 +4,8 @@ use std::ops::Range;
|
||||
use anyhow::{Result, bail};
|
||||
use byteorder::{BE, ByteOrder};
|
||||
use bytes::Bytes;
|
||||
use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi_types::{Oid, RepOriginId};
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::{Oid, RepOriginId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::const_assert;
|
||||
|
||||
@@ -194,7 +194,7 @@ impl Key {
|
||||
/// will be rejected on the write path.
|
||||
#[allow(dead_code)]
|
||||
pub fn is_valid_key_on_write_path_strong(&self) -> bool {
|
||||
use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
||||
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
||||
if !self.is_i128_representable() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::ops::Range;
|
||||
|
||||
use itertools::Itertools;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
|
||||
use crate::key::Key;
|
||||
use crate::shard::{ShardCount, ShardIdentity};
|
||||
@@ -268,13 +269,9 @@ impl KeySpace {
|
||||
/// Partition a key space into roughly chunks of roughly 'target_size' bytes
|
||||
/// in each partition.
|
||||
///
|
||||
pub fn partition(
|
||||
&self,
|
||||
shard_identity: &ShardIdentity,
|
||||
target_size: u64,
|
||||
block_size: u64,
|
||||
) -> KeyPartitioning {
|
||||
let target_nblocks = (target_size / block_size) as u32;
|
||||
pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning {
|
||||
// Assume that each value is 8k in size.
|
||||
let target_nblocks = (target_size / BLCKSZ as u64) as u32;
|
||||
|
||||
let mut parts = Vec::new();
|
||||
let mut current_part = Vec::new();
|
||||
|
||||
@@ -5,10 +5,11 @@ pub mod controller_api;
|
||||
pub mod key;
|
||||
pub mod keyspace;
|
||||
pub mod models;
|
||||
pub mod pagestream_api;
|
||||
pub mod record;
|
||||
pub mod reltag;
|
||||
pub mod shard;
|
||||
/// Public API types
|
||||
pub mod upcall_api;
|
||||
pub mod value;
|
||||
|
||||
pub mod config;
|
||||
|
||||
@@ -5,12 +5,16 @@ pub mod utilization;
|
||||
use core::ops::Range;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
use std::io::{BufRead, Read};
|
||||
use std::num::{NonZeroU32, NonZeroU64, NonZeroUsize};
|
||||
use std::str::FromStr;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
#[cfg(feature = "testing")]
|
||||
use camino::Utf8PathBuf;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use serde_with::serde_as;
|
||||
pub use utilization::PageserverUtilization;
|
||||
@@ -20,6 +24,7 @@ use utils::{completion, serde_system_time};
|
||||
|
||||
use crate::config::Ratio;
|
||||
use crate::key::{CompactKey, Key};
|
||||
use crate::reltag::RelTag;
|
||||
use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
|
||||
|
||||
/// The state of a tenant in this pageserver.
|
||||
@@ -1902,6 +1907,219 @@ pub struct ScanDisposableKeysResponse {
|
||||
pub not_disposable_count: usize,
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
pub enum PagestreamFeMessage {
|
||||
Exists(PagestreamExistsRequest),
|
||||
Nblocks(PagestreamNblocksRequest),
|
||||
GetPage(PagestreamGetPageRequest),
|
||||
DbSize(PagestreamDbSizeRequest),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentRequest),
|
||||
#[cfg(feature = "testing")]
|
||||
Test(PagestreamTestRequest),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
#[derive(Debug, strum_macros::EnumProperty)]
|
||||
pub enum PagestreamBeMessage {
|
||||
Exists(PagestreamExistsResponse),
|
||||
Nblocks(PagestreamNblocksResponse),
|
||||
GetPage(PagestreamGetPageResponse),
|
||||
Error(PagestreamErrorResponse),
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentResponse),
|
||||
#[cfg(feature = "testing")]
|
||||
Test(PagestreamTestResponse),
|
||||
}
|
||||
|
||||
// Keep in sync with `pagestore_client.h`
|
||||
#[repr(u8)]
|
||||
enum PagestreamFeMessageTag {
|
||||
Exists = 0,
|
||||
Nblocks = 1,
|
||||
GetPage = 2,
|
||||
DbSize = 3,
|
||||
GetSlruSegment = 4,
|
||||
/* future tags above this line */
|
||||
/// For testing purposes, not available in production.
|
||||
#[cfg(feature = "testing")]
|
||||
Test = 99,
|
||||
}
|
||||
|
||||
// Keep in sync with `pagestore_client.h`
|
||||
#[repr(u8)]
|
||||
enum PagestreamBeMessageTag {
|
||||
Exists = 100,
|
||||
Nblocks = 101,
|
||||
GetPage = 102,
|
||||
Error = 103,
|
||||
DbSize = 104,
|
||||
GetSlruSegment = 105,
|
||||
/* future tags above this line */
|
||||
/// For testing purposes, not available in production.
|
||||
#[cfg(feature = "testing")]
|
||||
Test = 199,
|
||||
}
|
||||
|
||||
impl TryFrom<u8> for PagestreamFeMessageTag {
|
||||
type Error = u8;
|
||||
fn try_from(value: u8) -> Result<Self, u8> {
|
||||
match value {
|
||||
0 => Ok(PagestreamFeMessageTag::Exists),
|
||||
1 => Ok(PagestreamFeMessageTag::Nblocks),
|
||||
2 => Ok(PagestreamFeMessageTag::GetPage),
|
||||
3 => Ok(PagestreamFeMessageTag::DbSize),
|
||||
4 => Ok(PagestreamFeMessageTag::GetSlruSegment),
|
||||
#[cfg(feature = "testing")]
|
||||
99 => Ok(PagestreamFeMessageTag::Test),
|
||||
_ => Err(value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
type Error = u8;
|
||||
fn try_from(value: u8) -> Result<Self, u8> {
|
||||
match value {
|
||||
100 => Ok(PagestreamBeMessageTag::Exists),
|
||||
101 => Ok(PagestreamBeMessageTag::Nblocks),
|
||||
102 => Ok(PagestreamBeMessageTag::GetPage),
|
||||
103 => Ok(PagestreamBeMessageTag::Error),
|
||||
104 => Ok(PagestreamBeMessageTag::DbSize),
|
||||
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
|
||||
#[cfg(feature = "testing")]
|
||||
199 => Ok(PagestreamBeMessageTag::Test),
|
||||
_ => Err(value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// A GetPage request contains two LSN values:
|
||||
//
|
||||
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
|
||||
// "get the latest version present". It's used by the primary server, which knows that no one else
|
||||
// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
|
||||
// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
|
||||
//
|
||||
// not_modified_since: Hint to the pageserver that the client knows that the page has not been
|
||||
// modified between 'not_modified_since' and the request LSN. It's always correct to set
|
||||
// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
|
||||
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
|
||||
// request without waiting for 'request_lsn' to arrive.
|
||||
//
|
||||
// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
|
||||
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
|
||||
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
|
||||
// standby to request a page at a particular non-latest LSN, and also include the
|
||||
// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
|
||||
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
|
||||
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
|
||||
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
|
||||
// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
|
||||
// difference in the responses between V1 and V2.
|
||||
//
|
||||
// V3 version of protocol adds request ID to all requests. This request ID is also included in response
|
||||
// as well as other fields from requests, which allows to verify that we receive response for our request.
|
||||
// We copy fields from request to response to make checking more reliable: request ID is formed from process ID
|
||||
// and local counter, so in principle there can be duplicated requests IDs if process PID is reused.
|
||||
//
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub enum PagestreamProtocolVersion {
|
||||
V2,
|
||||
V3,
|
||||
}
|
||||
|
||||
pub type RequestId = u64;
|
||||
|
||||
#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamRequest {
|
||||
pub reqid: RequestId,
|
||||
pub request_lsn: Lsn,
|
||||
pub not_modified_since: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamExistsRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamNblocksRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamGetPageRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub rel: RelTag,
|
||||
pub blkno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamDbSizeRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub dbnode: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamGetSlruSegmentRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub kind: u8,
|
||||
pub segno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamExistsResponse {
|
||||
pub req: PagestreamExistsRequest,
|
||||
pub exists: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamNblocksResponse {
|
||||
pub req: PagestreamNblocksRequest,
|
||||
pub n_blocks: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamGetPageResponse {
|
||||
pub req: PagestreamGetPageRequest,
|
||||
pub page: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamGetSlruSegmentResponse {
|
||||
pub req: PagestreamGetSlruSegmentRequest,
|
||||
pub segment: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamErrorResponse {
|
||||
pub req: PagestreamRequest,
|
||||
pub message: String,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamDbSizeResponse {
|
||||
pub req: PagestreamDbSizeRequest,
|
||||
pub db_size: i64,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct PagestreamTestRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub batch_key: u64,
|
||||
pub message: String,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamTestResponse {
|
||||
pub req: PagestreamTestRequest,
|
||||
}
|
||||
|
||||
// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
|
||||
// that require pageserver-internal types. It is sufficient to get the total size.
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -1913,6 +2131,506 @@ pub struct TenantHistorySize {
|
||||
pub size: Option<u64>,
|
||||
}
|
||||
|
||||
impl PagestreamFeMessage {
|
||||
/// Serialize a compute -> pageserver message. This is currently only used in testing
|
||||
/// tools. Always uses protocol version 3.
|
||||
pub fn serialize(&self) -> Bytes {
|
||||
let mut bytes = BytesMut::new();
|
||||
|
||||
match self {
|
||||
Self::Exists(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::Exists as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
bytes.put_u8(req.rel.forknum);
|
||||
}
|
||||
|
||||
Self::Nblocks(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
bytes.put_u8(req.rel.forknum);
|
||||
}
|
||||
|
||||
Self::GetPage(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::GetPage as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
bytes.put_u8(req.rel.forknum);
|
||||
bytes.put_u32(req.blkno);
|
||||
}
|
||||
|
||||
Self::DbSize(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::DbSize as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(req.dbnode);
|
||||
}
|
||||
|
||||
Self::GetSlruSegment(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u8(req.kind);
|
||||
bytes.put_u32(req.segno);
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::Test as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(req.batch_key);
|
||||
let message = req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
}
|
||||
|
||||
pub fn parse<R: std::io::Read>(
|
||||
body: &mut R,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
) -> anyhow::Result<PagestreamFeMessage> {
|
||||
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
||||
//
|
||||
// TODO: consider using protobuf or serde bincode for less error prone
|
||||
// serialization.
|
||||
let msg_tag = body.read_u8()?;
|
||||
let (reqid, request_lsn, not_modified_since) = match protocol_version {
|
||||
PagestreamProtocolVersion::V2 => (
|
||||
0,
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
),
|
||||
PagestreamProtocolVersion::V3 => (
|
||||
body.read_u64::<BigEndian>()?,
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
),
|
||||
};
|
||||
|
||||
match PagestreamFeMessageTag::try_from(msg_tag)
|
||||
.map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))?
|
||||
{
|
||||
PagestreamFeMessageTag::Exists => {
|
||||
Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::Nblocks => {
|
||||
Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::GetPage => {
|
||||
Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
blkno: body.read_u32::<BigEndian>()?,
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::DbSize => {
|
||||
Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment(
|
||||
PagestreamGetSlruSegmentRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
kind: body.read_u8()?,
|
||||
segno: body.read_u32::<BigEndian>()?,
|
||||
},
|
||||
)),
|
||||
#[cfg(feature = "testing")]
|
||||
PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
batch_key: body.read_u64::<BigEndian>()?,
|
||||
message: {
|
||||
let len = body.read_u64::<BigEndian>()?;
|
||||
let mut buf = vec![0; len as usize];
|
||||
body.read_exact(&mut buf)?;
|
||||
String::from_utf8(buf)?
|
||||
},
|
||||
})),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PagestreamBeMessage {
|
||||
pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes {
|
||||
let mut bytes = BytesMut::new();
|
||||
|
||||
use PagestreamBeMessageTag as Tag;
|
||||
match protocol_version {
|
||||
PagestreamProtocolVersion::V2 => {
|
||||
match self {
|
||||
Self::Exists(resp) => {
|
||||
bytes.put_u8(Tag::Exists as u8);
|
||||
bytes.put_u8(resp.exists as u8);
|
||||
}
|
||||
|
||||
Self::Nblocks(resp) => {
|
||||
bytes.put_u8(Tag::Nblocks as u8);
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
}
|
||||
|
||||
Self::GetPage(resp) => {
|
||||
bytes.put_u8(Tag::GetPage as u8);
|
||||
bytes.put(&resp.page[..])
|
||||
}
|
||||
|
||||
Self::Error(resp) => {
|
||||
bytes.put_u8(Tag::Error as u8);
|
||||
bytes.put(resp.message.as_bytes());
|
||||
bytes.put_u8(0); // null terminator
|
||||
}
|
||||
Self::DbSize(resp) => {
|
||||
bytes.put_u8(Tag::DbSize as u8);
|
||||
bytes.put_i64(resp.db_size);
|
||||
}
|
||||
|
||||
Self::GetSlruSegment(resp) => {
|
||||
bytes.put_u8(Tag::GetSlruSegment as u8);
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(resp) => {
|
||||
bytes.put_u8(Tag::Test as u8);
|
||||
bytes.put_u64(resp.req.batch_key);
|
||||
let message = resp.req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
}
|
||||
PagestreamProtocolVersion::V3 => {
|
||||
match self {
|
||||
Self::Exists(resp) => {
|
||||
bytes.put_u8(Tag::Exists as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(resp.req.rel.spcnode);
|
||||
bytes.put_u32(resp.req.rel.dbnode);
|
||||
bytes.put_u32(resp.req.rel.relnode);
|
||||
bytes.put_u8(resp.req.rel.forknum);
|
||||
bytes.put_u8(resp.exists as u8);
|
||||
}
|
||||
|
||||
Self::Nblocks(resp) => {
|
||||
bytes.put_u8(Tag::Nblocks as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(resp.req.rel.spcnode);
|
||||
bytes.put_u32(resp.req.rel.dbnode);
|
||||
bytes.put_u32(resp.req.rel.relnode);
|
||||
bytes.put_u8(resp.req.rel.forknum);
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
}
|
||||
|
||||
Self::GetPage(resp) => {
|
||||
bytes.put_u8(Tag::GetPage as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(resp.req.rel.spcnode);
|
||||
bytes.put_u32(resp.req.rel.dbnode);
|
||||
bytes.put_u32(resp.req.rel.relnode);
|
||||
bytes.put_u8(resp.req.rel.forknum);
|
||||
bytes.put_u32(resp.req.blkno);
|
||||
bytes.put(&resp.page[..])
|
||||
}
|
||||
|
||||
Self::Error(resp) => {
|
||||
bytes.put_u8(Tag::Error as u8);
|
||||
bytes.put_u64(resp.req.reqid);
|
||||
bytes.put_u64(resp.req.request_lsn.0);
|
||||
bytes.put_u64(resp.req.not_modified_since.0);
|
||||
bytes.put(resp.message.as_bytes());
|
||||
bytes.put_u8(0); // null terminator
|
||||
}
|
||||
Self::DbSize(resp) => {
|
||||
bytes.put_u8(Tag::DbSize as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(resp.req.dbnode);
|
||||
bytes.put_i64(resp.db_size);
|
||||
}
|
||||
|
||||
Self::GetSlruSegment(resp) => {
|
||||
bytes.put_u8(Tag::GetSlruSegment as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u8(resp.req.kind);
|
||||
bytes.put_u32(resp.req.segno);
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(resp) => {
|
||||
bytes.put_u8(Tag::Test as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(resp.req.batch_key);
|
||||
let message = resp.req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
bytes.into()
|
||||
}
|
||||
|
||||
pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
|
||||
let mut buf = buf.reader();
|
||||
let msg_tag = buf.read_u8()?;
|
||||
|
||||
use PagestreamBeMessageTag as Tag;
|
||||
let ok =
|
||||
match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
|
||||
Tag::Exists => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let rel = RelTag {
|
||||
spcnode: buf.read_u32::<BigEndian>()?,
|
||||
dbnode: buf.read_u32::<BigEndian>()?,
|
||||
relnode: buf.read_u32::<BigEndian>()?,
|
||||
forknum: buf.read_u8()?,
|
||||
};
|
||||
let exists = buf.read_u8()? != 0;
|
||||
Self::Exists(PagestreamExistsResponse {
|
||||
req: PagestreamExistsRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel,
|
||||
},
|
||||
exists,
|
||||
})
|
||||
}
|
||||
Tag::Nblocks => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let rel = RelTag {
|
||||
spcnode: buf.read_u32::<BigEndian>()?,
|
||||
dbnode: buf.read_u32::<BigEndian>()?,
|
||||
relnode: buf.read_u32::<BigEndian>()?,
|
||||
forknum: buf.read_u8()?,
|
||||
};
|
||||
let n_blocks = buf.read_u32::<BigEndian>()?;
|
||||
Self::Nblocks(PagestreamNblocksResponse {
|
||||
req: PagestreamNblocksRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel,
|
||||
},
|
||||
n_blocks,
|
||||
})
|
||||
}
|
||||
Tag::GetPage => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let rel = RelTag {
|
||||
spcnode: buf.read_u32::<BigEndian>()?,
|
||||
dbnode: buf.read_u32::<BigEndian>()?,
|
||||
relnode: buf.read_u32::<BigEndian>()?,
|
||||
forknum: buf.read_u8()?,
|
||||
};
|
||||
let blkno = buf.read_u32::<BigEndian>()?;
|
||||
let mut page = vec![0; 8192]; // TODO: use MaybeUninit
|
||||
buf.read_exact(&mut page)?;
|
||||
Self::GetPage(PagestreamGetPageResponse {
|
||||
req: PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel,
|
||||
blkno,
|
||||
},
|
||||
page: page.into(),
|
||||
})
|
||||
}
|
||||
Tag::Error => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let mut msg = Vec::new();
|
||||
buf.read_until(0, &mut msg)?;
|
||||
let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
|
||||
let rust_str = cstring.to_str()?;
|
||||
Self::Error(PagestreamErrorResponse {
|
||||
req: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
message: rust_str.to_owned(),
|
||||
})
|
||||
}
|
||||
Tag::DbSize => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let dbnode = buf.read_u32::<BigEndian>()?;
|
||||
let db_size = buf.read_i64::<BigEndian>()?;
|
||||
Self::DbSize(PagestreamDbSizeResponse {
|
||||
req: PagestreamDbSizeRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
dbnode,
|
||||
},
|
||||
db_size,
|
||||
})
|
||||
}
|
||||
Tag::GetSlruSegment => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let kind = buf.read_u8()?;
|
||||
let segno = buf.read_u32::<BigEndian>()?;
|
||||
let n_blocks = buf.read_u32::<BigEndian>()?;
|
||||
let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
|
||||
buf.read_exact(&mut segment)?;
|
||||
Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
|
||||
req: PagestreamGetSlruSegmentRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
kind,
|
||||
segno,
|
||||
},
|
||||
segment: segment.into(),
|
||||
})
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Tag::Test => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let batch_key = buf.read_u64::<BigEndian>()?;
|
||||
let len = buf.read_u64::<BigEndian>()?;
|
||||
let mut msg = vec![0; len as usize];
|
||||
buf.read_exact(&mut msg)?;
|
||||
let message = String::from_utf8(msg)?;
|
||||
Self::Test(PagestreamTestResponse {
|
||||
req: PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
batch_key,
|
||||
message,
|
||||
},
|
||||
})
|
||||
}
|
||||
};
|
||||
let remaining = buf.into_inner();
|
||||
if !remaining.is_empty() {
|
||||
anyhow::bail!(
|
||||
"remaining bytes in msg with tag={msg_tag}: {}",
|
||||
remaining.len()
|
||||
);
|
||||
}
|
||||
Ok(ok)
|
||||
}
|
||||
|
||||
pub fn kind(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Exists(_) => "Exists",
|
||||
Self::Nblocks(_) => "Nblocks",
|
||||
Self::GetPage(_) => "GetPage",
|
||||
Self::Error(_) => "Error",
|
||||
Self::DbSize(_) => "DbSize",
|
||||
Self::GetSlruSegment(_) => "GetSlruSegment",
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(_) => "Test",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct PageTraceEvent {
|
||||
pub key: CompactKey,
|
||||
@@ -1938,6 +2656,68 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_pagestream() {
|
||||
// Test serialization/deserialization of PagestreamFeMessage
|
||||
let messages = vec![
|
||||
PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(3),
|
||||
},
|
||||
rel: RelTag {
|
||||
forknum: 1,
|
||||
spcnode: 2,
|
||||
dbnode: 3,
|
||||
relnode: 4,
|
||||
},
|
||||
}),
|
||||
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(4),
|
||||
},
|
||||
rel: RelTag {
|
||||
forknum: 1,
|
||||
spcnode: 2,
|
||||
dbnode: 3,
|
||||
relnode: 4,
|
||||
},
|
||||
}),
|
||||
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(3),
|
||||
},
|
||||
rel: RelTag {
|
||||
forknum: 1,
|
||||
spcnode: 2,
|
||||
dbnode: 3,
|
||||
relnode: 4,
|
||||
},
|
||||
blkno: 7,
|
||||
}),
|
||||
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(3),
|
||||
},
|
||||
dbnode: 7,
|
||||
}),
|
||||
];
|
||||
for msg in messages {
|
||||
let bytes = msg.serialize();
|
||||
let reconstructed =
|
||||
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3)
|
||||
.unwrap();
|
||||
assert!(msg == reconstructed);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tenantinfo_serde() {
|
||||
// Test serialization/deserialization of TenantInfo
|
||||
|
||||
@@ -1,798 +0,0 @@
|
||||
//! Rust definitions of the libpq-based pagestream API
|
||||
//!
|
||||
//! See also the C implementation of the same API in pgxn/neon/pagestore_client.h
|
||||
|
||||
use std::io::{BufRead, Read};
|
||||
|
||||
use crate::reltag::RelTag;
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Block size.
|
||||
///
|
||||
/// XXX: We assume 8k block size in the SLRU fetch API. It's not great to hardcode
|
||||
/// that in the protocol, because Postgres supports different block sizes as a compile
|
||||
/// time option.
|
||||
const BLCKSZ: usize = 8192;
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
pub enum PagestreamFeMessage {
|
||||
Exists(PagestreamExistsRequest),
|
||||
Nblocks(PagestreamNblocksRequest),
|
||||
GetPage(PagestreamGetPageRequest),
|
||||
DbSize(PagestreamDbSizeRequest),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentRequest),
|
||||
#[cfg(feature = "testing")]
|
||||
Test(PagestreamTestRequest),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
#[derive(Debug, strum_macros::EnumProperty)]
|
||||
pub enum PagestreamBeMessage {
|
||||
Exists(PagestreamExistsResponse),
|
||||
Nblocks(PagestreamNblocksResponse),
|
||||
GetPage(PagestreamGetPageResponse),
|
||||
Error(PagestreamErrorResponse),
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentResponse),
|
||||
#[cfg(feature = "testing")]
|
||||
Test(PagestreamTestResponse),
|
||||
}
|
||||
|
||||
// Keep in sync with `pagestore_client.h`
|
||||
#[repr(u8)]
|
||||
enum PagestreamFeMessageTag {
|
||||
Exists = 0,
|
||||
Nblocks = 1,
|
||||
GetPage = 2,
|
||||
DbSize = 3,
|
||||
GetSlruSegment = 4,
|
||||
/* future tags above this line */
|
||||
/// For testing purposes, not available in production.
|
||||
#[cfg(feature = "testing")]
|
||||
Test = 99,
|
||||
}
|
||||
|
||||
// Keep in sync with `pagestore_client.h`
|
||||
#[repr(u8)]
|
||||
enum PagestreamBeMessageTag {
|
||||
Exists = 100,
|
||||
Nblocks = 101,
|
||||
GetPage = 102,
|
||||
Error = 103,
|
||||
DbSize = 104,
|
||||
GetSlruSegment = 105,
|
||||
/* future tags above this line */
|
||||
/// For testing purposes, not available in production.
|
||||
#[cfg(feature = "testing")]
|
||||
Test = 199,
|
||||
}
|
||||
|
||||
impl TryFrom<u8> for PagestreamFeMessageTag {
|
||||
type Error = u8;
|
||||
fn try_from(value: u8) -> Result<Self, u8> {
|
||||
match value {
|
||||
0 => Ok(PagestreamFeMessageTag::Exists),
|
||||
1 => Ok(PagestreamFeMessageTag::Nblocks),
|
||||
2 => Ok(PagestreamFeMessageTag::GetPage),
|
||||
3 => Ok(PagestreamFeMessageTag::DbSize),
|
||||
4 => Ok(PagestreamFeMessageTag::GetSlruSegment),
|
||||
#[cfg(feature = "testing")]
|
||||
99 => Ok(PagestreamFeMessageTag::Test),
|
||||
_ => Err(value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
type Error = u8;
|
||||
fn try_from(value: u8) -> Result<Self, u8> {
|
||||
match value {
|
||||
100 => Ok(PagestreamBeMessageTag::Exists),
|
||||
101 => Ok(PagestreamBeMessageTag::Nblocks),
|
||||
102 => Ok(PagestreamBeMessageTag::GetPage),
|
||||
103 => Ok(PagestreamBeMessageTag::Error),
|
||||
104 => Ok(PagestreamBeMessageTag::DbSize),
|
||||
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
|
||||
#[cfg(feature = "testing")]
|
||||
199 => Ok(PagestreamBeMessageTag::Test),
|
||||
_ => Err(value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// A GetPage request contains two LSN values:
|
||||
//
|
||||
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
|
||||
// "get the latest version present". It's used by the primary server, which knows that no one else
|
||||
// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
|
||||
// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
|
||||
//
|
||||
// not_modified_since: Hint to the pageserver that the client knows that the page has not been
|
||||
// modified between 'not_modified_since' and the request LSN. It's always correct to set
|
||||
// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
|
||||
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
|
||||
// request without waiting for 'request_lsn' to arrive.
|
||||
//
|
||||
// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
|
||||
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
|
||||
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
|
||||
// standby to request a page at a particular non-latest LSN, and also include the
|
||||
// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
|
||||
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
|
||||
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
|
||||
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
|
||||
// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
|
||||
// difference in the responses between V1 and V2.
|
||||
//
|
||||
// V3 version of protocol adds request ID to all requests. This request ID is also included in response
|
||||
// as well as other fields from requests, which allows to verify that we receive response for our request.
|
||||
// We copy fields from request to response to make checking more reliable: request ID is formed from process ID
|
||||
// and local counter, so in principle there can be duplicated requests IDs if process PID is reused.
|
||||
//
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub enum PagestreamProtocolVersion {
|
||||
V2,
|
||||
V3,
|
||||
}
|
||||
|
||||
pub type RequestId = u64;
|
||||
|
||||
#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamRequest {
|
||||
pub reqid: RequestId,
|
||||
pub request_lsn: Lsn,
|
||||
pub not_modified_since: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamExistsRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamNblocksRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamGetPageRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub rel: RelTag,
|
||||
pub blkno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamDbSizeRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub dbnode: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamGetSlruSegmentRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub kind: u8,
|
||||
pub segno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamExistsResponse {
|
||||
pub req: PagestreamExistsRequest,
|
||||
pub exists: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamNblocksResponse {
|
||||
pub req: PagestreamNblocksRequest,
|
||||
pub n_blocks: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamGetPageResponse {
|
||||
pub req: PagestreamGetPageRequest,
|
||||
pub page: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamGetSlruSegmentResponse {
|
||||
pub req: PagestreamGetSlruSegmentRequest,
|
||||
pub segment: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamErrorResponse {
|
||||
pub req: PagestreamRequest,
|
||||
pub message: String,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamDbSizeResponse {
|
||||
pub req: PagestreamDbSizeRequest,
|
||||
pub db_size: i64,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct PagestreamTestRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub batch_key: u64,
|
||||
pub message: String,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamTestResponse {
|
||||
pub req: PagestreamTestRequest,
|
||||
}
|
||||
|
||||
impl PagestreamFeMessage {
|
||||
/// Serialize a compute -> pageserver message. This is currently only used in testing
|
||||
/// tools. Always uses protocol version 3.
|
||||
pub fn serialize(&self) -> Bytes {
|
||||
let mut bytes = BytesMut::new();
|
||||
|
||||
match self {
|
||||
Self::Exists(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::Exists as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
bytes.put_u8(req.rel.forknum);
|
||||
}
|
||||
|
||||
Self::Nblocks(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
bytes.put_u8(req.rel.forknum);
|
||||
}
|
||||
|
||||
Self::GetPage(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::GetPage as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
bytes.put_u8(req.rel.forknum);
|
||||
bytes.put_u32(req.blkno);
|
||||
}
|
||||
|
||||
Self::DbSize(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::DbSize as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(req.dbnode);
|
||||
}
|
||||
|
||||
Self::GetSlruSegment(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u8(req.kind);
|
||||
bytes.put_u32(req.segno);
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::Test as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(req.batch_key);
|
||||
let message = req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
}
|
||||
|
||||
pub fn parse<R: std::io::Read>(
|
||||
body: &mut R,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
) -> anyhow::Result<PagestreamFeMessage> {
|
||||
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
||||
//
|
||||
// TODO: consider using protobuf or serde bincode for less error prone
|
||||
// serialization.
|
||||
let msg_tag = body.read_u8()?;
|
||||
let (reqid, request_lsn, not_modified_since) = match protocol_version {
|
||||
PagestreamProtocolVersion::V2 => (
|
||||
0,
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
),
|
||||
PagestreamProtocolVersion::V3 => (
|
||||
body.read_u64::<BigEndian>()?,
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
),
|
||||
};
|
||||
|
||||
match PagestreamFeMessageTag::try_from(msg_tag)
|
||||
.map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))?
|
||||
{
|
||||
PagestreamFeMessageTag::Exists => {
|
||||
Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::Nblocks => {
|
||||
Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::GetPage => {
|
||||
Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
blkno: body.read_u32::<BigEndian>()?,
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::DbSize => {
|
||||
Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment(
|
||||
PagestreamGetSlruSegmentRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
kind: body.read_u8()?,
|
||||
segno: body.read_u32::<BigEndian>()?,
|
||||
},
|
||||
)),
|
||||
#[cfg(feature = "testing")]
|
||||
PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
batch_key: body.read_u64::<BigEndian>()?,
|
||||
message: {
|
||||
let len = body.read_u64::<BigEndian>()?;
|
||||
let mut buf = vec![0; len as usize];
|
||||
body.read_exact(&mut buf)?;
|
||||
String::from_utf8(buf)?
|
||||
},
|
||||
})),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PagestreamBeMessage {
|
||||
pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes {
|
||||
let mut bytes = BytesMut::new();
|
||||
|
||||
use PagestreamBeMessageTag as Tag;
|
||||
match protocol_version {
|
||||
PagestreamProtocolVersion::V2 => {
|
||||
match self {
|
||||
Self::Exists(resp) => {
|
||||
bytes.put_u8(Tag::Exists as u8);
|
||||
bytes.put_u8(resp.exists as u8);
|
||||
}
|
||||
|
||||
Self::Nblocks(resp) => {
|
||||
bytes.put_u8(Tag::Nblocks as u8);
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
}
|
||||
|
||||
Self::GetPage(resp) => {
|
||||
bytes.put_u8(Tag::GetPage as u8);
|
||||
bytes.put(&resp.page[..])
|
||||
}
|
||||
|
||||
Self::Error(resp) => {
|
||||
bytes.put_u8(Tag::Error as u8);
|
||||
bytes.put(resp.message.as_bytes());
|
||||
bytes.put_u8(0); // null terminator
|
||||
}
|
||||
Self::DbSize(resp) => {
|
||||
bytes.put_u8(Tag::DbSize as u8);
|
||||
bytes.put_i64(resp.db_size);
|
||||
}
|
||||
|
||||
Self::GetSlruSegment(resp) => {
|
||||
bytes.put_u8(Tag::GetSlruSegment as u8);
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(resp) => {
|
||||
bytes.put_u8(Tag::Test as u8);
|
||||
bytes.put_u64(resp.req.batch_key);
|
||||
let message = resp.req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
}
|
||||
PagestreamProtocolVersion::V3 => {
|
||||
match self {
|
||||
Self::Exists(resp) => {
|
||||
bytes.put_u8(Tag::Exists as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(resp.req.rel.spcnode);
|
||||
bytes.put_u32(resp.req.rel.dbnode);
|
||||
bytes.put_u32(resp.req.rel.relnode);
|
||||
bytes.put_u8(resp.req.rel.forknum);
|
||||
bytes.put_u8(resp.exists as u8);
|
||||
}
|
||||
|
||||
Self::Nblocks(resp) => {
|
||||
bytes.put_u8(Tag::Nblocks as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(resp.req.rel.spcnode);
|
||||
bytes.put_u32(resp.req.rel.dbnode);
|
||||
bytes.put_u32(resp.req.rel.relnode);
|
||||
bytes.put_u8(resp.req.rel.forknum);
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
}
|
||||
|
||||
Self::GetPage(resp) => {
|
||||
bytes.put_u8(Tag::GetPage as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(resp.req.rel.spcnode);
|
||||
bytes.put_u32(resp.req.rel.dbnode);
|
||||
bytes.put_u32(resp.req.rel.relnode);
|
||||
bytes.put_u8(resp.req.rel.forknum);
|
||||
bytes.put_u32(resp.req.blkno);
|
||||
bytes.put(&resp.page[..])
|
||||
}
|
||||
|
||||
Self::Error(resp) => {
|
||||
bytes.put_u8(Tag::Error as u8);
|
||||
bytes.put_u64(resp.req.reqid);
|
||||
bytes.put_u64(resp.req.request_lsn.0);
|
||||
bytes.put_u64(resp.req.not_modified_since.0);
|
||||
bytes.put(resp.message.as_bytes());
|
||||
bytes.put_u8(0); // null terminator
|
||||
}
|
||||
Self::DbSize(resp) => {
|
||||
bytes.put_u8(Tag::DbSize as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(resp.req.dbnode);
|
||||
bytes.put_i64(resp.db_size);
|
||||
}
|
||||
|
||||
Self::GetSlruSegment(resp) => {
|
||||
bytes.put_u8(Tag::GetSlruSegment as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u8(resp.req.kind);
|
||||
bytes.put_u32(resp.req.segno);
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(resp) => {
|
||||
bytes.put_u8(Tag::Test as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(resp.req.batch_key);
|
||||
let message = resp.req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
bytes.into()
|
||||
}
|
||||
|
||||
pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
|
||||
let mut buf = buf.reader();
|
||||
let msg_tag = buf.read_u8()?;
|
||||
|
||||
use PagestreamBeMessageTag as Tag;
|
||||
let ok =
|
||||
match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
|
||||
Tag::Exists => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let rel = RelTag {
|
||||
spcnode: buf.read_u32::<BigEndian>()?,
|
||||
dbnode: buf.read_u32::<BigEndian>()?,
|
||||
relnode: buf.read_u32::<BigEndian>()?,
|
||||
forknum: buf.read_u8()?,
|
||||
};
|
||||
let exists = buf.read_u8()? != 0;
|
||||
Self::Exists(PagestreamExistsResponse {
|
||||
req: PagestreamExistsRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel,
|
||||
},
|
||||
exists,
|
||||
})
|
||||
}
|
||||
Tag::Nblocks => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let rel = RelTag {
|
||||
spcnode: buf.read_u32::<BigEndian>()?,
|
||||
dbnode: buf.read_u32::<BigEndian>()?,
|
||||
relnode: buf.read_u32::<BigEndian>()?,
|
||||
forknum: buf.read_u8()?,
|
||||
};
|
||||
let n_blocks = buf.read_u32::<BigEndian>()?;
|
||||
Self::Nblocks(PagestreamNblocksResponse {
|
||||
req: PagestreamNblocksRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel,
|
||||
},
|
||||
n_blocks,
|
||||
})
|
||||
}
|
||||
Tag::GetPage => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let rel = RelTag {
|
||||
spcnode: buf.read_u32::<BigEndian>()?,
|
||||
dbnode: buf.read_u32::<BigEndian>()?,
|
||||
relnode: buf.read_u32::<BigEndian>()?,
|
||||
forknum: buf.read_u8()?,
|
||||
};
|
||||
let blkno = buf.read_u32::<BigEndian>()?;
|
||||
let mut page = vec![0; 8192]; // TODO: use MaybeUninit
|
||||
buf.read_exact(&mut page)?;
|
||||
Self::GetPage(PagestreamGetPageResponse {
|
||||
req: PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel,
|
||||
blkno,
|
||||
},
|
||||
page: page.into(),
|
||||
})
|
||||
}
|
||||
Tag::Error => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let mut msg = Vec::new();
|
||||
buf.read_until(0, &mut msg)?;
|
||||
let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
|
||||
let rust_str = cstring.to_str()?;
|
||||
Self::Error(PagestreamErrorResponse {
|
||||
req: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
message: rust_str.to_owned(),
|
||||
})
|
||||
}
|
||||
Tag::DbSize => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let dbnode = buf.read_u32::<BigEndian>()?;
|
||||
let db_size = buf.read_i64::<BigEndian>()?;
|
||||
Self::DbSize(PagestreamDbSizeResponse {
|
||||
req: PagestreamDbSizeRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
dbnode,
|
||||
},
|
||||
db_size,
|
||||
})
|
||||
}
|
||||
Tag::GetSlruSegment => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let kind = buf.read_u8()?;
|
||||
let segno = buf.read_u32::<BigEndian>()?;
|
||||
let n_blocks = buf.read_u32::<BigEndian>()?;
|
||||
let mut segment = vec![0; n_blocks as usize * BLCKSZ];
|
||||
buf.read_exact(&mut segment)?;
|
||||
Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
|
||||
req: PagestreamGetSlruSegmentRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
kind,
|
||||
segno,
|
||||
},
|
||||
segment: segment.into(),
|
||||
})
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Tag::Test => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let batch_key = buf.read_u64::<BigEndian>()?;
|
||||
let len = buf.read_u64::<BigEndian>()?;
|
||||
let mut msg = vec![0; len as usize];
|
||||
buf.read_exact(&mut msg)?;
|
||||
let message = String::from_utf8(msg)?;
|
||||
Self::Test(PagestreamTestResponse {
|
||||
req: PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
batch_key,
|
||||
message,
|
||||
},
|
||||
})
|
||||
}
|
||||
};
|
||||
let remaining = buf.into_inner();
|
||||
if !remaining.is_empty() {
|
||||
anyhow::bail!(
|
||||
"remaining bytes in msg with tag={msg_tag}: {}",
|
||||
remaining.len()
|
||||
);
|
||||
}
|
||||
Ok(ok)
|
||||
}
|
||||
|
||||
pub fn kind(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Exists(_) => "Exists",
|
||||
Self::Nblocks(_) => "Nblocks",
|
||||
Self::GetPage(_) => "GetPage",
|
||||
Self::Error(_) => "Error",
|
||||
Self::DbSize(_) => "DbSize",
|
||||
Self::GetSlruSegment(_) => "GetSlruSegment",
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(_) => "Test",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_pagestream() {
|
||||
// Test serialization/deserialization of PagestreamFeMessage
|
||||
let messages = vec![
|
||||
PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(3),
|
||||
},
|
||||
rel: RelTag {
|
||||
forknum: 1,
|
||||
spcnode: 2,
|
||||
dbnode: 3,
|
||||
relnode: 4,
|
||||
},
|
||||
}),
|
||||
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(4),
|
||||
},
|
||||
rel: RelTag {
|
||||
forknum: 1,
|
||||
spcnode: 2,
|
||||
dbnode: 3,
|
||||
relnode: 4,
|
||||
},
|
||||
}),
|
||||
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(3),
|
||||
},
|
||||
rel: RelTag {
|
||||
forknum: 1,
|
||||
spcnode: 2,
|
||||
dbnode: 3,
|
||||
relnode: 4,
|
||||
},
|
||||
blkno: 7,
|
||||
}),
|
||||
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(3),
|
||||
},
|
||||
dbnode: 7,
|
||||
}),
|
||||
];
|
||||
for msg in messages {
|
||||
let bytes = msg.serialize();
|
||||
let reconstructed =
|
||||
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3)
|
||||
.unwrap();
|
||||
assert!(msg == reconstructed);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,9 +1,9 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
|
||||
use postgres_ffi_types::Oid;
|
||||
use postgres_ffi_types::constants::GLOBALTABLESPACE_OID;
|
||||
use postgres_ffi_types::forknum::{MAIN_FORKNUM, forkname_to_number, forknumber_to_name};
|
||||
use postgres_ffi::Oid;
|
||||
use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
|
||||
use postgres_ffi::relfile_utils::{MAIN_FORKNUM, forkname_to_number, forknumber_to_name};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
///
|
||||
|
||||
@@ -35,7 +35,7 @@ use std::hash::{Hash, Hasher};
|
||||
|
||||
#[doc(inline)]
|
||||
pub use ::utils::shard::*;
|
||||
use postgres_ffi_types::forknum::INIT_FORKNUM;
|
||||
use postgres_ffi::relfile_utils::INIT_FORKNUM;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::key::Key;
|
||||
|
||||
@@ -9,7 +9,7 @@ use utils::id::{NodeId, TimelineId};
|
||||
|
||||
use crate::controller_api::NodeRegisterRequest;
|
||||
use crate::models::{LocationConfigMode, ShardImportStatus};
|
||||
use crate::shard::{ShardStripeSize, TenantShardId};
|
||||
use crate::shard::TenantShardId;
|
||||
|
||||
/// Upcall message sent by the pageserver to the configured `control_plane_api` on
|
||||
/// startup.
|
||||
@@ -36,10 +36,6 @@ pub struct ReAttachResponseTenant {
|
||||
/// Default value only for backward compat: this field should be set
|
||||
#[serde(default = "default_mode")]
|
||||
pub mode: LocationConfigMode,
|
||||
|
||||
// Default value only for backward compat: this field should be set
|
||||
#[serde(default = "ShardStripeSize::default")]
|
||||
pub stripe_size: ShardStripeSize,
|
||||
}
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponse {
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::models::record::NeonWalRecord;
|
||||
use crate::record::NeonWalRecord;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub enum Value {
|
||||
@@ -16,7 +16,6 @@ memoffset.workspace = true
|
||||
pprof.workspace = true
|
||||
thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
postgres_ffi_types.workspace = true
|
||||
utils.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
|
||||
@@ -11,7 +11,11 @@
|
||||
|
||||
use crate::{BLCKSZ, PageHeaderData};
|
||||
|
||||
// Note: There are a few more widely-used constants in the postgres_ffi_types::constants crate.
|
||||
//
|
||||
// From pg_tablespace_d.h
|
||||
//
|
||||
pub const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
pub const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
|
||||
// From storage_xlog.h
|
||||
pub const XLOG_SMGR_CREATE: u8 = 0x10;
|
||||
|
||||
@@ -4,7 +4,50 @@
|
||||
use once_cell::sync::OnceCell;
|
||||
use regex::Regex;
|
||||
|
||||
use postgres_ffi_types::forknum::*;
|
||||
//
|
||||
// Fork numbers, from relpath.h
|
||||
//
|
||||
pub const MAIN_FORKNUM: u8 = 0;
|
||||
pub const FSM_FORKNUM: u8 = 1;
|
||||
pub const VISIBILITYMAP_FORKNUM: u8 = 2;
|
||||
pub const INIT_FORKNUM: u8 = 3;
|
||||
|
||||
#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
|
||||
pub enum FilePathError {
|
||||
#[error("invalid relation fork name")]
|
||||
InvalidForkName,
|
||||
#[error("invalid relation data file name")]
|
||||
InvalidFileName,
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(_e: core::num::ParseIntError) -> Self {
|
||||
FilePathError::InvalidFileName
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert Postgres relation file's fork suffix to fork number.
|
||||
pub fn forkname_to_number(forkname: Option<&str>) -> Result<u8, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(MAIN_FORKNUM),
|
||||
Some("fsm") => Ok(FSM_FORKNUM),
|
||||
Some("vm") => Ok(VISIBILITYMAP_FORKNUM),
|
||||
Some("init") => Ok(INIT_FORKNUM),
|
||||
Some(_) => Err(FilePathError::InvalidForkName),
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert Postgres fork number to the right suffix of the relation data file.
|
||||
pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
|
||||
match forknum {
|
||||
MAIN_FORKNUM => None,
|
||||
FSM_FORKNUM => Some("fsm"),
|
||||
VISIBILITYMAP_FORKNUM => Some("vm"),
|
||||
INIT_FORKNUM => Some("init"),
|
||||
_ => Some("UNKNOWN FORKNUM"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
|
||||
///
|
||||
@@ -32,9 +75,7 @@ pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
|
||||
.ok_or(FilePathError::InvalidFileName)?;
|
||||
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode = relnode_str
|
||||
.parse::<u32>()
|
||||
.map_err(|_e| FilePathError::InvalidFileName)?;
|
||||
let relnode = relnode_str.parse::<u32>()?;
|
||||
|
||||
let forkname = caps.name("forkname").map(|f| f.as_str());
|
||||
let forknum = forkname_to_number(forkname)?;
|
||||
@@ -43,11 +84,7 @@ pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
|
||||
let segno = if segno_match.is_none() {
|
||||
0
|
||||
} else {
|
||||
segno_match
|
||||
.unwrap()
|
||||
.as_str()
|
||||
.parse::<u32>()
|
||||
.map_err(|_e| FilePathError::InvalidFileName)?
|
||||
segno_match.unwrap().as_str().parse::<u32>()?
|
||||
};
|
||||
|
||||
Ok((relnode, forknum, segno))
|
||||
|
||||
@@ -1,11 +0,0 @@
|
||||
[package]
|
||||
name = "postgres_ffi_types"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
thiserror.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
@@ -1,8 +0,0 @@
|
||||
//! Misc constants, copied from PostgreSQL headers.
|
||||
//!
|
||||
//! Any constants included here must be the same in all PostgreSQL versions and unlikely to change
|
||||
//! in the future either!
|
||||
|
||||
// From pg_tablespace_d.h
|
||||
pub const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
pub const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
@@ -1,36 +0,0 @@
|
||||
// Fork numbers, from relpath.h
|
||||
pub const MAIN_FORKNUM: u8 = 0;
|
||||
pub const FSM_FORKNUM: u8 = 1;
|
||||
pub const VISIBILITYMAP_FORKNUM: u8 = 2;
|
||||
pub const INIT_FORKNUM: u8 = 3;
|
||||
|
||||
#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
|
||||
pub enum FilePathError {
|
||||
#[error("invalid relation fork name")]
|
||||
InvalidForkName,
|
||||
#[error("invalid relation data file name")]
|
||||
InvalidFileName,
|
||||
}
|
||||
|
||||
/// Convert Postgres relation file's fork suffix to fork number.
|
||||
pub fn forkname_to_number(forkname: Option<&str>) -> Result<u8, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(MAIN_FORKNUM),
|
||||
Some("fsm") => Ok(FSM_FORKNUM),
|
||||
Some("vm") => Ok(VISIBILITYMAP_FORKNUM),
|
||||
Some("init") => Ok(INIT_FORKNUM),
|
||||
Some(_) => Err(FilePathError::InvalidForkName),
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert Postgres fork number to the right suffix of the relation data file.
|
||||
pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
|
||||
match forknum {
|
||||
MAIN_FORKNUM => None,
|
||||
FSM_FORKNUM => Some("fsm"),
|
||||
VISIBILITYMAP_FORKNUM => Some("vm"),
|
||||
INIT_FORKNUM => Some("init"),
|
||||
_ => Some("UNKNOWN FORKNUM"),
|
||||
}
|
||||
}
|
||||
@@ -1,13 +0,0 @@
|
||||
//! This package contains some PostgreSQL constants and datatypes that are the same in all versions
|
||||
//! of PostgreSQL and unlikely to change in the future either. These could be derived from the
|
||||
//! PostgreSQL headers with 'bindgen', but in order to avoid proliferating the dependency to bindgen
|
||||
//! and the PostgreSQL C headers to all services, we prefer to have this small stand-alone crate for
|
||||
//! them instead.
|
||||
//!
|
||||
//! Be mindful in what you add here, as these types are deeply ingrained in the APIs.
|
||||
|
||||
pub mod constants;
|
||||
pub mod forknum;
|
||||
|
||||
pub type Oid = u32;
|
||||
pub type RepOriginId = u16;
|
||||
@@ -55,16 +55,9 @@ impl FeatureResolverBackgroundLoop {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let project_id = this.posthog_client.config.project_id.parse::<u64>().ok();
|
||||
match FeatureStore::new_with_flags(resp.flags, project_id) {
|
||||
Ok(feature_store) => {
|
||||
this.feature_store.store(Arc::new(feature_store));
|
||||
tracing::info!("Feature flag updated");
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Cannot process feature flag spec: {}", e);
|
||||
}
|
||||
}
|
||||
let feature_store = FeatureStore::new_with_flags(resp.flags);
|
||||
this.feature_store.store(Arc::new(feature_store));
|
||||
tracing::info!("Feature flag updated");
|
||||
}
|
||||
tracing::info!("PostHog feature resolver stopped");
|
||||
}
|
||||
|
||||
@@ -39,9 +39,6 @@ pub struct LocalEvaluationResponse {
|
||||
|
||||
#[derive(Deserialize)]
|
||||
pub struct LocalEvaluationFlag {
|
||||
#[allow(dead_code)]
|
||||
id: u64,
|
||||
team_id: u64,
|
||||
key: String,
|
||||
filters: LocalEvaluationFlagFilters,
|
||||
active: bool,
|
||||
@@ -110,32 +107,17 @@ impl FeatureStore {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_with_flags(
|
||||
flags: Vec<LocalEvaluationFlag>,
|
||||
project_id: Option<u64>,
|
||||
) -> Result<Self, &'static str> {
|
||||
pub fn new_with_flags(flags: Vec<LocalEvaluationFlag>) -> Self {
|
||||
let mut store = Self::new();
|
||||
store.set_flags(flags, project_id)?;
|
||||
Ok(store)
|
||||
store.set_flags(flags);
|
||||
store
|
||||
}
|
||||
|
||||
pub fn set_flags(
|
||||
&mut self,
|
||||
flags: Vec<LocalEvaluationFlag>,
|
||||
project_id: Option<u64>,
|
||||
) -> Result<(), &'static str> {
|
||||
pub fn set_flags(&mut self, flags: Vec<LocalEvaluationFlag>) {
|
||||
self.flags.clear();
|
||||
for flag in flags {
|
||||
if let Some(project_id) = project_id {
|
||||
if flag.team_id != project_id {
|
||||
return Err(
|
||||
"Retrieved a spec with different project id, wrong config? Discarding the feature flags.",
|
||||
);
|
||||
}
|
||||
}
|
||||
self.flags.insert(flag.key.clone(), flag);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Generate a consistent hash for a user ID (e.g., tenant ID).
|
||||
@@ -552,13 +534,6 @@ impl PostHogClient {
|
||||
})
|
||||
}
|
||||
|
||||
/// Check if the server API key is a feature flag secure API key. This key can only be
|
||||
/// used to fetch the feature flag specs and can only be used on a undocumented API
|
||||
/// endpoint.
|
||||
fn is_feature_flag_secure_api_key(&self) -> bool {
|
||||
self.config.server_api_key.starts_with("phs_")
|
||||
}
|
||||
|
||||
/// Fetch the feature flag specs from the server.
|
||||
///
|
||||
/// This is unfortunately an undocumented API at:
|
||||
@@ -572,22 +547,10 @@ impl PostHogClient {
|
||||
) -> anyhow::Result<LocalEvaluationResponse> {
|
||||
// BASE_URL/api/projects/:project_id/feature_flags/local_evaluation
|
||||
// with bearer token of self.server_api_key
|
||||
// OR
|
||||
// BASE_URL/api/feature_flag/local_evaluation/
|
||||
// with bearer token of feature flag specific self.server_api_key
|
||||
let url = if self.is_feature_flag_secure_api_key() {
|
||||
// The new feature local evaluation secure API token
|
||||
format!(
|
||||
"{}/api/feature_flag/local_evaluation",
|
||||
self.config.private_api_url
|
||||
)
|
||||
} else {
|
||||
// The old personal API token
|
||||
format!(
|
||||
"{}/api/projects/{}/feature_flags/local_evaluation",
|
||||
self.config.private_api_url, self.config.project_id
|
||||
)
|
||||
};
|
||||
let url = format!(
|
||||
"{}/api/projects/{}/feature_flags/local_evaluation",
|
||||
self.config.private_api_url, self.config.project_id
|
||||
);
|
||||
let response = self
|
||||
.client
|
||||
.get(url)
|
||||
@@ -840,7 +803,7 @@ mod tests {
|
||||
fn evaluate_multivariate() {
|
||||
let mut store = FeatureStore::new();
|
||||
let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
|
||||
store.set_flags(response.flags, None).unwrap();
|
||||
store.set_flags(response.flags);
|
||||
|
||||
// This lacks the required properties and cannot be evaluated.
|
||||
let variant =
|
||||
@@ -910,7 +873,7 @@ mod tests {
|
||||
|
||||
let mut store = FeatureStore::new();
|
||||
let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
|
||||
store.set_flags(response.flags, None).unwrap();
|
||||
store.set_flags(response.flags);
|
||||
|
||||
// This lacks the required properties and cannot be evaluated.
|
||||
let variant = store.evaluate_boolean_inner("boolean-flag", 1.00, &HashMap::new());
|
||||
@@ -966,7 +929,7 @@ mod tests {
|
||||
|
||||
let mut store = FeatureStore::new();
|
||||
let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
|
||||
store.set_flags(response.flags, None).unwrap();
|
||||
store.set_flags(response.flags);
|
||||
|
||||
// This lacks the required properties and cannot be evaluated.
|
||||
let variant =
|
||||
|
||||
@@ -5,7 +5,7 @@ edition = "2024"
|
||||
license = "MIT/Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
base64.workspace = true
|
||||
base64 = "0.20"
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
fallible-iterator.workspace = true
|
||||
|
||||
@@ -3,8 +3,6 @@
|
||||
use std::fmt::Write;
|
||||
use std::{io, iter, mem, str};
|
||||
|
||||
use base64::Engine as _;
|
||||
use base64::prelude::BASE64_STANDARD;
|
||||
use hmac::{Hmac, Mac};
|
||||
use rand::{self, Rng};
|
||||
use sha2::digest::FixedOutput;
|
||||
@@ -228,7 +226,7 @@ impl ScramSha256 {
|
||||
|
||||
let (client_key, server_key) = match password {
|
||||
Credentials::Password(password) => {
|
||||
let salt = match BASE64_STANDARD.decode(parsed.salt) {
|
||||
let salt = match base64::decode(parsed.salt) {
|
||||
Ok(salt) => salt,
|
||||
Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)),
|
||||
};
|
||||
@@ -257,7 +255,7 @@ impl ScramSha256 {
|
||||
let mut cbind_input = vec![];
|
||||
cbind_input.extend(channel_binding.gs2_header().as_bytes());
|
||||
cbind_input.extend(channel_binding.cbind_data());
|
||||
let cbind_input = BASE64_STANDARD.encode(&cbind_input);
|
||||
let cbind_input = base64::encode(&cbind_input);
|
||||
|
||||
self.message.clear();
|
||||
write!(&mut self.message, "c={},r={}", cbind_input, parsed.nonce).unwrap();
|
||||
@@ -274,12 +272,7 @@ impl ScramSha256 {
|
||||
*proof ^= signature;
|
||||
}
|
||||
|
||||
write!(
|
||||
&mut self.message,
|
||||
",p={}",
|
||||
BASE64_STANDARD.encode(client_proof)
|
||||
)
|
||||
.unwrap();
|
||||
write!(&mut self.message, ",p={}", base64::encode(client_proof)).unwrap();
|
||||
|
||||
self.state = State::Finish {
|
||||
server_key,
|
||||
@@ -313,7 +306,7 @@ impl ScramSha256 {
|
||||
ServerFinalMessage::Verifier(verifier) => verifier,
|
||||
};
|
||||
|
||||
let verifier = match BASE64_STANDARD.decode(verifier) {
|
||||
let verifier = match base64::decode(verifier) {
|
||||
Ok(verifier) => verifier,
|
||||
Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)),
|
||||
};
|
||||
|
||||
@@ -6,8 +6,6 @@
|
||||
//! side. This is good because it ensures the cleartext password won't
|
||||
//! end up in logs pg_stat displays, etc.
|
||||
|
||||
use base64::Engine as _;
|
||||
use base64::prelude::BASE64_STANDARD;
|
||||
use hmac::{Hmac, Mac};
|
||||
use rand::RngCore;
|
||||
use sha2::digest::FixedOutput;
|
||||
@@ -85,8 +83,8 @@ pub(crate) async fn scram_sha_256_salt(
|
||||
format!(
|
||||
"SCRAM-SHA-256${}:{}${}:{}",
|
||||
SCRAM_DEFAULT_ITERATIONS,
|
||||
BASE64_STANDARD.encode(salt),
|
||||
BASE64_STANDARD.encode(stored_key),
|
||||
BASE64_STANDARD.encode(server_key)
|
||||
base64::encode(salt),
|
||||
base64::encode(stored_key),
|
||||
base64::encode(server_key)
|
||||
)
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ use crate::{Error, cancel_query_raw, connect_socket};
|
||||
pub(crate) async fn cancel_query<T>(
|
||||
config: Option<SocketConfig>,
|
||||
ssl_mode: SslMode,
|
||||
tls: T,
|
||||
mut tls: T,
|
||||
process_id: i32,
|
||||
secret_key: i32,
|
||||
) -> Result<(), Error>
|
||||
|
||||
@@ -17,6 +17,7 @@ use crate::{Client, Connection, Error};
|
||||
|
||||
/// TLS configuration.
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[non_exhaustive]
|
||||
pub enum SslMode {
|
||||
/// Do not use TLS.
|
||||
Disable,
|
||||
@@ -230,7 +231,7 @@ impl Config {
|
||||
/// Requires the `runtime` Cargo feature (enabled by default).
|
||||
pub async fn connect<T>(
|
||||
&self,
|
||||
tls: &T,
|
||||
tls: T,
|
||||
) -> Result<(Client, Connection<TcpStream, T::Stream>), Error>
|
||||
where
|
||||
T: MakeTlsConnect<TcpStream>,
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::tls::{MakeTlsConnect, TlsConnect};
|
||||
use crate::{Client, Config, Connection, Error, RawConnection};
|
||||
|
||||
pub async fn connect<T>(
|
||||
tls: &T,
|
||||
mut tls: T,
|
||||
config: &Config,
|
||||
) -> Result<(Client, Connection<TcpStream, T::Stream>), Error>
|
||||
where
|
||||
|
||||
@@ -47,7 +47,7 @@ pub trait MakeTlsConnect<S> {
|
||||
/// Creates a new `TlsConnect`or.
|
||||
///
|
||||
/// The domain name is provided for certificate verification and SNI.
|
||||
fn make_tls_connect(&self, domain: &str) -> Result<Self::TlsConnect, Self::Error>;
|
||||
fn make_tls_connect(&mut self, domain: &str) -> Result<Self::TlsConnect, Self::Error>;
|
||||
}
|
||||
|
||||
/// An asynchronous function wrapping a stream in a TLS session.
|
||||
@@ -85,7 +85,7 @@ impl<S> MakeTlsConnect<S> for NoTls {
|
||||
type TlsConnect = NoTls;
|
||||
type Error = NoTlsError;
|
||||
|
||||
fn make_tls_connect(&self, _: &str) -> Result<NoTls, NoTlsError> {
|
||||
fn make_tls_connect(&mut self, _: &str) -> Result<NoTls, NoTlsError> {
|
||||
Ok(NoTls)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use std::{env, io};
|
||||
|
||||
use anyhow::{Context, Result, anyhow};
|
||||
use anyhow::{Context, Result};
|
||||
use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
|
||||
use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions};
|
||||
use azure_storage::StorageCredentials;
|
||||
@@ -37,7 +37,6 @@ use crate::metrics::{AttemptOutcome, RequestKind, start_measuring_requests};
|
||||
use crate::{
|
||||
ConcurrencyLimiter, Download, DownloadError, DownloadKind, DownloadOpts, Listing, ListingMode,
|
||||
ListingObject, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
|
||||
Version, VersionKind,
|
||||
};
|
||||
|
||||
pub struct AzureBlobStorage {
|
||||
@@ -406,39 +405,6 @@ impl AzureBlobStorage {
|
||||
pub fn container_name(&self) -> &str {
|
||||
&self.container_name
|
||||
}
|
||||
|
||||
async fn list_versions_with_permit(
|
||||
&self,
|
||||
_permit: &tokio::sync::SemaphorePermit<'_>,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<crate::VersionListing, DownloadError> {
|
||||
let customize_builder = |mut builder: ListBlobsBuilder| {
|
||||
builder = builder.include_versions(true);
|
||||
// We do not return this info back to `VersionListing` yet.
|
||||
builder = builder.include_deleted(true);
|
||||
builder
|
||||
};
|
||||
let kind = RequestKind::ListVersions;
|
||||
|
||||
let mut stream = std::pin::pin!(self.list_streaming_for_fn(
|
||||
prefix,
|
||||
mode,
|
||||
max_keys,
|
||||
cancel,
|
||||
kind,
|
||||
customize_builder
|
||||
));
|
||||
let mut combined: crate::VersionListing =
|
||||
stream.next().await.expect("At least one item required")?;
|
||||
while let Some(list) = stream.next().await {
|
||||
let list = list?;
|
||||
combined.versions.extend(list.versions.into_iter());
|
||||
}
|
||||
Ok(combined)
|
||||
}
|
||||
}
|
||||
|
||||
trait ListingCollector {
|
||||
@@ -522,10 +488,27 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> std::result::Result<crate::VersionListing, DownloadError> {
|
||||
let customize_builder = |mut builder: ListBlobsBuilder| {
|
||||
builder = builder.include_versions(true);
|
||||
builder
|
||||
};
|
||||
let kind = RequestKind::ListVersions;
|
||||
let permit = self.permit(kind, cancel).await?;
|
||||
self.list_versions_with_permit(&permit, prefix, mode, max_keys, cancel)
|
||||
.await
|
||||
|
||||
let mut stream = std::pin::pin!(self.list_streaming_for_fn(
|
||||
prefix,
|
||||
mode,
|
||||
max_keys,
|
||||
cancel,
|
||||
kind,
|
||||
customize_builder
|
||||
));
|
||||
let mut combined: crate::VersionListing =
|
||||
stream.next().await.expect("At least one item required")?;
|
||||
while let Some(list) = stream.next().await {
|
||||
let list = list?;
|
||||
combined.versions.extend(list.versions.into_iter());
|
||||
}
|
||||
Ok(combined)
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
@@ -820,159 +803,14 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: &CancellationToken,
|
||||
_complexity_limit: Option<NonZeroU32>,
|
||||
_prefix: Option<&RemotePath>,
|
||||
_timestamp: SystemTime,
|
||||
_done_if_after: SystemTime,
|
||||
_cancel: &CancellationToken,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
let msg = "PLEASE NOTE: Azure Blob storage time-travel recovery may not work as expected "
|
||||
.to_string()
|
||||
+ "for some specific files. If a file gets deleted but then overwritten and we want to recover "
|
||||
+ "to the time during the file was not present, this functionality will recover the file. Only "
|
||||
+ "use the functionality for services that can tolerate this. For example, recovering a state of the "
|
||||
+ "pageserver tenants.";
|
||||
tracing::error!("{}", msg);
|
||||
|
||||
let kind = RequestKind::TimeTravel;
|
||||
let permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let mode = ListingMode::NoDelimiter;
|
||||
let version_listing = self
|
||||
.list_versions_with_permit(&permit, prefix, mode, None, cancel)
|
||||
.await
|
||||
.map_err(|err| match err {
|
||||
DownloadError::Other(e) => TimeTravelError::Other(e),
|
||||
DownloadError::Cancelled => TimeTravelError::Cancelled,
|
||||
other => TimeTravelError::Other(other.into()),
|
||||
})?;
|
||||
let versions_and_deletes = version_listing.versions;
|
||||
|
||||
tracing::info!(
|
||||
"Built list for time travel with {} versions and deletions",
|
||||
versions_and_deletes.len()
|
||||
);
|
||||
|
||||
// Work on the list of references instead of the objects directly,
|
||||
// otherwise we get lifetime errors in the sort_by_key call below.
|
||||
let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();
|
||||
|
||||
versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified));
|
||||
|
||||
let mut vds_for_key = HashMap::<_, Vec<_>>::new();
|
||||
|
||||
for vd in &versions_and_deletes {
|
||||
let Version { key, .. } = &vd;
|
||||
let version_id = vd.version_id().map(|v| v.0.as_str());
|
||||
if version_id == Some("null") {
|
||||
return Err(TimeTravelError::Other(anyhow!(
|
||||
"Received ListVersions response for key={key} with version_id='null', \
|
||||
indicating either disabled versioning, or legacy objects with null version id values"
|
||||
)));
|
||||
}
|
||||
tracing::trace!("Parsing version key={key} kind={:?}", vd.kind);
|
||||
|
||||
vds_for_key.entry(key).or_default().push(vd);
|
||||
}
|
||||
|
||||
let warn_threshold = 3;
|
||||
let max_retries = 10;
|
||||
let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);
|
||||
|
||||
for (key, versions) in vds_for_key {
|
||||
let last_vd = versions.last().unwrap();
|
||||
let key = self.relative_path_to_name(key);
|
||||
if last_vd.last_modified > done_if_after {
|
||||
tracing::debug!("Key {key} has version later than done_if_after, skipping");
|
||||
continue;
|
||||
}
|
||||
// the version we want to restore to.
|
||||
let version_to_restore_to =
|
||||
match versions.binary_search_by_key(×tamp, |tpl| tpl.last_modified) {
|
||||
Ok(v) => v,
|
||||
Err(e) => e,
|
||||
};
|
||||
if version_to_restore_to == versions.len() {
|
||||
tracing::debug!("Key {key} has no changes since timestamp, skipping");
|
||||
continue;
|
||||
}
|
||||
let mut do_delete = false;
|
||||
if version_to_restore_to == 0 {
|
||||
// All versions more recent, so the key didn't exist at the specified time point.
|
||||
tracing::debug!(
|
||||
"All {} versions more recent for {key}, deleting",
|
||||
versions.len()
|
||||
);
|
||||
do_delete = true;
|
||||
} else {
|
||||
match &versions[version_to_restore_to - 1] {
|
||||
Version {
|
||||
kind: VersionKind::Version(version_id),
|
||||
..
|
||||
} => {
|
||||
let source_url = format!(
|
||||
"{}/{}?versionid={}",
|
||||
self.client
|
||||
.url()
|
||||
.map_err(|e| TimeTravelError::Other(anyhow!("{e}")))?,
|
||||
key,
|
||||
version_id.0
|
||||
);
|
||||
tracing::debug!(
|
||||
"Promoting old version {} for {key} at {}...",
|
||||
version_id.0,
|
||||
source_url
|
||||
);
|
||||
backoff::retry(
|
||||
|| async {
|
||||
let blob_client = self.client.blob_client(key.clone());
|
||||
let op = blob_client.copy(Url::from_str(&source_url).unwrap());
|
||||
tokio::select! {
|
||||
res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
|
||||
_ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
|
||||
}
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"copying object version for time_travel_recover",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| TimeTravelError::Cancelled)
|
||||
.and_then(|x| x)?;
|
||||
tracing::info!(?version_id, %key, "Copied old version in Azure blob storage");
|
||||
}
|
||||
Version {
|
||||
kind: VersionKind::DeletionMarker,
|
||||
..
|
||||
} => {
|
||||
do_delete = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
if do_delete {
|
||||
if matches!(last_vd.kind, VersionKind::DeletionMarker) {
|
||||
// Key has since been deleted (but there was some history), no need to do anything
|
||||
tracing::debug!("Key {key} already deleted, skipping.");
|
||||
} else {
|
||||
tracing::debug!("Deleting {key}...");
|
||||
|
||||
self.delete(&RemotePath::from_string(&key).unwrap(), cancel)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
// delete_oid0 will use TimeoutOrCancel
|
||||
if TimeoutOrCancel::caused_by_cancel(&e) {
|
||||
TimeTravelError::Cancelled
|
||||
} else {
|
||||
TimeTravelError::Other(e)
|
||||
}
|
||||
})?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
// TODO use Azure point in time recovery feature for this
|
||||
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
|
||||
Err(TimeTravelError::Unimplemented)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user