Compare commits
230 Commits
problame/f
...
erik/commu
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f4d51c0f5c | ||
|
|
ec17ae0658 | ||
|
|
9ecce60ded | ||
|
|
e74a957045 | ||
|
|
396a16a3b2 | ||
|
|
7140a50225 | ||
|
|
68f18ccacf | ||
|
|
786888d93f | ||
|
|
255537dda1 | ||
|
|
8b494f6a24 | ||
|
|
28a61741b3 | ||
|
|
2fb6164bf8 | ||
|
|
328f28dfe5 | ||
|
|
95838056da | ||
|
|
6d451654f1 | ||
|
|
37c58522a2 | ||
|
|
6ae4b89000 | ||
|
|
f7ec7668a2 | ||
|
|
038e967daf | ||
|
|
6a43f23eca | ||
|
|
868f194a3b | ||
|
|
9c6c780201 | ||
|
|
6123fe2d5e | ||
|
|
1577665c20 | ||
|
|
d8ebd1d771 | ||
|
|
c8a96cf722 | ||
|
|
56d505bce6 | ||
|
|
dae203ef69 | ||
|
|
1fb1315aed | ||
|
|
838622c594 | ||
|
|
3fd5a94a85 | ||
|
|
e7d6f525b3 | ||
|
|
e4ca3ac745 | ||
|
|
b69d103b90 | ||
|
|
208cbd52d4 | ||
|
|
4b6f02e47d | ||
|
|
c567ed0de0 | ||
|
|
c698cee19a | ||
|
|
4a3f32bf4a | ||
|
|
a963aab14b | ||
|
|
8202c6172f | ||
|
|
5bdba70f7d | ||
|
|
25fffd3a55 | ||
|
|
e00fd45bba | ||
|
|
69a47d789d | ||
|
|
b36f880710 | ||
|
|
745b750f33 | ||
|
|
3b8be98b67 | ||
|
|
3e72edede5 | ||
|
|
a650f7f5af | ||
|
|
fc3994eb71 | ||
|
|
781bf4945d | ||
|
|
a21c1174ed | ||
|
|
8d7ed2a4ee | ||
|
|
5b62749c42 | ||
|
|
af5bb67f08 | ||
|
|
589bfdfd02 | ||
|
|
87179e26b3 | ||
|
|
f05df409bd | ||
|
|
f6c0f6c4ec | ||
|
|
62cd3b8d3d | ||
|
|
8d26978ed9 | ||
|
|
35372a8f12 | ||
|
|
6d95a3fe2d | ||
|
|
99726495c7 | ||
|
|
4a4a457312 | ||
|
|
e78d1e2ec6 | ||
|
|
af429b4a62 | ||
|
|
3b4d4eb535 | ||
|
|
f060537a31 | ||
|
|
8a6fc6fd8c | ||
|
|
f06bb2bbd8 | ||
|
|
51639cd6af | ||
|
|
529d661532 | ||
|
|
9e4cf52949 | ||
|
|
b3c25418a6 | ||
|
|
33549bad1d | ||
|
|
831f2a4ba7 | ||
|
|
eadabeddb8 | ||
|
|
009168d711 | ||
|
|
67ddf1de28 | ||
|
|
541fcd8d2f | ||
|
|
e77961c1c6 | ||
|
|
cdfa06caad | ||
|
|
f0bb93a9c9 | ||
|
|
30adf8e2bd | ||
|
|
5d538a9503 | ||
|
|
f3976e5c60 | ||
|
|
9657fbc194 | ||
|
|
dd501554c9 | ||
|
|
fe1513ca57 | ||
|
|
3e86008e66 | ||
|
|
23fc611461 | ||
|
|
7c9bd542a6 | ||
|
|
dc953de85d | ||
|
|
014823b305 | ||
|
|
af9379ccf6 | ||
|
|
841517ee37 | ||
|
|
1369d73dcd | ||
|
|
7cd0defaf0 | ||
|
|
a082f9814a | ||
|
|
bb28109ffa | ||
|
|
ec991877f4 | ||
|
|
abc6c84262 | ||
|
|
6768a71c86 | ||
|
|
87fc0a0374 | ||
|
|
06ce704041 | ||
|
|
d5023f2b89 | ||
|
|
8ff25dca8e | ||
|
|
cf81330fbc | ||
|
|
e69ae739ff | ||
|
|
136eaeb74a | ||
|
|
211b824d62 | ||
|
|
f9fdbc9618 | ||
|
|
95a5f749c8 | ||
|
|
5db20af8a7 | ||
|
|
136cf1979b | ||
|
|
08bb72e516 | ||
|
|
6f4f3691a5 | ||
|
|
a2b756843e | ||
|
|
f3c9d0adf4 | ||
|
|
2e3dc9a8c2 | ||
|
|
568779fa8a | ||
|
|
e94acbc816 | ||
|
|
f4150614d0 | ||
|
|
60a0bec1c0 | ||
|
|
31fa7a545d | ||
|
|
ac464c5f2c | ||
|
|
0dddb1e373 | ||
|
|
3acb263e62 | ||
|
|
38dbc5f67f | ||
|
|
3685ad606d | ||
|
|
76a7d37f7e | ||
|
|
cdb6479c8a | ||
|
|
81c557d87e | ||
|
|
e963129678 | ||
|
|
4f0a9fc569 | ||
|
|
81c6a5a796 | ||
|
|
8e05639dbf | ||
|
|
deed46015d | ||
|
|
532d9b646e | ||
|
|
55f91cf10b | ||
|
|
baafcc5d41 | ||
|
|
aa22572d8c | ||
|
|
2d247375b3 | ||
|
|
a7ce323949 | ||
|
|
31026d5a3c | ||
|
|
2621ce2daf | ||
|
|
a703cd342b | ||
|
|
42e4cf18c9 | ||
|
|
9e5a41a342 | ||
|
|
48b870bc07 | ||
|
|
32a12783fd | ||
|
|
1e83398cdd | ||
|
|
be8ed81532 | ||
|
|
68120cfa31 | ||
|
|
a8e652d47e | ||
|
|
81fd652151 | ||
|
|
d47e88e353 | ||
|
|
12b08c4b82 | ||
|
|
045ae13e06 | ||
|
|
234c882a07 | ||
|
|
290369061f | ||
|
|
25ab16ee24 | ||
|
|
cfbef4d586 | ||
|
|
34a42b00ca | ||
|
|
a9979620c5 | ||
|
|
a113c48c43 | ||
|
|
827358dd03 | ||
|
|
d367273000 | ||
|
|
e2bad5d9e9 | ||
|
|
9971fba584 | ||
|
|
a77919f4b2 | ||
|
|
5623e4665b | ||
|
|
a618056770 | ||
|
|
307e1e64c8 | ||
|
|
a537b2ffd0 | ||
|
|
8abb4dab6d | ||
|
|
731667ac37 | ||
|
|
6a1374d106 | ||
|
|
f7c908f2f0 | ||
|
|
86671e3a0b | ||
|
|
319cd74f73 | ||
|
|
64353b48db | ||
|
|
79ddc803af | ||
|
|
0efefbf77c | ||
|
|
e6a4171fa1 | ||
|
|
f5070f6aa4 | ||
|
|
3b7cc4234c | ||
|
|
0c25ea9e31 | ||
|
|
33abfc2b74 | ||
|
|
93b964f829 | ||
|
|
d0aaec2abb | ||
|
|
d0dc65da12 | ||
|
|
03d635b916 | ||
|
|
5cd7f936f9 | ||
|
|
101e115b38 | ||
|
|
b37bb7d7ed | ||
|
|
6692321026 | ||
|
|
791df28755 | ||
|
|
d20da994f4 | ||
|
|
6dbbdaae73 | ||
|
|
977bc09d2a | ||
|
|
44269fcd5e | ||
|
|
44cc648dc8 | ||
|
|
884e028a4a | ||
|
|
42df3e5453 | ||
|
|
fc743e284f | ||
|
|
d02f9a2139 | ||
|
|
083118e98e | ||
|
|
54cd2272f1 | ||
|
|
e40193e3c8 | ||
|
|
ce9f7bacc1 | ||
|
|
b7891f8fe8 | ||
|
|
5f2adaa9ad | ||
|
|
3e5e396c8d | ||
|
|
9d781c6fda | ||
|
|
cf5d038472 | ||
|
|
d785100c02 | ||
|
|
2c0d930e3d | ||
|
|
66171a117b | ||
|
|
df2806e7a0 | ||
|
|
07631692db | ||
|
|
4c77397943 | ||
|
|
7bb58be546 | ||
|
|
b5373de208 | ||
|
|
b86c610f42 | ||
|
|
0f520d79ab | ||
|
|
93eb7bb6b8 | ||
|
|
e58d0fece1 |
20
.github/actions/neon-project-create/action.yml
vendored
@@ -49,10 +49,6 @@ inputs:
|
||||
description: 'A JSON object with project settings'
|
||||
required: false
|
||||
default: '{}'
|
||||
default_endpoint_settings:
|
||||
description: 'A JSON object with the default endpoint settings'
|
||||
required: false
|
||||
default: '{}'
|
||||
|
||||
outputs:
|
||||
dsn:
|
||||
@@ -139,21 +135,6 @@ runs:
|
||||
-H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
|
||||
-d "{\"scheduling\": \"Essential\"}"
|
||||
fi
|
||||
# XXX
|
||||
# This is a workaround for the default endpoint settings, which currently do not allow some settings in the public API.
|
||||
# https://github.com/neondatabase/cloud/issues/27108
|
||||
if [[ -n ${DEFAULT_ENDPOINT_SETTINGS} && ${DEFAULT_ENDPOINT_SETTINGS} != "{}" ]] ; then
|
||||
PROJECT_DATA=$(curl -X GET \
|
||||
"https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}" \
|
||||
-H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
|
||||
-d "{\"scheduling\": \"Essential\"}"
|
||||
)
|
||||
NEW_DEFAULT_ENDPOINT_SETTINGS=$(echo ${PROJECT_DATA} | jq -rc ".project.default_endpoint_settings + ${DEFAULT_ENDPOINT_SETTINGS}")
|
||||
curl -X POST --fail \
|
||||
"https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}/default_endpoint_settings" \
|
||||
-H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
|
||||
--data "${NEW_DEFAULT_ENDPOINT_SETTINGS}"
|
||||
fi
|
||||
|
||||
|
||||
env:
|
||||
@@ -171,4 +152,3 @@ runs:
|
||||
PSQL: ${{ inputs.psql_path }}
|
||||
LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }}
|
||||
PROJECT_SETTINGS: ${{ inputs.project_settings }}
|
||||
DEFAULT_ENDPOINT_SETTINGS: ${{ inputs.default_endpoint_settings }}
|
||||
|
||||
22
.github/workflows/_build-and-test-locally.yml
vendored
@@ -279,18 +279,14 @@ jobs:
|
||||
# run all non-pageserver tests
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'
|
||||
|
||||
# run pageserver tests with different settings
|
||||
for get_vectored_concurrent_io in sequential sidecar-task; do
|
||||
for io_engine in std-fs tokio-epoll-uring ; do
|
||||
for io_mode in buffered direct direct-rw ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE=$io_mode \
|
||||
${cov_prefix} \
|
||||
cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
|
||||
done
|
||||
done
|
||||
done
|
||||
# run pageserver tests
|
||||
# (When developing new pageserver features gated by config fields, we commonly make the rust
|
||||
# unit tests sensitive to an environment variable NEON_PAGESERVER_UNIT_TEST_FEATURENAME.
|
||||
# Then run the nextest invocation below for all relevant combinations. Singling out the
|
||||
# pageserver tests from non-pageserver tests cuts down the time it takes for this CI step.)
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=tokio-epoll-uring \
|
||||
${cov_prefix} \
|
||||
cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
@@ -405,8 +401,6 @@ jobs:
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
BUILD_TAG: ${{ inputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
|
||||
PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw
|
||||
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
|
||||
|
||||
# Temporary disable this step until we figure out why it's so flaky
|
||||
|
||||
7
.github/workflows/build_and_test.yml
vendored
@@ -314,7 +314,8 @@ jobs:
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ github.ref_name == 'main' }}
|
||||
extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
|
||||
# test_pageserver_max_throughput_getpage_at_latest_lsn is run in separate workflow periodic_pagebench.yml because it needs snapshots
|
||||
extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} --ignore=test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
|
||||
benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
|
||||
pg_version: v16
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
@@ -323,8 +324,6 @@ jobs:
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
|
||||
PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw
|
||||
SYNC_BETWEEN_TESTS: true
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
@@ -965,7 +964,7 @@ jobs:
|
||||
fi
|
||||
|
||||
- name: Verify docker-compose example and test extensions
|
||||
timeout-minutes: 20
|
||||
timeout-minutes: 60
|
||||
env:
|
||||
TAG: >-
|
||||
${{
|
||||
|
||||
15
.github/workflows/cloud-extensions.yml
vendored
@@ -35,7 +35,7 @@ jobs:
|
||||
matrix:
|
||||
pg-version: [16, 17]
|
||||
|
||||
runs-on: [ self-hosted, small ]
|
||||
runs-on: us-east-2
|
||||
container:
|
||||
# We use the neon-test-extensions image here as it contains the source code for the extensions.
|
||||
image: ghcr.io/neondatabase/neon-test-extensions-v${{ matrix.pg-version }}:latest
|
||||
@@ -71,20 +71,7 @@ jobs:
|
||||
region_id: ${{ inputs.region_id || 'aws-us-east-2' }}
|
||||
postgres_version: ${{ matrix.pg-version }}
|
||||
project_settings: ${{ steps.project-settings.outputs.settings }}
|
||||
# We need these settings to get the expected output results.
|
||||
# We cannot use the environment variables e.g. PGTZ due to
|
||||
# https://github.com/neondatabase/neon/issues/1287
|
||||
default_endpoint_settings: >
|
||||
{
|
||||
"pg_settings": {
|
||||
"DateStyle": "Postgres,MDY",
|
||||
"TimeZone": "America/Los_Angeles",
|
||||
"compute_query_id": "off",
|
||||
"neon.allow_unstable_extensions": "on"
|
||||
}
|
||||
}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }}
|
||||
|
||||
- name: Run the regression tests
|
||||
run: /run-tests.sh -r /ext-src
|
||||
|
||||
4
.github/workflows/neon_extra_builds.yml
vendored
@@ -63,8 +63,10 @@ jobs:
|
||||
|
||||
- name: Filter out only v-string for build matrix
|
||||
id: postgres_changes
|
||||
env:
|
||||
CHANGES: ${{ steps.files_changed.outputs.changes }}
|
||||
run: |
|
||||
v_strings_only_as_json_array=$(echo ${{ steps.files_changed.outputs.chnages }} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c)
|
||||
v_strings_only_as_json_array=$(echo ${CHANGES} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c)
|
||||
echo "changes=${v_strings_only_as_json_array}" | tee -a "${GITHUB_OUTPUT}"
|
||||
|
||||
check-macos-build:
|
||||
|
||||
281
.github/workflows/periodic_pagebench.yml
vendored
@@ -1,4 +1,4 @@
|
||||
name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region
|
||||
name: Periodic pagebench performance test on unit-perf hetzner runner
|
||||
|
||||
on:
|
||||
schedule:
|
||||
@@ -8,7 +8,7 @@ on:
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 */3 * * *' # Runs every 3 hours
|
||||
- cron: '0 */4 * * *' # Runs every 4 hours
|
||||
workflow_dispatch: # Allows manual triggering of the workflow
|
||||
inputs:
|
||||
commit_hash:
|
||||
@@ -16,6 +16,11 @@ on:
|
||||
description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
|
||||
required: false
|
||||
default: ''
|
||||
recreate_snapshots:
|
||||
type: boolean
|
||||
description: 'Recreate snapshots - !!!WARNING!!! We should only recreate snapshots if the previous ones are no longer compatible. Otherwise benchmarking results are not comparable across runs.'
|
||||
required: false
|
||||
default: false
|
||||
|
||||
defaults:
|
||||
run:
|
||||
@@ -29,13 +34,13 @@ permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
trigger_bench_on_ec2_machine_in_eu_central_1:
|
||||
run_periodic_pagebench_test:
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
runs-on: [ self-hosted, small ]
|
||||
runs-on: [ self-hosted, unit-perf ]
|
||||
container:
|
||||
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
|
||||
credentials:
|
||||
@@ -44,10 +49,13 @@ jobs:
|
||||
options: --init
|
||||
timeout-minutes: 360 # Set the timeout to 6 hours
|
||||
env:
|
||||
API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
|
||||
RUN_ID: ${{ github.run_id }}
|
||||
AWS_DEFAULT_REGION : "eu-central-1"
|
||||
AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74"
|
||||
DEFAULT_PG_VERSION: 16
|
||||
BUILD_TYPE: release
|
||||
RUST_BACKTRACE: 1
|
||||
# NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS: 1 - doesn't work without root in container
|
||||
S3_BUCKET: neon-github-public-dev
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
steps:
|
||||
# we don't need the neon source code because we run everything remotely
|
||||
# however we still need the local github actions to run the allure step below
|
||||
@@ -56,99 +64,194 @@ jobs:
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
- name: Set up the environment which depends on $RUNNER_TEMP on nvme drive
|
||||
id: set-env
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
{
|
||||
echo "NEON_DIR=${RUNNER_TEMP}/neon"
|
||||
echo "NEON_BIN=${RUNNER_TEMP}/neon/bin"
|
||||
echo "POSTGRES_DISTRIB_DIR=${RUNNER_TEMP}/neon/pg_install"
|
||||
echo "LD_LIBRARY_PATH=${RUNNER_TEMP}/neon/pg_install/v${DEFAULT_PG_VERSION}/lib"
|
||||
echo "BACKUP_DIR=${RUNNER_TEMP}/instance_store/saved_snapshots"
|
||||
echo "TEST_OUTPUT=${RUNNER_TEMP}/neon/test_output"
|
||||
echo "PERF_REPORT_DIR=${RUNNER_TEMP}/neon/test_output/perf-report-local"
|
||||
echo "ALLURE_DIR=${RUNNER_TEMP}/neon/test_output/allure-results"
|
||||
echo "ALLURE_RESULTS_DIR=${RUNNER_TEMP}/neon/test_output/allure-results/results"
|
||||
} >> "$GITHUB_ENV"
|
||||
|
||||
- name: Show my own (github runner) external IP address - usefull for IP allowlisting
|
||||
run: curl https://ifconfig.me
|
||||
echo "allure_results_dir=${RUNNER_TEMP}/neon/test_output/allure-results/results" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine)
|
||||
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
|
||||
- uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
|
||||
role-duration-seconds: 3600
|
||||
|
||||
- name: Start EC2 instance and wait for the instance to boot up
|
||||
run: |
|
||||
aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
|
||||
aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID
|
||||
sleep 60 # sleep some time to allow cloudinit and our API server to start up
|
||||
|
||||
- name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
|
||||
run: |
|
||||
public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
|
||||
echo "Public IP of the EC2 instance: $public_ip"
|
||||
echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV
|
||||
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # max 5 hours (needed in case commit hash is still being built)
|
||||
- name: Determine commit hash
|
||||
id: commit_hash
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
|
||||
run: |
|
||||
if [ -z "$INPUT_COMMIT_HASH" ]; then
|
||||
echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
|
||||
if [[ -z "${INPUT_COMMIT_HASH}" ]]; then
|
||||
COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')
|
||||
echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
|
||||
echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
|
||||
echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
|
||||
else
|
||||
echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
|
||||
COMMIT_HASH="${INPUT_COMMIT_HASH}"
|
||||
echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
|
||||
echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
|
||||
echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
|
||||
fi
|
||||
- name: Checkout the neon repository at given commit hash
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
ref: ${{ steps.commit_hash.outputs.commit_hash }}
|
||||
|
||||
- name: Start Bench with run_id
|
||||
# does not reuse ./.github/actions/download because we need to download the artifact for the given commit hash
|
||||
# example artifact
|
||||
# s3://neon-github-public-dev/artifacts/48b870bc078bd2c450eb7b468e743b9c118549bf/15036827400/1/neon-Linux-X64-release-artifact.tar.zst /instance_store/artifacts/neon-Linux-release-artifact.tar.zst
|
||||
- name: Determine artifact S3_KEY for given commit hash and download and extract artifact
|
||||
id: artifact_prefix
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
|
||||
COMMIT_HASH: ${{ env.COMMIT_HASH }}
|
||||
COMMIT_HASH_TYPE: ${{ env.COMMIT_HASH_TYPE }}
|
||||
run: |
|
||||
curl -k -X 'POST' \
|
||||
"${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H "Authorization: Bearer $API_KEY" \
|
||||
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}"
|
||||
attempt=0
|
||||
max_attempts=24 # 5 minutes * 24 = 2 hours
|
||||
|
||||
- name: Poll Test Status
|
||||
id: poll_step
|
||||
run: |
|
||||
status=""
|
||||
while [[ "$status" != "failure" && "$status" != "success" ]]; do
|
||||
response=$(curl -k -X 'GET' \
|
||||
"${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
|
||||
-H 'accept: application/json' \
|
||||
-H "Authorization: Bearer $API_KEY")
|
||||
echo "Response: $response"
|
||||
set +x
|
||||
status=$(echo $response | jq -r '.status')
|
||||
echo "Test status: $status"
|
||||
if [[ "$status" == "failure" ]]; then
|
||||
echo "Test failed"
|
||||
exit 1 # Fail the job step if status is failure
|
||||
elif [[ "$status" == "success" || "$status" == "null" ]]; then
|
||||
while [[ $attempt -lt $max_attempts ]]; do
|
||||
# the following command will fail until the artifacts are available ...
|
||||
S3_KEY=$(aws s3api list-objects-v2 --bucket "$S3_BUCKET" --prefix "artifacts/$COMMIT_HASH/" \
|
||||
| jq -r '.Contents[]?.Key' \
|
||||
| grep "neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst" \
|
||||
| sort --version-sort \
|
||||
| tail -1) || true # ... thus ignore errors from the command
|
||||
if [[ -n "${S3_KEY}" ]]; then
|
||||
echo "Artifact found: $S3_KEY"
|
||||
echo "S3_KEY=$S3_KEY" >> $GITHUB_ENV
|
||||
break
|
||||
elif [[ "$status" == "too_many_runs" ]]; then
|
||||
echo "Too many runs already running"
|
||||
echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
sleep 60 # Poll every 60 seconds
|
||||
|
||||
# Increment attempt counter and sleep for 5 minutes
|
||||
attempt=$((attempt + 1))
|
||||
echo "Attempt $attempt of $max_attempts to find artifacts in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH failed. Retrying in 5 minutes..."
|
||||
sleep 300 # Sleep for 5 minutes
|
||||
done
|
||||
|
||||
- name: Retrieve Test Logs
|
||||
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
|
||||
run: |
|
||||
curl -k -X 'GET' \
|
||||
"${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \
|
||||
-H 'accept: application/gzip' \
|
||||
-H "Authorization: Bearer $API_KEY" \
|
||||
--output "test_log_${GITHUB_RUN_ID}.gz"
|
||||
if [[ -z "${S3_KEY}" ]]; then
|
||||
echo "Error: artifact not found in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH" after 2 hours
|
||||
else
|
||||
mkdir -p $(dirname $ARCHIVE)
|
||||
time aws s3 cp --only-show-errors s3://$S3_BUCKET/${S3_KEY} ${ARCHIVE}
|
||||
mkdir -p ${NEON_DIR}
|
||||
time tar -xf ${ARCHIVE} -C ${NEON_DIR}
|
||||
rm -f ${ARCHIVE}
|
||||
fi
|
||||
|
||||
- name: Unzip Test Log and Print it into this job's log
|
||||
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
|
||||
- name: Download snapshots from S3
|
||||
if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.recreate_snapshots == 'false' || github.event.inputs.recreate_snapshots == '' }}
|
||||
id: download_snapshots
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
gzip -d "test_log_${GITHUB_RUN_ID}.gz"
|
||||
cat "test_log_${GITHUB_RUN_ID}"
|
||||
# Download the snapshots from S3
|
||||
mkdir -p ${TEST_OUTPUT}
|
||||
mkdir -p $BACKUP_DIR
|
||||
cd $BACKUP_DIR
|
||||
mkdir parts
|
||||
cd parts
|
||||
PART=$(aws s3api list-objects-v2 --bucket $S3_BUCKET --prefix performance/pagebench/ \
|
||||
| jq -r '.Contents[]?.Key' \
|
||||
| grep -E 'shared-snapshots-[0-9]{4}-[0-9]{2}-[0-9]{2}' \
|
||||
| sort \
|
||||
| tail -1)
|
||||
echo "Latest PART: $PART"
|
||||
if [[ -z "$PART" ]]; then
|
||||
echo "ERROR: No matching S3 key found" >&2
|
||||
exit 1
|
||||
fi
|
||||
S3_KEY=$(dirname $PART)
|
||||
time aws s3 cp --only-show-errors --recursive s3://${S3_BUCKET}/$S3_KEY/ .
|
||||
cd $TEST_OUTPUT
|
||||
time cat $BACKUP_DIR/parts/* | zstdcat | tar --extract --preserve-permissions
|
||||
rm -rf ${BACKUP_DIR}
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: ./scripts/pysync
|
||||
|
||||
# we need high number of open files for pagebench
|
||||
- name: show ulimits
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
ulimit -a
|
||||
|
||||
- name: Run pagebench testcase
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
CI: false # need to override this env variable set by github to enforce using snapshots
|
||||
run: |
|
||||
export PLATFORM=hetzner-unit-perf-${COMMIT_HASH_TYPE}
|
||||
# report the commit hash of the neon repository in the revision of the test results
|
||||
export GITHUB_SHA=${COMMIT_HASH}
|
||||
rm -rf ${PERF_REPORT_DIR}
|
||||
rm -rf ${ALLURE_RESULTS_DIR}
|
||||
mkdir -p ${PERF_REPORT_DIR}
|
||||
mkdir -p ${ALLURE_RESULTS_DIR}
|
||||
PARAMS="--alluredir=${ALLURE_RESULTS_DIR} --tb=short --verbose -rA"
|
||||
EXTRA_PARAMS="--out-dir ${PERF_REPORT_DIR} --durations-path $TEST_OUTPUT/benchmark_durations.json"
|
||||
# run only two selected tests
|
||||
# environment set by parent:
|
||||
# RUST_BACKTRACE=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release
|
||||
./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_throughput_with_n_tenants ${EXTRA_PARAMS}
|
||||
./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant ${EXTRA_PARAMS}
|
||||
|
||||
- name: upload the performance metrics to the Neon performance database which is used by grafana dashboards to display the results
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
export REPORT_FROM="$PERF_REPORT_DIR"
|
||||
export GITHUB_SHA=${COMMIT_HASH}
|
||||
time ./scripts/generate_and_push_perf_report.sh
|
||||
|
||||
- name: Upload test results
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-store
|
||||
with:
|
||||
report-dir: ${{ steps.set-env.outputs.allure_results_dir }}
|
||||
unique-key: ${{ env.BUILD_TYPE }}-${{ env.DEFAULT_PG_VERSION }}-${{ runner.arch }}
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Upload snapshots
|
||||
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.recreate_snapshots != 'false' && github.event.inputs.recreate_snapshots != '' }}
|
||||
id: upload_snapshots
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
mkdir -p $BACKUP_DIR
|
||||
cd $TEST_OUTPUT
|
||||
tar --create --preserve-permissions --file - shared-snapshots | zstd -o $BACKUP_DIR/shared_snapshots.tar.zst
|
||||
cd $BACKUP_DIR
|
||||
mkdir parts
|
||||
split -b 1G shared_snapshots.tar.zst ./parts/shared_snapshots.tar.zst.part.
|
||||
SNAPSHOT_DATE=$(date +%F) # YYYY-MM-DD
|
||||
cd parts
|
||||
time aws s3 cp --recursive . s3://${S3_BUCKET}/performance/pagebench/shared-snapshots-${SNAPSHOT_DATE}/
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1
|
||||
@@ -157,26 +260,22 @@ jobs:
|
||||
slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
|
||||
- name: Cleanup Test Resources
|
||||
if: always()
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
|
||||
run: |
|
||||
curl -k -X 'POST' \
|
||||
"${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \
|
||||
-H 'accept: application/json' \
|
||||
-H "Authorization: Bearer $API_KEY" \
|
||||
-d ''
|
||||
# Cleanup the test resources
|
||||
if [[ -d "${BACKUP_DIR}" ]]; then
|
||||
rm -rf ${BACKUP_DIR}
|
||||
fi
|
||||
if [[ -d "${TEST_OUTPUT}" ]]; then
|
||||
rm -rf ${TEST_OUTPUT}
|
||||
fi
|
||||
if [[ -d "${NEON_DIR}" ]]; then
|
||||
rm -rf ${NEON_DIR}
|
||||
fi
|
||||
rm -rf $(dirname $ARCHIVE)
|
||||
|
||||
- name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine)
|
||||
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
|
||||
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
|
||||
role-duration-seconds: 3600
|
||||
|
||||
- name: Stop EC2 instance and wait for the instance to be stopped
|
||||
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
|
||||
run: |
|
||||
aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID
|
||||
aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID
|
||||
|
||||
1
.gitignore
vendored
@@ -13,6 +13,7 @@ neon.iml
|
||||
/.neon
|
||||
/integration_tests/.neon
|
||||
compaction-suite-results.*
|
||||
pgxn/neon/communicator/communicator_bindings.h
|
||||
|
||||
# Coverage
|
||||
*.profraw
|
||||
|
||||
673
Cargo.lock
generated
34
Cargo.toml
@@ -8,7 +8,9 @@ members = [
|
||||
"pageserver/compaction",
|
||||
"pageserver/ctl",
|
||||
"pageserver/client",
|
||||
"pageserver/client_grpc",
|
||||
"pageserver/pagebench",
|
||||
"pageserver/page_api",
|
||||
"proxy",
|
||||
"safekeeper",
|
||||
"safekeeper/client",
|
||||
@@ -23,12 +25,15 @@ members = [
|
||||
"libs/postgres_ffi",
|
||||
"libs/safekeeper_api",
|
||||
"libs/desim",
|
||||
"libs/neon-shmem",
|
||||
"libs/utils",
|
||||
"libs/consumption_metrics",
|
||||
"libs/postgres_backend",
|
||||
"libs/posthog_client_lite",
|
||||
"libs/pq_proto",
|
||||
"libs/tenant_size_model",
|
||||
"libs/metrics",
|
||||
"libs/neonart",
|
||||
"libs/postgres_connection",
|
||||
"libs/remote_storage",
|
||||
"libs/tracing-utils",
|
||||
@@ -41,6 +46,7 @@ members = [
|
||||
"libs/proxy/postgres-types2",
|
||||
"libs/proxy/tokio-postgres2",
|
||||
"endpoint_storage",
|
||||
"pgxn/neon/communicator",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -84,6 +90,7 @@ clap = { version = "4.0", features = ["derive", "env"] }
|
||||
clashmap = { version = "1.0", features = ["raw-api"] }
|
||||
comfy-table = "7.1"
|
||||
const_format = "0.2"
|
||||
crossbeam-utils = "0.8.21"
|
||||
crc32c = "0.6"
|
||||
diatomic-waker = { version = "0.2.3" }
|
||||
either = "1.8"
|
||||
@@ -126,7 +133,7 @@ md5 = "0.7.0"
|
||||
measured = { version = "0.0.22", features=["lasso"] }
|
||||
measured-process = { version = "0.0.22" }
|
||||
memoffset = "0.9"
|
||||
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
|
||||
nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] }
|
||||
# Do not update to >= 7.0.0, at least. The update will have a significant impact
|
||||
# on compute startup metrics (start_postgres_ms), >= 25% degradation.
|
||||
notify = "6.0.0"
|
||||
@@ -142,11 +149,12 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "53"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pem = "3.0.3"
|
||||
peekable = "0.3.0"
|
||||
pin-project-lite = "0.2"
|
||||
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
|
||||
procfs = "0.16"
|
||||
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
|
||||
prost = "0.13"
|
||||
prost = "0.13.5"
|
||||
rand = "0.8"
|
||||
redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] }
|
||||
regex = "1.10.2"
|
||||
@@ -176,6 +184,7 @@ smallvec = "1.11"
|
||||
smol_str = { version = "0.2.0", features = ["serde"] }
|
||||
socket2 = "0.5"
|
||||
spki = "0.7.3"
|
||||
spin = "0.9.8"
|
||||
strum = "0.26"
|
||||
strum_macros = "0.26"
|
||||
"subtle" = "2.5.0"
|
||||
@@ -187,16 +196,16 @@ thiserror = "1.0"
|
||||
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
|
||||
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
|
||||
tokio = { version = "1.43.1", features = ["macros"] }
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.12.0"
|
||||
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
|
||||
tokio-stream = "0.1"
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
|
||||
toml = "0.8"
|
||||
toml_edit = "0.22"
|
||||
tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
|
||||
tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "gzip", "prost", "router", "server", "tls-ring", "tls-native-roots"] }
|
||||
tonic-reflection = { version = "0.13.1", features = ["server"] }
|
||||
tower = { version = "0.5.2", default-features = false }
|
||||
tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
|
||||
|
||||
@@ -228,6 +237,9 @@ x509-cert = { version = "0.2.5" }
|
||||
env_logger = "0.11"
|
||||
log = "0.4"
|
||||
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
@@ -243,40 +255,46 @@ azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rus
|
||||
## Local libraries
|
||||
compute_api = { version = "0.1", path = "./libs/compute_api/" }
|
||||
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
|
||||
desim = { version = "0.1", path = "./libs/desim" }
|
||||
endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
|
||||
http-utils = { version = "0.1", path = "./libs/http-utils/" }
|
||||
metrics = { version = "0.1", path = "./libs/metrics/" }
|
||||
neonart = { version = "0.1", path = "./libs/neonart/" }
|
||||
neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
|
||||
pageserver = { path = "./pageserver" }
|
||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
||||
pageserver_client = { path = "./pageserver/client" }
|
||||
pageserver_client_grpc = { path = "./pageserver/client_grpc" }
|
||||
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
|
||||
pageserver_page_api = { path = "./pageserver/page_api" }
|
||||
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
|
||||
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
|
||||
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
|
||||
postgres_initdb = { path = "./libs/postgres_initdb" }
|
||||
posthog_client_lite = { version = "0.1", path = "./libs/posthog_client_lite" }
|
||||
pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
|
||||
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
|
||||
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
|
||||
safekeeper_client = { path = "./safekeeper/client" }
|
||||
desim = { version = "0.1", path = "./libs/desim" }
|
||||
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
|
||||
storage_controller_client = { path = "./storage_controller/client" }
|
||||
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
|
||||
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
||||
utils = { version = "0.1", path = "./libs/utils/" }
|
||||
vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
|
||||
walproposer = { version = "0.1", path = "./libs/walproposer/" }
|
||||
wal_decoder = { version = "0.1", path = "./libs/wal_decoder" }
|
||||
walproposer = { version = "0.1", path = "./libs/walproposer/" }
|
||||
|
||||
## Common library dependency
|
||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
|
||||
## Build dependencies
|
||||
cbindgen = "0.28.0"
|
||||
criterion = "0.5.1"
|
||||
rcgen = "0.13"
|
||||
rstest = "0.18"
|
||||
camino-tempfile = "1.0.2"
|
||||
tonic-build = "0.12"
|
||||
tonic-build = "0.13.1"
|
||||
|
||||
[patch.crates-io]
|
||||
|
||||
|
||||
7
Makefile
@@ -18,10 +18,12 @@ ifeq ($(BUILD_TYPE),release)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
# Unfortunately, `--profile=...` is a nightly feature
|
||||
CARGO_BUILD_FLAGS += --release
|
||||
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
|
||||
else ifeq ($(BUILD_TYPE),debug)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
|
||||
PG_CFLAGS += -O0 -g3 $(CFLAGS)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
|
||||
else
|
||||
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
|
||||
endif
|
||||
@@ -180,11 +182,16 @@ postgres-check-%: postgres-%
|
||||
|
||||
.PHONY: neon-pg-ext-%
|
||||
neon-pg-ext-%: postgres-%
|
||||
+@echo "Compiling communicator $*"
|
||||
$(CARGO_CMD_PREFIX) cargo build -p communicator $(CARGO_BUILD_FLAGS)
|
||||
|
||||
+@echo "Compiling neon $*"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
LIBCOMMUNICATOR_PATH=$(NEON_CARGO_ARTIFACT_TARGET_DIR) \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
|
||||
|
||||
+@echo "Compiling neon_walredo $*"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
|
||||
@@ -155,7 +155,7 @@ RUN set -e \
|
||||
|
||||
# Keep the version the same as in compute/compute-node.Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py.
|
||||
ENV SQL_EXPORTER_VERSION=0.17.0
|
||||
ENV SQL_EXPORTER_VERSION=0.17.3
|
||||
RUN curl -fsSL \
|
||||
"https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
|
||||
--output sql_exporter.tar.gz \
|
||||
@@ -292,7 +292,7 @@ WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.86.0
|
||||
ENV RUSTC_VERSION=1.87.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
ARG RUSTFILT_VERSION=0.2.1
|
||||
@@ -310,13 +310,13 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
. "$HOME/.cargo/env" && \
|
||||
cargo --version && rustup --version && \
|
||||
rustup component add llvm-tools rustfmt clippy && \
|
||||
cargo install rustfilt --version ${RUSTFILT_VERSION} && \
|
||||
cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \
|
||||
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
|
||||
cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \
|
||||
cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \
|
||||
cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \
|
||||
cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} \
|
||||
cargo install rustfilt --version ${RUSTFILT_VERSION} --locked && \
|
||||
cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} --locked && \
|
||||
cargo install cargo-deny --version ${CARGO_DENY_VERSION} --locked && \
|
||||
cargo install cargo-hack --version ${CARGO_HACK_VERSION} --locked && \
|
||||
cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} --locked && \
|
||||
cargo install cargo-chef --version ${CARGO_CHEF_VERSION} --locked && \
|
||||
cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} --locked \
|
||||
--features postgres-bundled --no-default-features && \
|
||||
rm -rf /home/nonroot/.cargo/registry && \
|
||||
rm -rf /home/nonroot/.cargo/git
|
||||
|
||||
@@ -297,6 +297,7 @@ RUN ./autogen.sh && \
|
||||
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make staged-install && \
|
||||
cd extensions/postgis && \
|
||||
make clean && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
@@ -582,6 +583,38 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "online_advisor-build"
|
||||
# compile online_advisor extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS online_advisor-src
|
||||
ARG PG_VERSION
|
||||
|
||||
# online_advisor supports all Postgres version starting from PG14, but prior to PG17 has to be included in preload_shared_libraries
|
||||
# last release 1.0 - May 15, 2025
|
||||
WORKDIR /ext-src
|
||||
RUN case "${PG_VERSION:?}" in \
|
||||
"v17") \
|
||||
;; \
|
||||
*) \
|
||||
echo "skipping the version of online_advistor for $PG_VERSION" && exit 0 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \
|
||||
echo "37dcadf8f7cc8d6cc1f8831276ee245b44f1b0274f09e511e47a67738ba9ed0f online_advisor.tar.gz" | sha256sum --check && \
|
||||
mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C .
|
||||
|
||||
FROM pg-build AS online_advisor-build
|
||||
COPY --from=online_advisor-src /ext-src/ /ext-src/
|
||||
WORKDIR /ext-src/
|
||||
RUN if [ -d online_advisor-src ]; then \
|
||||
cd online_advisor-src && \
|
||||
make -j install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/online_advisor.control; \
|
||||
fi
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg_hashids-build"
|
||||
@@ -1117,8 +1150,8 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.
|
||||
mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
|
||||
echo "#nothing to test here" > neon-test.sh
|
||||
|
||||
RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.1.tar.gz -O pgrag.tar.gz && \
|
||||
echo "087b2ecd11ba307dc968042ef2e9e43dc04d9ba60e8306e882c407bbe1350a50 pgrag.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.2.tar.gz -O pgrag.tar.gz && \
|
||||
echo "7361654ea24f08cbb9db13c2ee1c0fe008f6114076401bb871619690dafc5225 pgrag.tar.gz" | sha256sum --check && \
|
||||
mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C .
|
||||
|
||||
FROM rust-extensions-build-pgrx14 AS pgrag-build
|
||||
@@ -1148,14 +1181,14 @@ RUN cd exts/rag && \
|
||||
RUN cd exts/rag_bge_small_en_v15 && \
|
||||
sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/pgrag-data/bge_small_en_v15.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control
|
||||
|
||||
RUN cd exts/rag_jina_reranker_v1_tiny_en && \
|
||||
sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/pgrag-data/jina_reranker_v1_tiny_en.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control
|
||||
|
||||
@@ -1648,6 +1681,7 @@ COPY --from=pg_jsonschema-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_graphql-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_tiktoken-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=hypopg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=online_advisor-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_hashids-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rum-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgtap-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
@@ -1751,17 +1785,17 @@ ARG TARGETARCH
|
||||
RUN if [ "$TARGETARCH" = "amd64" ]; then\
|
||||
postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\
|
||||
pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
|
||||
sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\
|
||||
sql_exporter_sha256='9a41127a493e8bfebfe692bf78c7ed2872a58a3f961ee534d1b0da9ae584aaab';\
|
||||
else\
|
||||
postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\
|
||||
pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\
|
||||
sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\
|
||||
sql_exporter_sha256='530e6afc77c043497ed965532c4c9dfa873bc2a4f0b3047fad367715c0081d6a';\
|
||||
fi\
|
||||
&& curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\
|
||||
| tar xzf - --strip-components=1 -C.\
|
||||
&& curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\
|
||||
| tar xzf - --strip-components=1 -C.\
|
||||
&& curl -sL https://github.com/burningalchemist/sql_exporter/releases/download/0.17.0/sql_exporter-0.17.0.linux-${TARGETARCH}.tar.gz\
|
||||
&& curl -sL https://github.com/burningalchemist/sql_exporter/releases/download/0.17.3/sql_exporter-0.17.3.linux-${TARGETARCH}.tar.gz\
|
||||
| tar xzf - --strip-components=1 -C.\
|
||||
&& echo "${postgres_exporter_sha256} postgres_exporter" | sha256sum -c -\
|
||||
&& echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\
|
||||
@@ -1809,12 +1843,27 @@ RUN make PG_VERSION="${PG_VERSION:?}" -C compute
|
||||
|
||||
FROM pg-build AS extension-tests
|
||||
ARG PG_VERSION
|
||||
# This is required for the PostGIS test
|
||||
RUN apt-get update && case $DEBIAN_VERSION in \
|
||||
bullseye) \
|
||||
apt-get install -y libproj19 libgdal28 time; \
|
||||
;; \
|
||||
bookworm) \
|
||||
apt-get install -y libgdal32 libproj25 time; \
|
||||
;; \
|
||||
*) \
|
||||
echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \
|
||||
;; \
|
||||
esac
|
||||
|
||||
COPY docker-compose/ext-src/ /ext-src/
|
||||
|
||||
COPY --from=pg-build /postgres /postgres
|
||||
#COPY --from=postgis-src /ext-src/ /ext-src/
|
||||
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=postgis-build /ext-src/postgis-src /ext-src/postgis-src
|
||||
COPY --from=postgis-build /sfcgal/* /usr
|
||||
COPY --from=plv8-src /ext-src/ /ext-src/
|
||||
#COPY --from=h3-pg-src /ext-src/ /ext-src/
|
||||
COPY --from=h3-pg-src /ext-src/h3-pg-src /ext-src/h3-pg-src
|
||||
COPY --from=postgresql-unit-src /ext-src/ /ext-src/
|
||||
COPY --from=pgvector-src /ext-src/ /ext-src/
|
||||
COPY --from=pgjwt-src /ext-src/ /ext-src/
|
||||
@@ -1823,6 +1872,7 @@ COPY --from=pgjwt-src /ext-src/ /ext-src/
|
||||
COPY --from=pg_graphql-src /ext-src/ /ext-src/
|
||||
#COPY --from=pg_tiktoken-src /ext-src/ /ext-src/
|
||||
COPY --from=hypopg-src /ext-src/ /ext-src/
|
||||
COPY --from=online_advisor-src /ext-src/ /ext-src/
|
||||
COPY --from=pg_hashids-src /ext-src/ /ext-src/
|
||||
COPY --from=rum-src /ext-src/ /ext-src/
|
||||
COPY --from=pgtap-src /ext-src/ /ext-src/
|
||||
@@ -1852,6 +1902,7 @@ COPY compute/patches/pg_repack.patch /ext-src
|
||||
RUN cd /ext-src/pg_repack-src && patch -p1 </ext-src/pg_repack.patch && rm -f /ext-src/pg_repack.patch
|
||||
|
||||
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
|
||||
RUN echo /usr/local/pgsql/lib > /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig
|
||||
RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq \
|
||||
&& apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/*
|
||||
ENV PATH=/usr/local/pgsql/bin:$PATH
|
||||
@@ -1971,7 +2022,8 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
|
||||
|
||||
# Make the libraries we built available
|
||||
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
|
||||
COPY --chmod=0666 compute/etc/ld.so.conf.d/00-neon.conf /etc/ld.so.conf.d/00-neon.conf
|
||||
RUN /sbin/ldconfig
|
||||
|
||||
# rsyslog config permissions
|
||||
# directory for rsyslogd pid file
|
||||
|
||||
1
compute/etc/ld.so.conf.d/00-neon.conf
Normal file
@@ -0,0 +1 @@
|
||||
/usr/local/lib
|
||||
121
compute/manifest.yaml
Normal file
@@ -0,0 +1,121 @@
|
||||
pg_settings:
|
||||
# Common settings for primaries and replicas of all versions.
|
||||
common:
|
||||
# Check for client disconnection every 1 minute. By default, Postgres will detect the
|
||||
# loss of the connection only at the next interaction with the socket, when it waits
|
||||
# for, receives or sends data, so it will likely waste resources till the end of the
|
||||
# query execution. There should be no drawbacks in setting this for everyone, so enable
|
||||
# it by default. If anyone will complain, we can allow editing it.
|
||||
# https://www.postgresql.org/docs/16/runtime-config-connection.html#GUC-CLIENT-CONNECTION-CHECK-INTERVAL
|
||||
client_connection_check_interval: "60000" # 1 minute
|
||||
# ---- IO ----
|
||||
effective_io_concurrency: "20"
|
||||
maintenance_io_concurrency: "100"
|
||||
fsync: "off"
|
||||
hot_standby: "off"
|
||||
# We allow users to change this if needed, but by default we
|
||||
# just don't want to see long-lasting idle transactions, as they
|
||||
# prevent activity monitor from suspending projects.
|
||||
idle_in_transaction_session_timeout: "300000" # 5 minutes
|
||||
listen_addresses: "*"
|
||||
# --- LOGGING ---- helps investigations
|
||||
log_connections: "on"
|
||||
log_disconnections: "on"
|
||||
# 1GB, unit is KB
|
||||
log_temp_files: "1048576"
|
||||
# Disable dumping customer data to logs, both to increase data privacy
|
||||
# and to reduce the amount the logs.
|
||||
log_error_verbosity: "terse"
|
||||
log_min_error_statement: "panic"
|
||||
max_connections: "100"
|
||||
# --- WAL ---
|
||||
# - flush lag is the max amount of WAL that has been generated but not yet stored
|
||||
# to disk in the page server. A smaller value means less delay after a pageserver
|
||||
# restart, but if you set it too small you might again need to slow down writes if the
|
||||
# pageserver cannot flush incoming WAL to disk fast enough. This must be larger
|
||||
# than the pageserver's checkpoint interval, currently 1 GB! Otherwise you get a
|
||||
# a deadlock where the compute node refuses to generate more WAL before the
|
||||
# old WAL has been uploaded to S3, but the pageserver is waiting for more WAL
|
||||
# to be generated before it is uploaded to S3.
|
||||
max_replication_flush_lag: "10GB"
|
||||
max_replication_slots: "10"
|
||||
# Backpressure configuration:
|
||||
# - write lag is the max amount of WAL that has been generated by Postgres but not yet
|
||||
# processed by the page server. Making this smaller reduces the worst case latency
|
||||
# of a GetPage request, if you request a page that was recently modified. On the other
|
||||
# hand, if this is too small, the compute node might need to wait on a write if there is a
|
||||
# hiccup in the network or page server so that the page server has temporarily fallen
|
||||
# behind.
|
||||
#
|
||||
# Previously it was set to 500 MB, but it caused compute being unresponsive under load
|
||||
# https://github.com/neondatabase/neon/issues/2028
|
||||
max_replication_write_lag: "500MB"
|
||||
max_wal_senders: "10"
|
||||
# A Postgres checkpoint is cheap in storage, as doesn't involve any significant amount
|
||||
# of real I/O. Only the SLRU buffers and some other small files are flushed to disk.
|
||||
# However, as long as we have full_page_writes=on, page updates after a checkpoint
|
||||
# include full-page images which bloats the WAL. So may want to bump max_wal_size to
|
||||
# reduce the WAL bloating, but at the same it will increase pg_wal directory size on
|
||||
# compute and can lead to out of disk error on k8s nodes.
|
||||
max_wal_size: "1024"
|
||||
wal_keep_size: "0"
|
||||
wal_level: "replica"
|
||||
# Reduce amount of WAL generated by default.
|
||||
wal_log_hints: "off"
|
||||
# - without wal_sender_timeout set we don't get feedback messages,
|
||||
# required for backpressure.
|
||||
wal_sender_timeout: "10000"
|
||||
# We have some experimental extensions, which we don't want users to install unconsciously.
|
||||
# To install them, users would need to set the `neon.allow_unstable_extensions` setting.
|
||||
# There are two of them currently:
|
||||
# - `pgrag` - https://github.com/neondatabase-labs/pgrag - extension is actually called just `rag`,
|
||||
# and two dependencies:
|
||||
# - `rag_bge_small_en_v15`
|
||||
# - `rag_jina_reranker_v1_tiny_en`
|
||||
# - `pg_mooncake` - https://github.com/Mooncake-Labs/pg_mooncake/
|
||||
neon.unstable_extensions: "rag,rag_bge_small_en_v15,rag_jina_reranker_v1_tiny_en,pg_mooncake,anon"
|
||||
neon.protocol_version: "3"
|
||||
password_encryption: "scram-sha-256"
|
||||
# This is important to prevent Postgres from trying to perform
|
||||
# a local WAL redo after backend crash. It should exit and let
|
||||
# the systemd or k8s to do a fresh startup with compute_ctl.
|
||||
restart_after_crash: "off"
|
||||
# By default 3. We have the following persistent connections in the VM:
|
||||
# * compute_activity_monitor (from compute_ctl)
|
||||
# * postgres-exporter (metrics collector; it has 2 connections)
|
||||
# * sql_exporter (metrics collector; we have 2 instances [1 for us & users; 1 for autoscaling])
|
||||
# * vm-monitor (to query & change file cache size)
|
||||
# i.e. total of 6. Let's reserve 7, so there's still at least one left over.
|
||||
superuser_reserved_connections: "7"
|
||||
synchronous_standby_names: "walproposer"
|
||||
|
||||
replica:
|
||||
hot_standby: "on"
|
||||
|
||||
per_version:
|
||||
17:
|
||||
common:
|
||||
# PostgreSQL 17 has a new IO system called "read stream", which can combine IOs up to some
|
||||
# size. It still has some issues with readahead, though, so we default to disabled/
|
||||
# "no combining of IOs" to make sure we get the maximum prefetch depth.
|
||||
# See also: https://github.com/neondatabase/neon/pull/9860
|
||||
io_combine_limit: "1"
|
||||
replica:
|
||||
# prefetching of blocks referenced in WAL doesn't make sense for us
|
||||
# Neon hot standby ignores pages that are not in the shared_buffers
|
||||
recovery_prefetch: "off"
|
||||
16:
|
||||
common:
|
||||
replica:
|
||||
# prefetching of blocks referenced in WAL doesn't make sense for us
|
||||
# Neon hot standby ignores pages that are not in the shared_buffers
|
||||
recovery_prefetch: "off"
|
||||
15:
|
||||
common:
|
||||
replica:
|
||||
# prefetching of blocks referenced in WAL doesn't make sense for us
|
||||
# Neon hot standby ignores pages that are not in the shared_buffers
|
||||
recovery_prefetch: "off"
|
||||
14:
|
||||
common:
|
||||
replica:
|
||||
@@ -7,7 +7,7 @@ index 255e616..1c6edb7 100644
|
||||
RelationGetRelationName(index));
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_start_unlogged_build(index->rd_smgr);
|
||||
+ smgr_start_unlogged_build(RelationGetSmgr(index));
|
||||
+#endif
|
||||
+
|
||||
initRumState(&buildstate.rumstate, index);
|
||||
@@ -18,7 +18,7 @@ index 255e616..1c6edb7 100644
|
||||
rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_finish_unlogged_build_phase_1(index->rd_smgr);
|
||||
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
@@ -29,7 +29,7 @@ index 255e616..1c6edb7 100644
|
||||
}
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_end_unlogged_build(index->rd_smgr);
|
||||
+ smgr_end_unlogged_build(RelationGetSmgr(index));
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
|
||||
@@ -28,6 +28,7 @@ flate2.workspace = true
|
||||
futures.workspace = true
|
||||
http.workspace = true
|
||||
indexmap.workspace = true
|
||||
itertools.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
metrics.workspace = true
|
||||
nix.workspace = true
|
||||
@@ -37,6 +38,7 @@ once_cell.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
opentelemetry_sdk.workspace = true
|
||||
p256 = { version = "0.13", features = ["pem"] }
|
||||
pageserver_page_api.workspace = true
|
||||
postgres.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
@@ -52,6 +54,7 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tonic.workspace = true
|
||||
tower-otel.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
|
||||
@@ -40,7 +40,7 @@ use std::sync::mpsc;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::{Context, Result, bail};
|
||||
use clap::Parser;
|
||||
use compute_api::responses::ComputeConfig;
|
||||
use compute_tools::compute::{
|
||||
@@ -57,31 +57,15 @@ use tracing::{error, info};
|
||||
use url::Url;
|
||||
use utils::failpoint_support;
|
||||
|
||||
// Compatibility hack: if the control plane specified any remote-ext-config
|
||||
// use the default value for extension storage proxy gateway.
|
||||
// Remove this once the control plane is updated to pass the gateway URL
|
||||
fn parse_remote_ext_base_url(arg: &str) -> Result<String> {
|
||||
const FALLBACK_PG_EXT_GATEWAY_BASE_URL: &str =
|
||||
"http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local";
|
||||
|
||||
Ok(if arg.starts_with("http") {
|
||||
arg
|
||||
} else {
|
||||
FALLBACK_PG_EXT_GATEWAY_BASE_URL
|
||||
}
|
||||
.to_owned())
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
#[derive(Debug, Parser)]
|
||||
#[command(rename_all = "kebab-case")]
|
||||
struct Cli {
|
||||
#[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
|
||||
pub pgbin: String,
|
||||
|
||||
/// The base URL for the remote extension storage proxy gateway.
|
||||
/// Should be in the form of `http(s)://<gateway-hostname>[:<port>]`.
|
||||
#[arg(short = 'r', long, value_parser = parse_remote_ext_base_url, alias = "remote-ext-config")]
|
||||
pub remote_ext_base_url: Option<String>,
|
||||
#[arg(short = 'r', long, value_parser = Self::parse_remote_ext_base_url)]
|
||||
pub remote_ext_base_url: Option<Url>,
|
||||
|
||||
/// The port to bind the external listening HTTP server to. Clients running
|
||||
/// outside the compute will talk to the compute through this port. Keep
|
||||
@@ -136,6 +120,29 @@ struct Cli {
|
||||
requires = "compute-id"
|
||||
)]
|
||||
pub control_plane_uri: Option<String>,
|
||||
|
||||
/// Interval in seconds for collecting installed extensions statistics
|
||||
#[arg(long, default_value = "3600")]
|
||||
pub installed_extensions_collection_interval: u64,
|
||||
}
|
||||
|
||||
impl Cli {
|
||||
/// Parse a URL from an argument. By default, this isn't necessary, but we
|
||||
/// want to do some sanity checking.
|
||||
fn parse_remote_ext_base_url(value: &str) -> Result<Url> {
|
||||
// Remove extra trailing slashes, and add one. We use Url::join() later
|
||||
// when downloading remote extensions. If the base URL is something like
|
||||
// http://example.com/pg-ext-s3-gateway, and join() is called with
|
||||
// something like "xyz", the resulting URL is http://example.com/xyz.
|
||||
let value = value.trim_end_matches('/').to_owned() + "/";
|
||||
let url = Url::parse(&value)?;
|
||||
|
||||
if url.query_pairs().count() != 0 {
|
||||
bail!("parameters detected in remote extensions base URL")
|
||||
}
|
||||
|
||||
Ok(url)
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
@@ -179,6 +186,7 @@ fn main() -> Result<()> {
|
||||
cgroup: cli.cgroup,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor_addr: cli.vm_monitor_addr,
|
||||
installed_extensions_collection_interval: cli.installed_extensions_collection_interval,
|
||||
},
|
||||
config,
|
||||
)?;
|
||||
@@ -263,7 +271,8 @@ fn handle_exit_signal(sig: i32) {
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use clap::CommandFactory;
|
||||
use clap::{CommandFactory, Parser};
|
||||
use url::Url;
|
||||
|
||||
use super::Cli;
|
||||
|
||||
@@ -273,16 +282,41 @@ mod test {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_pg_ext_gateway_base_url() {
|
||||
let arg = "http://pg-ext-s3-gateway2";
|
||||
let result = super::parse_remote_ext_base_url(arg).unwrap();
|
||||
assert_eq!(result, arg);
|
||||
|
||||
let arg = "pg-ext-s3-gateway";
|
||||
let result = super::parse_remote_ext_base_url(arg).unwrap();
|
||||
fn verify_remote_ext_base_url() {
|
||||
let cli = Cli::parse_from([
|
||||
"compute_ctl",
|
||||
"--pgdata=test",
|
||||
"--connstr=test",
|
||||
"--compute-id=test",
|
||||
"--remote-ext-base-url",
|
||||
"https://example.com/subpath",
|
||||
]);
|
||||
assert_eq!(
|
||||
result,
|
||||
"http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local"
|
||||
cli.remote_ext_base_url.unwrap(),
|
||||
Url::parse("https://example.com/subpath/").unwrap()
|
||||
);
|
||||
|
||||
let cli = Cli::parse_from([
|
||||
"compute_ctl",
|
||||
"--pgdata=test",
|
||||
"--connstr=test",
|
||||
"--compute-id=test",
|
||||
"--remote-ext-base-url",
|
||||
"https://example.com//",
|
||||
]);
|
||||
assert_eq!(
|
||||
cli.remote_ext_base_url.unwrap(),
|
||||
Url::parse("https://example.com").unwrap()
|
||||
);
|
||||
|
||||
Cli::try_parse_from([
|
||||
"compute_ctl",
|
||||
"--pgdata=test",
|
||||
"--connstr=test",
|
||||
"--compute-id=test",
|
||||
"--remote-ext-base-url",
|
||||
"https://example.com?hello=world",
|
||||
])
|
||||
.expect_err("URL parameters are not allowed");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -339,6 +339,8 @@ async fn run_dump_restore(
|
||||
destination_connstring: String,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
let dumpdir = workdir.join("dumpdir");
|
||||
let num_jobs = num_cpus::get().to_string();
|
||||
info!("using {num_jobs} jobs for dump/restore");
|
||||
|
||||
let common_args = [
|
||||
// schema mapping (prob suffices to specify them on one side)
|
||||
@@ -354,7 +356,7 @@ async fn run_dump_restore(
|
||||
"directory".to_string(),
|
||||
// concurrency
|
||||
"--jobs".to_string(),
|
||||
num_cpus::get().to_string(),
|
||||
num_jobs,
|
||||
// progress updates
|
||||
"--verbose".to_string(),
|
||||
];
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::{Context, Result, anyhow};
|
||||
use chrono::{DateTime, Utc};
|
||||
use compute_api::privilege::Privilege;
|
||||
use compute_api::responses::{
|
||||
ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
|
||||
LfcPrewarmState,
|
||||
LfcPrewarmState, TlsConfig,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
|
||||
@@ -11,14 +11,16 @@ use compute_api::spec::{
|
||||
use futures::StreamExt;
|
||||
use futures::future::join_all;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use itertools::Itertools;
|
||||
use nix::sys::signal::{Signal, kill};
|
||||
use nix::unistd::Pid;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_page_api as page_api;
|
||||
use postgres;
|
||||
use postgres::NoTls;
|
||||
use postgres::error::SqlState;
|
||||
use remote_storage::{DownloadError, RemotePath};
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::net::SocketAddr;
|
||||
use std::os::unix::fs::{PermissionsExt, symlink};
|
||||
use std::path::Path;
|
||||
@@ -28,8 +30,11 @@ use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Condvar, Mutex, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{env, fs};
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio::spawn;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tracing::{Instrument, debug, error, info, instrument, warn};
|
||||
use url::Url;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
@@ -95,7 +100,10 @@ pub struct ComputeNodeParams {
|
||||
pub internal_http_port: u16,
|
||||
|
||||
/// the address of extension storage proxy gateway
|
||||
pub remote_ext_base_url: Option<String>,
|
||||
pub remote_ext_base_url: Option<Url>,
|
||||
|
||||
/// Interval for installed extensions collection
|
||||
pub installed_extensions_collection_interval: u64,
|
||||
}
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
@@ -364,7 +372,7 @@ impl ComputeNode {
|
||||
|
||||
let mut new_state = ComputeState::new();
|
||||
if let Some(spec) = config.spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow!(msg))?;
|
||||
new_state.pspec = Some(pspec);
|
||||
}
|
||||
|
||||
@@ -391,7 +399,7 @@ impl ComputeNode {
|
||||
// because QEMU will already have its memory allocated from the host, and
|
||||
// the necessary binaries will already be cached.
|
||||
if cli_spec.is_none() {
|
||||
this.prewarm_postgres()?;
|
||||
this.prewarm_postgres_vm_memory()?;
|
||||
}
|
||||
|
||||
// Set the up metric with Empty status before starting the HTTP server.
|
||||
@@ -598,6 +606,8 @@ impl ComputeNode {
|
||||
});
|
||||
}
|
||||
|
||||
let tls_config = self.tls_config(&pspec.spec);
|
||||
|
||||
// If there are any remote extensions in shared_preload_libraries, start downloading them
|
||||
if pspec.spec.remote_extensions.is_some() {
|
||||
let (this, spec) = (self.clone(), pspec.spec.clone());
|
||||
@@ -654,7 +664,7 @@ impl ComputeNode {
|
||||
info!("tuning pgbouncer");
|
||||
|
||||
let pgbouncer_settings = pgbouncer_settings.clone();
|
||||
let tls_config = self.compute_ctl_config.tls.clone();
|
||||
let tls_config = tls_config.clone();
|
||||
|
||||
// Spawn a background task to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
@@ -673,7 +683,10 @@ impl ComputeNode {
|
||||
|
||||
// Spawn a background task to do the configuration,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let local_proxy = local_proxy.clone();
|
||||
|
||||
let mut local_proxy = local_proxy.clone();
|
||||
local_proxy.tls = tls_config.clone();
|
||||
|
||||
let _handle = tokio::spawn(async move {
|
||||
if let Err(err) = local_proxy::configure(&local_proxy) {
|
||||
error!("error while configuring local_proxy: {err:?}");
|
||||
@@ -694,25 +707,18 @@ impl ComputeNode {
|
||||
let log_directory_path = Path::new(&self.params.pgdata).join("log");
|
||||
let log_directory_path = log_directory_path.to_string_lossy().to_string();
|
||||
|
||||
// Add project_id,endpoint_id tag to identify the logs.
|
||||
// Add project_id,endpoint_id to identify the logs.
|
||||
//
|
||||
// These ids are passed from cplane,
|
||||
// for backwards compatibility (old computes that don't have them),
|
||||
// we set them to None.
|
||||
// TODO: Clean up this code when all computes have them.
|
||||
let tag: Option<String> = match (
|
||||
pspec.spec.project_id.as_deref(),
|
||||
pspec.spec.endpoint_id.as_deref(),
|
||||
) {
|
||||
(Some(project_id), Some(endpoint_id)) => {
|
||||
Some(format!("{project_id}/{endpoint_id}"))
|
||||
}
|
||||
(Some(project_id), None) => Some(format!("{project_id}/None")),
|
||||
(None, Some(endpoint_id)) => Some(format!("None,{endpoint_id}")),
|
||||
(None, None) => None,
|
||||
};
|
||||
let endpoint_id = pspec.spec.endpoint_id.as_deref().unwrap_or("");
|
||||
let project_id = pspec.spec.project_id.as_deref().unwrap_or("");
|
||||
|
||||
configure_audit_rsyslog(log_directory_path.clone(), tag, &remote_endpoint)?;
|
||||
configure_audit_rsyslog(
|
||||
log_directory_path.clone(),
|
||||
endpoint_id,
|
||||
project_id,
|
||||
&remote_endpoint,
|
||||
)?;
|
||||
|
||||
// Launch a background task to clean up the audit logs
|
||||
launch_pgaudit_gc(log_directory_path);
|
||||
@@ -748,17 +754,7 @@ impl ComputeNode {
|
||||
|
||||
let conf = self.get_tokio_conn_conf(None);
|
||||
tokio::task::spawn(async {
|
||||
let res = get_installed_extensions(conf).await;
|
||||
match res {
|
||||
Ok(extensions) => {
|
||||
info!(
|
||||
"[NEON_EXT_STAT] {}",
|
||||
serde_json::to_string(&extensions)
|
||||
.expect("failed to serialize extensions list")
|
||||
);
|
||||
}
|
||||
Err(err) => error!("could not get installed extensions: {err:?}"),
|
||||
}
|
||||
let _ = installed_extensions(conf).await;
|
||||
});
|
||||
}
|
||||
|
||||
@@ -788,7 +784,10 @@ impl ComputeNode {
|
||||
// Log metrics so that we can search for slow operations in logs
|
||||
info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished");
|
||||
|
||||
if pspec.spec.prewarm_lfc_on_startup {
|
||||
// Spawn the extension stats background task
|
||||
self.spawn_extension_stats_task();
|
||||
|
||||
if pspec.spec.autoprewarm {
|
||||
self.prewarm_lfc();
|
||||
}
|
||||
Ok(())
|
||||
@@ -945,6 +944,77 @@ impl ComputeNode {
|
||||
#[instrument(skip_all, fields(%lsn))]
|
||||
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
|
||||
match Url::parse(shard0_connstr)?.scheme() {
|
||||
"postgres" | "postgresql" => self.try_get_basebackup_libpq(spec, lsn),
|
||||
"grpc" => self.try_get_basebackup_grpc(spec, lsn),
|
||||
scheme => return Err(anyhow!("unknown URL scheme {scheme}")),
|
||||
}
|
||||
}
|
||||
|
||||
fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<()> {
|
||||
let start_time = Instant::now();
|
||||
|
||||
let shard0_connstr = spec
|
||||
.pageserver_connstr
|
||||
.split(',')
|
||||
.next()
|
||||
.unwrap()
|
||||
.to_string();
|
||||
|
||||
let chunks = tokio::runtime::Handle::current().block_on(async move {
|
||||
let mut client = page_api::proto::PageServiceClient::connect(shard0_connstr).await?;
|
||||
|
||||
let req = page_api::proto::GetBaseBackupRequest {
|
||||
read_lsn: Some(page_api::proto::ReadLsn {
|
||||
request_lsn: lsn.0,
|
||||
not_modified_since_lsn: 0,
|
||||
}),
|
||||
replica: false, // TODO: handle replicas, with LSN 0
|
||||
};
|
||||
let mut req = tonic::Request::new(req);
|
||||
let metadata = req.metadata_mut();
|
||||
metadata.insert("neon-tenant-id", spec.tenant_id.to_string().parse()?);
|
||||
metadata.insert("neon-timeline-id", spec.timeline_id.to_string().parse()?);
|
||||
metadata.insert("neon-shard-id", "0000".to_string().parse()?); // TODO: shard count
|
||||
if let Some(auth) = spec.storage_auth_token.as_ref() {
|
||||
metadata.insert("authorization", format!("Bearer {auth}").parse()?);
|
||||
}
|
||||
|
||||
let chunks = client.get_base_backup(req).await?.into_inner();
|
||||
anyhow::Ok(chunks)
|
||||
})?;
|
||||
let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
|
||||
|
||||
// Convert the chunks stream into an AsyncRead
|
||||
let stream_reader = StreamReader::new(
|
||||
chunks.map(|chunk| chunk.map(|c| c.chunk).map_err(std::io::Error::other)),
|
||||
);
|
||||
|
||||
// Wrap the AsyncRead into a blocking reader for compatibility with tar::Archive
|
||||
let reader = tokio_util::io::SyncIoBridge::new(stream_reader);
|
||||
let mut measured_reader = MeasuredReader::new(reader);
|
||||
let mut bufreader = std::io::BufReader::new(&mut measured_reader);
|
||||
|
||||
// Read the archive directly from the `CopyOutReader`
|
||||
//
|
||||
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
||||
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
||||
// sends an Error after finishing the tarball, we will not notice it.
|
||||
let mut ar = tar::Archive::new(&mut bufreader);
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.params.pgdata)?;
|
||||
|
||||
// Report metrics
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.pageserver_connect_micros = pageserver_connect_micros;
|
||||
state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
|
||||
state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<()> {
|
||||
let start_time = Instant::now();
|
||||
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
@@ -960,12 +1030,10 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
config.application_name("compute_ctl");
|
||||
if let Some(spec) = &compute_state.pspec {
|
||||
config.options(&format!(
|
||||
"-c neon.compute_mode={}",
|
||||
spec.spec.mode.to_type_str()
|
||||
));
|
||||
}
|
||||
config.options(&format!(
|
||||
"-c neon.compute_mode={}",
|
||||
spec.spec.mode.to_type_str()
|
||||
));
|
||||
|
||||
// Connect to pageserver
|
||||
let mut client = config.connect(NoTls)?;
|
||||
@@ -1039,10 +1107,7 @@ impl ComputeNode {
|
||||
return result;
|
||||
}
|
||||
Err(ref e) if attempts < max_attempts => {
|
||||
warn!(
|
||||
"Failed to get basebackup: {} (attempt {}/{})",
|
||||
e, attempts, max_attempts
|
||||
);
|
||||
warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
|
||||
std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
|
||||
retry_period_ms *= 1.5;
|
||||
}
|
||||
@@ -1214,13 +1279,15 @@ impl ComputeNode {
|
||||
let spec = &pspec.spec;
|
||||
let pgdata_path = Path::new(&self.params.pgdata);
|
||||
|
||||
let tls_config = self.tls_config(&pspec.spec);
|
||||
|
||||
// Remove/create an empty pgdata directory and put configuration there.
|
||||
self.create_pgdata()?;
|
||||
config::write_postgres_conf(
|
||||
pgdata_path,
|
||||
&pspec.spec,
|
||||
self.params.internal_http_port,
|
||||
&self.compute_ctl_config.tls,
|
||||
tls_config,
|
||||
)?;
|
||||
|
||||
// Syncing safekeepers is only safe with primary nodes: if a primary
|
||||
@@ -1316,8 +1383,8 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
/// Start and stop a postgres process to warm up the VM for startup.
|
||||
pub fn prewarm_postgres(&self) -> Result<()> {
|
||||
info!("prewarming");
|
||||
pub fn prewarm_postgres_vm_memory(&self) -> Result<()> {
|
||||
info!("prewarming VM memory");
|
||||
|
||||
// Create pgdata
|
||||
let pgdata = &format!("{}.warmup", self.params.pgdata);
|
||||
@@ -1359,7 +1426,7 @@ impl ComputeNode {
|
||||
kill(pm_pid, Signal::SIGQUIT)?;
|
||||
info!("sent SIGQUIT signal");
|
||||
pg.wait()?;
|
||||
info!("done prewarming");
|
||||
info!("done prewarming vm memory");
|
||||
|
||||
// clean up
|
||||
let _ok = fs::remove_dir_all(pgdata);
|
||||
@@ -1545,14 +1612,22 @@ impl ComputeNode {
|
||||
.clone(),
|
||||
);
|
||||
|
||||
let mut tls_config = None::<TlsConfig>;
|
||||
if spec.features.contains(&ComputeFeature::TlsExperimental) {
|
||||
tls_config = self.compute_ctl_config.tls.clone();
|
||||
}
|
||||
|
||||
let max_concurrent_connections = self.max_service_connections(compute_state, &spec);
|
||||
|
||||
// Merge-apply spec & changes to PostgreSQL state.
|
||||
self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?;
|
||||
|
||||
if let Some(local_proxy) = &spec.clone().local_proxy_config {
|
||||
let mut local_proxy = local_proxy.clone();
|
||||
local_proxy.tls = tls_config.clone();
|
||||
|
||||
info!("configuring local_proxy");
|
||||
local_proxy::configure(local_proxy).context("apply_config local_proxy")?;
|
||||
local_proxy::configure(&local_proxy).context("apply_config local_proxy")?;
|
||||
}
|
||||
|
||||
// Run migrations separately to not hold up cold starts
|
||||
@@ -1604,11 +1679,13 @@ impl ComputeNode {
|
||||
pub fn reconfigure(&self) -> Result<()> {
|
||||
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
|
||||
|
||||
let tls_config = self.tls_config(&spec);
|
||||
|
||||
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
|
||||
info!("tuning pgbouncer");
|
||||
|
||||
let pgbouncer_settings = pgbouncer_settings.clone();
|
||||
let tls_config = self.compute_ctl_config.tls.clone();
|
||||
let tls_config = tls_config.clone();
|
||||
|
||||
// Spawn a background task to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
@@ -1626,7 +1703,7 @@ impl ComputeNode {
|
||||
// Spawn a background task to do the configuration,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let mut local_proxy = local_proxy.clone();
|
||||
local_proxy.tls = self.compute_ctl_config.tls.clone();
|
||||
local_proxy.tls = tls_config.clone();
|
||||
tokio::spawn(async move {
|
||||
if let Err(err) = local_proxy::configure(&local_proxy) {
|
||||
error!("error while configuring local_proxy: {err:?}");
|
||||
@@ -1644,7 +1721,7 @@ impl ComputeNode {
|
||||
pgdata_path,
|
||||
&spec,
|
||||
self.params.internal_http_port,
|
||||
&self.compute_ctl_config.tls,
|
||||
tls_config,
|
||||
)?;
|
||||
|
||||
if !spec.skip_pg_catalog_updates {
|
||||
@@ -1764,6 +1841,14 @@ impl ComputeNode {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn tls_config(&self, spec: &ComputeSpec) -> &Option<TlsConfig> {
|
||||
if spec.features.contains(&ComputeFeature::TlsExperimental) {
|
||||
&self.compute_ctl_config.tls
|
||||
} else {
|
||||
&None::<TlsConfig>
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the `last_active` in the shared state, but ensure that it's a more recent one.
|
||||
pub fn update_last_active(&self, last_active: Option<DateTime<Utc>>) {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
@@ -1900,7 +1985,7 @@ LIMIT 100",
|
||||
self.params
|
||||
.remote_ext_base_url
|
||||
.as_ref()
|
||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||
.ok_or(DownloadError::BadInput(anyhow!(
|
||||
"Remote extensions storage is not configured",
|
||||
)))?;
|
||||
|
||||
@@ -1995,23 +2080,40 @@ LIMIT 100",
|
||||
tokio::spawn(conn);
|
||||
|
||||
// TODO: support other types of grants apart from schemas?
|
||||
let query = format!(
|
||||
"GRANT {} ON SCHEMA {} TO {}",
|
||||
privileges
|
||||
.iter()
|
||||
// should not be quoted as it's part of the command.
|
||||
// is already sanitized so it's ok
|
||||
.map(|p| p.as_str())
|
||||
.collect::<Vec<&'static str>>()
|
||||
.join(", "),
|
||||
// quote the schema and role name as identifiers to sanitize them.
|
||||
schema_name.pg_quote(),
|
||||
role_name.pg_quote(),
|
||||
);
|
||||
db_client
|
||||
.simple_query(&query)
|
||||
|
||||
// check the role grants first - to gracefully handle read-replicas.
|
||||
let select = "SELECT privilege_type
|
||||
FROM pg_namespace
|
||||
JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true
|
||||
JOIN pg_user users ON acl.grantee = users.usesysid
|
||||
WHERE users.usename = $1
|
||||
AND nspname = $2";
|
||||
let rows = db_client
|
||||
.query(select, &[role_name, schema_name])
|
||||
.await
|
||||
.with_context(|| format!("Failed to execute query: {}", query))?;
|
||||
.with_context(|| format!("Failed to execute query: {select}"))?;
|
||||
|
||||
let already_granted: HashSet<String> = rows.into_iter().map(|row| row.get(0)).collect();
|
||||
|
||||
let grants = privileges
|
||||
.iter()
|
||||
.filter(|p| !already_granted.contains(p.as_str()))
|
||||
// should not be quoted as it's part of the command.
|
||||
// is already sanitized so it's ok
|
||||
.map(|p| p.as_str())
|
||||
.join(", ");
|
||||
|
||||
if !grants.is_empty() {
|
||||
// quote the schema and role name as identifiers to sanitize them.
|
||||
let schema_name = schema_name.pg_quote();
|
||||
let role_name = role_name.pg_quote();
|
||||
|
||||
let query = format!("GRANT {grants} ON SCHEMA {schema_name} TO {role_name}",);
|
||||
db_client
|
||||
.simple_query(&query)
|
||||
.await
|
||||
.with_context(|| format!("Failed to execute query: {}", query))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -2079,7 +2181,7 @@ LIMIT 100",
|
||||
let remote_extensions = spec
|
||||
.remote_extensions
|
||||
.as_ref()
|
||||
.ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;
|
||||
.ok_or(anyhow!("Remote extensions are not configured"))?;
|
||||
|
||||
info!("parse shared_preload_libraries from spec.cluster.settings");
|
||||
let mut libs_vec = Vec::new();
|
||||
@@ -2181,6 +2283,41 @@ LIMIT 100",
|
||||
info!("Pageserver config changed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn spawn_extension_stats_task(&self) {
|
||||
let conf = self.tokio_conn_conf.clone();
|
||||
let installed_extensions_collection_interval =
|
||||
self.params.installed_extensions_collection_interval;
|
||||
tokio::spawn(async move {
|
||||
// An initial sleep is added to ensure that two collections don't happen at the same time.
|
||||
// The first collection happens during compute startup.
|
||||
tokio::time::sleep(tokio::time::Duration::from_secs(
|
||||
installed_extensions_collection_interval,
|
||||
))
|
||||
.await;
|
||||
let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(
|
||||
installed_extensions_collection_interval,
|
||||
));
|
||||
loop {
|
||||
interval.tick().await;
|
||||
let _ = installed_extensions(conf.clone()).await;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> {
|
||||
let res = get_installed_extensions(conf).await;
|
||||
match res {
|
||||
Ok(extensions) => {
|
||||
info!(
|
||||
"[NEON_EXT_STAT] {}",
|
||||
serde_json::to_string(&extensions).expect("failed to serialize extensions list")
|
||||
);
|
||||
}
|
||||
Err(err) => error!("could not get installed extensions: {err:?}"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn forward_termination_signal() {
|
||||
|
||||
@@ -224,7 +224,10 @@ pub fn write_postgres_conf(
|
||||
writeln!(file, "pgaudit.log_rotation_age=5")?;
|
||||
|
||||
// Enable audit logs for pg_session_jwt extension
|
||||
writeln!(file, "pg_session_jwt.audit_log=on")?;
|
||||
// TODO: Consider a good approach for shipping pg_session_jwt logs to the same sink as
|
||||
// pgAudit - additional context in https://github.com/neondatabase/cloud/issues/28863
|
||||
//
|
||||
// writeln!(file, "pg_session_jwt.audit_log=on")?;
|
||||
|
||||
// Add audit shared_preload_libraries, if they are not present.
|
||||
//
|
||||
|
||||
@@ -2,10 +2,24 @@
|
||||
module(load="imfile")
|
||||
|
||||
# Input configuration for log files in the specified directory
|
||||
# Replace {log_directory} with the directory containing the log files
|
||||
input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0")
|
||||
# The messages can be multiline. The start of the message is a timestamp
|
||||
# in "%Y-%m-%d %H:%M:%S.%3N GMT" (so timezone hardcoded).
|
||||
# Replace log_directory with the directory containing the log files
|
||||
input(type="imfile" File="{log_directory}/*.log"
|
||||
Tag="pgaudit_log" Severity="info" Facility="local5"
|
||||
startmsg.regex="^[[:digit:]]{{4}}-[[:digit:]]{{2}}-[[:digit:]]{{2}} [[:digit:]]{{2}}:[[:digit:]]{{2}}:[[:digit:]]{{2}}.[[:digit:]]{{3}} GMT,")
|
||||
|
||||
# the directory to store rsyslog state files
|
||||
global(workDirectory="/var/log/rsyslog")
|
||||
|
||||
# Forward logs to remote syslog server
|
||||
*.* @@{remote_endpoint}
|
||||
# Construct json, endpoint_id and project_id as additional metadata
|
||||
set $.json_log!endpoint_id = "{endpoint_id}";
|
||||
set $.json_log!project_id = "{project_id}";
|
||||
set $.json_log!msg = $msg;
|
||||
|
||||
# Template suitable for rfc5424 syslog format
|
||||
template(name="PgAuditLog" type="string"
|
||||
string="<%PRI%>1 %TIMESTAMP:::date-rfc3339% %HOSTNAME% - - - - %$.json_log%")
|
||||
|
||||
# Forward to remote syslog receiver (@@<hostname>:<port>;format
|
||||
local5.info @@{remote_endpoint};PgAuditLog
|
||||
|
||||
@@ -83,6 +83,7 @@ use reqwest::StatusCode;
|
||||
use tar::Archive;
|
||||
use tracing::info;
|
||||
use tracing::log::warn;
|
||||
use url::Url;
|
||||
use zstd::stream::read::Decoder;
|
||||
|
||||
use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};
|
||||
@@ -158,7 +159,7 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion {
|
||||
pub async fn download_extension(
|
||||
ext_name: &str,
|
||||
ext_path: &RemotePath,
|
||||
remote_ext_base_url: &str,
|
||||
remote_ext_base_url: &Url,
|
||||
pgbin: &str,
|
||||
) -> Result<u64> {
|
||||
info!("Download extension {:?} from {:?}", ext_name, ext_path);
|
||||
@@ -270,10 +271,14 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
|
||||
}
|
||||
|
||||
// Do request to extension storage proxy, e.g.,
|
||||
// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
|
||||
// curl http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/latest/v15/extensions/anon.tar.zst
|
||||
// using HTTP GET and return the response body as bytes.
|
||||
async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result<Bytes> {
|
||||
let uri = format!("{}/{}", remote_ext_base_url, ext_path);
|
||||
async fn download_extension_tar(remote_ext_base_url: &Url, ext_path: &str) -> Result<Bytes> {
|
||||
let uri = remote_ext_base_url.join(ext_path).with_context(|| {
|
||||
format!(
|
||||
"failed to create the remote extension URI for {ext_path} using {remote_ext_base_url}"
|
||||
)
|
||||
})?;
|
||||
let filename = Path::new(ext_path)
|
||||
.file_name()
|
||||
.unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
|
||||
@@ -283,7 +288,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re
|
||||
|
||||
info!("Downloading extension file '{}' from uri {}", filename, uri);
|
||||
|
||||
match do_extension_server_request(&uri).await {
|
||||
match do_extension_server_request(uri).await {
|
||||
Ok(resp) => {
|
||||
info!("Successfully downloaded remote extension data {}", ext_path);
|
||||
REMOTE_EXT_REQUESTS_TOTAL
|
||||
@@ -302,7 +307,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re
|
||||
|
||||
// Do a single remote extensions server request.
|
||||
// Return result or (error message + stringified status code) in case of any failures.
|
||||
async fn do_extension_server_request(uri: &str) -> Result<Bytes, (String, String)> {
|
||||
async fn do_extension_server_request(uri: Url) -> Result<Bytes, (String, String)> {
|
||||
let resp = reqwest::get(uri).await.map_err(|e| {
|
||||
(
|
||||
format!(
|
||||
|
||||
@@ -48,11 +48,9 @@ impl JsonResponse {
|
||||
|
||||
/// Create an error response related to the compute being in an invalid state
|
||||
pub(self) fn invalid_status(status: ComputeStatus) -> Response {
|
||||
Self::create_response(
|
||||
Self::error(
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
&GenericAPIError {
|
||||
error: format!("invalid compute status: {status}"),
|
||||
},
|
||||
format!("invalid compute status: {status}"),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@ pub(in crate::http) async fn configure(
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
request: Json<ConfigurationRequest>,
|
||||
) -> Response {
|
||||
let pspec = match ParsedSpec::try_from(request.spec.clone()) {
|
||||
let pspec = match ParsedSpec::try_from(request.0.spec) {
|
||||
Ok(p) => p,
|
||||
Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
|
||||
};
|
||||
|
||||
@@ -13,6 +13,12 @@ use crate::metrics::{PG_CURR_DOWNTIME_MS, PG_TOTAL_DOWNTIME_MS};
|
||||
|
||||
const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
|
||||
|
||||
/// Struct to store runtime state of the compute monitor thread.
|
||||
/// In theory, this could be a part of `Compute`, but i)
|
||||
/// this state is expected to be accessed only by single thread,
|
||||
/// so we don't need to care about locking; ii) `Compute` is
|
||||
/// already quite big. Thus, it seems to be a good idea to keep
|
||||
/// all the activity/health monitoring parts here.
|
||||
struct ComputeMonitor {
|
||||
compute: Arc<ComputeNode>,
|
||||
|
||||
@@ -70,12 +76,36 @@ impl ComputeMonitor {
|
||||
)
|
||||
}
|
||||
|
||||
/// Check if compute is in some terminal or soon-to-be-terminal
|
||||
/// state, then return `true`, signalling the caller that it
|
||||
/// should exit gracefully. Otherwise, return `false`.
|
||||
fn check_interrupts(&mut self) -> bool {
|
||||
let compute_status = self.compute.get_status();
|
||||
if matches!(
|
||||
compute_status,
|
||||
ComputeStatus::Terminated | ComputeStatus::TerminationPending | ComputeStatus::Failed
|
||||
) {
|
||||
info!(
|
||||
"compute is in {} status, stopping compute monitor",
|
||||
compute_status
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Spin in a loop and figure out the last activity time in the Postgres.
|
||||
/// Then update it in the shared state. This function never errors out.
|
||||
/// Then update it in the shared state. This function currently never
|
||||
/// errors out explicitly, but there is a graceful termination path.
|
||||
/// Every time we receive an error trying to check Postgres, we use
|
||||
/// [`ComputeMonitor::check_interrupts()`] because it could be that
|
||||
/// compute is being terminated already, then we can exit gracefully
|
||||
/// to not produce errors' noise in the log.
|
||||
/// NB: the only expected panic is at `Mutex` unwrap(), all other errors
|
||||
/// should be handled gracefully.
|
||||
#[instrument(skip_all)]
|
||||
pub fn run(&mut self) {
|
||||
pub fn run(&mut self) -> anyhow::Result<()> {
|
||||
// Suppose that `connstr` doesn't change
|
||||
let connstr = self.compute.params.connstr.clone();
|
||||
let conf = self
|
||||
@@ -93,6 +123,10 @@ impl ComputeMonitor {
|
||||
info!("starting compute monitor for {}", connstr);
|
||||
|
||||
loop {
|
||||
if self.check_interrupts() {
|
||||
break;
|
||||
}
|
||||
|
||||
match &mut client {
|
||||
Ok(cli) => {
|
||||
if cli.is_closed() {
|
||||
@@ -100,6 +134,10 @@ impl ComputeMonitor {
|
||||
downtime_info = self.downtime_info(),
|
||||
"connection to Postgres is closed, trying to reconnect"
|
||||
);
|
||||
if self.check_interrupts() {
|
||||
break;
|
||||
}
|
||||
|
||||
self.report_down();
|
||||
|
||||
// Connection is closed, reconnect and try again.
|
||||
@@ -111,15 +149,19 @@ impl ComputeMonitor {
|
||||
self.compute.update_last_active(self.last_active);
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
downtime_info = self.downtime_info(),
|
||||
"could not check Postgres: {}", e
|
||||
);
|
||||
if self.check_interrupts() {
|
||||
break;
|
||||
}
|
||||
|
||||
// Although we have many places where we can return errors in `check()`,
|
||||
// normally it shouldn't happen. I.e., we will likely return error if
|
||||
// connection got broken, query timed out, Postgres returned invalid data, etc.
|
||||
// In all such cases it's suspicious, so let's report this as downtime.
|
||||
self.report_down();
|
||||
error!(
|
||||
downtime_info = self.downtime_info(),
|
||||
"could not check Postgres: {}", e
|
||||
);
|
||||
|
||||
// Reconnect to Postgres just in case. During tests, I noticed
|
||||
// that queries in `check()` can fail with `connection closed`,
|
||||
@@ -136,6 +178,10 @@ impl ComputeMonitor {
|
||||
downtime_info = self.downtime_info(),
|
||||
"could not connect to Postgres: {}, retrying", e
|
||||
);
|
||||
if self.check_interrupts() {
|
||||
break;
|
||||
}
|
||||
|
||||
self.report_down();
|
||||
|
||||
// Establish a new connection and try again.
|
||||
@@ -147,6 +193,9 @@ impl ComputeMonitor {
|
||||
self.last_checked = Utc::now();
|
||||
thread::sleep(MONITOR_CHECK_INTERVAL);
|
||||
}
|
||||
|
||||
// Graceful termination path
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
@@ -429,7 +478,10 @@ pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
|
||||
.spawn(move || {
|
||||
let span = span!(Level::INFO, "compute_monitor");
|
||||
let _enter = span.enter();
|
||||
monitor.run();
|
||||
match monitor.run() {
|
||||
Ok(_) => info!("compute monitor thread terminated gracefully"),
|
||||
Err(err) => error!("compute monitor thread terminated abnormally {:?}", err),
|
||||
}
|
||||
})
|
||||
.expect("cannot launch compute monitor thread")
|
||||
}
|
||||
|
||||
@@ -213,8 +213,10 @@ impl Escaping for PgIdent {
|
||||
|
||||
// Find the first suitable tag that is not present in the string.
|
||||
// Postgres' max role/DB name length is 63 bytes, so even in the
|
||||
// worst case it won't take long.
|
||||
while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
|
||||
// worst case it won't take long. Outer tag is always `tag + "x"`,
|
||||
// so if `tag` is not present in the string, `outer_tag` is not
|
||||
// present in the string either.
|
||||
while self.contains(&tag.to_string()) {
|
||||
tag += "x";
|
||||
outer_tag = tag.clone() + "x";
|
||||
}
|
||||
|
||||
@@ -27,6 +27,40 @@ fn get_rsyslog_pid() -> Option<String> {
|
||||
}
|
||||
}
|
||||
|
||||
fn wait_for_rsyslog_pid() -> Result<String, anyhow::Error> {
|
||||
const MAX_WAIT: Duration = Duration::from_secs(5);
|
||||
const INITIAL_SLEEP: Duration = Duration::from_millis(2);
|
||||
|
||||
let mut sleep_duration = INITIAL_SLEEP;
|
||||
let start = std::time::Instant::now();
|
||||
let mut attempts = 1;
|
||||
|
||||
for attempt in 1.. {
|
||||
attempts = attempt;
|
||||
match get_rsyslog_pid() {
|
||||
Some(pid) => return Ok(pid),
|
||||
None => {
|
||||
if start.elapsed() >= MAX_WAIT {
|
||||
break;
|
||||
}
|
||||
info!(
|
||||
"rsyslogd is not running, attempt {}. Sleeping for {} ms",
|
||||
attempt,
|
||||
sleep_duration.as_millis()
|
||||
);
|
||||
std::thread::sleep(sleep_duration);
|
||||
sleep_duration *= 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(anyhow::anyhow!(
|
||||
"rsyslogd is not running after waiting for {} seconds and {} attempts",
|
||||
attempts,
|
||||
start.elapsed().as_secs()
|
||||
))
|
||||
}
|
||||
|
||||
// Restart rsyslogd to apply the new configuration.
|
||||
// This is necessary, because there is no other way to reload the rsyslog configuration.
|
||||
//
|
||||
@@ -36,27 +70,29 @@ fn get_rsyslog_pid() -> Option<String> {
|
||||
// TODO: test it properly
|
||||
//
|
||||
fn restart_rsyslog() -> Result<()> {
|
||||
let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?;
|
||||
info!("rsyslogd is running with pid: {}, restart it", old_pid);
|
||||
|
||||
// kill it to restart
|
||||
let _ = Command::new("pkill")
|
||||
.arg("rsyslogd")
|
||||
.output()
|
||||
.context("Failed to stop rsyslogd")?;
|
||||
.context("Failed to restart rsyslogd")?;
|
||||
|
||||
// ensure rsyslogd is running
|
||||
wait_for_rsyslog_pid()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn configure_audit_rsyslog(
|
||||
log_directory: String,
|
||||
tag: Option<String>,
|
||||
endpoint_id: &str,
|
||||
project_id: &str,
|
||||
remote_endpoint: &str,
|
||||
) -> Result<()> {
|
||||
let config_content: String = format!(
|
||||
include_str!("config_template/compute_audit_rsyslog_template.conf"),
|
||||
log_directory = log_directory,
|
||||
tag = tag.unwrap_or("".to_string()),
|
||||
endpoint_id = endpoint_id,
|
||||
project_id = project_id,
|
||||
remote_endpoint = remote_endpoint
|
||||
);
|
||||
|
||||
@@ -131,15 +167,11 @@ pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// When new config is empty we can simply remove the configuration file.
|
||||
// Nothing to configure
|
||||
if new_config.is_empty() {
|
||||
info!("removing rsyslog config file: {}", POSTGRES_LOGS_CONF_PATH);
|
||||
match std::fs::remove_file(POSTGRES_LOGS_CONF_PATH) {
|
||||
Ok(_) => {}
|
||||
Err(err) if err.kind() == ErrorKind::NotFound => {}
|
||||
Err(err) => return Err(err.into()),
|
||||
}
|
||||
restart_rsyslog()?;
|
||||
// When the configuration is removed, PostgreSQL will stop sending data
|
||||
// to the files watched by rsyslog, so restarting rsyslog is more effort
|
||||
// than just ignoring this change.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
|
||||
@@ -30,7 +30,7 @@ mod pg_helpers_tests {
|
||||
r#"fsync = off
|
||||
wal_level = logical
|
||||
hot_standby = on
|
||||
prewarm_lfc_on_startup = off
|
||||
autoprewarm = off
|
||||
neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
|
||||
wal_log_hints = on
|
||||
log_connections = on
|
||||
@@ -71,6 +71,14 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
|
||||
("name$$$", ("$x$name$$$$x$", "xx")),
|
||||
("name$$$$", ("$x$name$$$$$x$", "xx")),
|
||||
("name$x$", ("$xx$name$x$$xx$", "xxx")),
|
||||
("x", ("$xx$x$xx$", "xxx")),
|
||||
("xx", ("$xxx$xx$xxx$", "xxxx")),
|
||||
("$x", ("$xx$$x$xx$", "xxx")),
|
||||
("x$", ("$xx$x$$xx$", "xxx")),
|
||||
("$x$", ("$xx$$x$$xx$", "xxx")),
|
||||
("xx$", ("$xxx$xx$$xxx$", "xxxx")),
|
||||
("$xx", ("$xxx$$xx$xxx$", "xxxx")),
|
||||
("$xx$", ("$xxx$$xx$$xxx$", "xxxx")),
|
||||
];
|
||||
|
||||
for (input, expected) in test_cases {
|
||||
|
||||
@@ -2,8 +2,10 @@
|
||||
[pageserver]
|
||||
listen_pg_addr = '127.0.0.1:64000'
|
||||
listen_http_addr = '127.0.0.1:9898'
|
||||
listen_grpc_addr = '127.0.0.1:51051'
|
||||
pg_auth_type = 'Trust'
|
||||
http_auth_type = 'Trust'
|
||||
grpc_auth_type = 'Trust'
|
||||
|
||||
[[safekeepers]]
|
||||
id = 1
|
||||
|
||||
@@ -4,8 +4,10 @@
|
||||
id=1
|
||||
listen_pg_addr = '127.0.0.1:64000'
|
||||
listen_http_addr = '127.0.0.1:9898'
|
||||
listen_grpc_addr = '127.0.0.1:51051'
|
||||
pg_auth_type = 'Trust'
|
||||
http_auth_type = 'Trust'
|
||||
grpc_auth_type = 'Trust'
|
||||
|
||||
[[safekeepers]]
|
||||
id = 1
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
use std::ffi::OsStr;
|
||||
use std::io::Write;
|
||||
use std::os::unix::prelude::AsRawFd;
|
||||
use std::os::fd::AsFd;
|
||||
use std::os::unix::process::CommandExt;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
@@ -356,7 +356,7 @@ where
|
||||
let file = pid_file::claim_for_current_process(&path).expect("claim pid file");
|
||||
// Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile
|
||||
// remains locked after exec.
|
||||
nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
|
||||
nix::fcntl::fcntl(file.as_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
|
||||
.expect("remove FD_CLOEXEC");
|
||||
// Don't run drop(file), it would close the file before we actually exec.
|
||||
std::mem::forget(file);
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::fs::File;
|
||||
use std::os::fd::AsRawFd;
|
||||
use std::path::PathBuf;
|
||||
use std::process::exit;
|
||||
use std::str::FromStr;
|
||||
@@ -19,7 +18,7 @@ use clap::Parser;
|
||||
use compute_api::requests::ComputeClaimsScope;
|
||||
use compute_api::spec::ComputeMode;
|
||||
use control_plane::broker::StorageBroker;
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
use control_plane::endpoint::{ComputeControlPlane, PageserverProtocol};
|
||||
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
|
||||
use control_plane::local_env;
|
||||
use control_plane::local_env::{
|
||||
@@ -31,8 +30,9 @@ use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage_controller::{
|
||||
NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
|
||||
};
|
||||
use nix::fcntl::{FlockArg, flock};
|
||||
use nix::fcntl::{Flock, FlockArg};
|
||||
use pageserver_api::config::{
|
||||
DEFAULT_GRPC_LISTEN_PORT as DEFAULT_PAGESERVER_GRPC_PORT,
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
|
||||
};
|
||||
@@ -664,6 +664,10 @@ struct EndpointStartCmdArgs {
|
||||
#[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
|
||||
#[arg(default_value = "90s")]
|
||||
start_timeout: Duration,
|
||||
|
||||
/// If enabled, use gRPC (and the communicator) to talk to Pageservers.
|
||||
#[clap(long)]
|
||||
grpc: bool,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
@@ -682,6 +686,10 @@ struct EndpointReconfigureCmdArgs {
|
||||
|
||||
#[clap(long)]
|
||||
safekeepers: Option<String>,
|
||||
|
||||
/// If enabled, use gRPC (and communicator) to talk to Pageservers.
|
||||
#[clap(long)]
|
||||
grpc: bool,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
@@ -749,16 +757,16 @@ struct TimelineTreeEl {
|
||||
|
||||
/// A flock-based guard over the neon_local repository directory
|
||||
struct RepoLock {
|
||||
_file: File,
|
||||
_file: Flock<File>,
|
||||
}
|
||||
|
||||
impl RepoLock {
|
||||
fn new() -> Result<Self> {
|
||||
let repo_dir = File::open(local_env::base_path())?;
|
||||
let repo_dir_fd = repo_dir.as_raw_fd();
|
||||
flock(repo_dir_fd, FlockArg::LockExclusive)?;
|
||||
|
||||
Ok(Self { _file: repo_dir })
|
||||
match Flock::lock(repo_dir, FlockArg::LockExclusive) {
|
||||
Ok(f) => Ok(Self { _file: f }),
|
||||
Err((_, e)) => Err(e).context("flock error"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1008,13 +1016,16 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
|
||||
let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
|
||||
let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
|
||||
let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
|
||||
let grpc_port = DEFAULT_PAGESERVER_GRPC_PORT + i;
|
||||
NeonLocalInitPageserverConf {
|
||||
id: pageserver_id,
|
||||
listen_pg_addr: format!("127.0.0.1:{pg_port}"),
|
||||
listen_http_addr: format!("127.0.0.1:{http_port}"),
|
||||
listen_https_addr: None,
|
||||
listen_grpc_addr: Some(format!("127.0.0.1:{grpc_port}")),
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
grpc_auth_type: AuthType::Trust,
|
||||
other: Default::default(),
|
||||
// Typical developer machines use disks with slow fsync, and we don't care
|
||||
// about data integrity: disable disk syncs.
|
||||
@@ -1276,6 +1287,7 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
|
||||
mode: pageserver_api::models::TimelineCreateRequestMode::Branch {
|
||||
ancestor_timeline_id,
|
||||
ancestor_start_lsn: start_lsn,
|
||||
read_only: false,
|
||||
pg_version: None,
|
||||
},
|
||||
};
|
||||
@@ -1448,13 +1460,18 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
|
||||
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
|
||||
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
|
||||
let parsed = parse_host_port(&conf.listen_pg_addr).expect("Bad config");
|
||||
(
|
||||
vec![(parsed.0, parsed.1.unwrap_or(5432))],
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// full managed by storage controller, therefore not sharded.
|
||||
DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
// Use gRPC if requested.
|
||||
let (protocol, host, port) = if args.grpc {
|
||||
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
|
||||
let (host, port) = parse_host_port(grpc_addr).expect("bad config");
|
||||
(PageserverProtocol::Grpc, host, port.unwrap_or(51051))
|
||||
} else {
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr).expect("bad config");
|
||||
(PageserverProtocol::Libpq, host, port.unwrap_or(5432))
|
||||
};
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// fully managed by storage controller, therefore not sharded.
|
||||
(vec![(protocol, host, port)], DEFAULT_STRIPE_SIZE)
|
||||
} else {
|
||||
// Look up the currently attached location of the tenant, and its striping metadata,
|
||||
// to pass these on to postgres.
|
||||
@@ -1473,11 +1490,22 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.await?;
|
||||
}
|
||||
|
||||
anyhow::Ok((
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Storage controller reported bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
))
|
||||
let pageserver = if args.grpc {
|
||||
(
|
||||
PageserverProtocol::Grpc,
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC addr"))
|
||||
.expect("bad hostname"),
|
||||
shard.listen_grpc_port.expect("no gRPC port"),
|
||||
)
|
||||
} else {
|
||||
(
|
||||
PageserverProtocol::Libpq,
|
||||
Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
};
|
||||
|
||||
anyhow::Ok(pageserver)
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
@@ -1532,11 +1560,17 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.get(endpoint_id.as_str())
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
|
||||
let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?);
|
||||
vec![(
|
||||
pageserver.pg_connection_config.host().clone(),
|
||||
pageserver.pg_connection_config.port(),
|
||||
)]
|
||||
let conf = env.get_pageserver_conf(ps_id)?;
|
||||
// Use gRPC if requested.
|
||||
let (protocol, host, port) = if args.grpc {
|
||||
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
|
||||
let (host, port) = parse_host_port(grpc_addr).expect("bad config");
|
||||
(PageserverProtocol::Grpc, host, port.unwrap_or(51051))
|
||||
} else {
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr).expect("bad config");
|
||||
(PageserverProtocol::Libpq, host, port.unwrap_or(5432))
|
||||
};
|
||||
vec![(protocol, host, port)]
|
||||
} else {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
@@ -1545,11 +1579,20 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Storage controller reported malformed host"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
if args.grpc {
|
||||
(
|
||||
PageserverProtocol::Grpc,
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC addr"))
|
||||
.expect("bad hostname"),
|
||||
shard.listen_grpc_port.expect("no gRPC port"),
|
||||
)
|
||||
} else {
|
||||
(
|
||||
PageserverProtocol::Libpq,
|
||||
Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -37,6 +37,7 @@
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt::Display;
|
||||
use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream};
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
@@ -74,7 +75,6 @@ use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
use crate::storage_controller::StorageController;
|
||||
|
||||
// contents of a endpoint.json file
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
@@ -331,7 +331,7 @@ pub enum EndpointStatus {
|
||||
RunningNoPidfile,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for EndpointStatus {
|
||||
impl Display for EndpointStatus {
|
||||
fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
let s = match self {
|
||||
Self::Running => "running",
|
||||
@@ -343,6 +343,28 @@ impl std::fmt::Display for EndpointStatus {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum PageserverProtocol {
|
||||
Libpq,
|
||||
Grpc,
|
||||
}
|
||||
|
||||
impl PageserverProtocol {
|
||||
/// Returns the URL scheme for the protocol, used in connstrings.
|
||||
pub fn scheme(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Libpq => "postgresql",
|
||||
Self::Grpc => "grpc",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for PageserverProtocol {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.scheme())
|
||||
}
|
||||
}
|
||||
|
||||
impl Endpoint {
|
||||
fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
|
||||
if !entry.file_type()?.is_dir() {
|
||||
@@ -606,10 +628,10 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
fn build_pageserver_connstr(pageservers: &[(Host, u16)]) -> String {
|
||||
fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
|
||||
pageservers
|
||||
.iter()
|
||||
.map(|(host, port)| format!("postgresql://no_user@{host}:{port}"))
|
||||
.map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}"))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
}
|
||||
@@ -654,7 +676,7 @@ impl Endpoint {
|
||||
endpoint_storage_addr: String,
|
||||
safekeepers_generation: Option<SafekeeperGeneration>,
|
||||
safekeepers: Vec<NodeId>,
|
||||
pageservers: Vec<(Host, u16)>,
|
||||
pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
remote_ext_base_url: Option<&String>,
|
||||
shard_stripe_size: usize,
|
||||
create_test_user: bool,
|
||||
@@ -747,7 +769,7 @@ impl Endpoint {
|
||||
logs_export_host: None::<String>,
|
||||
endpoint_storage_addr: Some(endpoint_storage_addr),
|
||||
endpoint_storage_token: Some(endpoint_storage_token),
|
||||
prewarm_lfc_on_startup: false,
|
||||
autoprewarm: false,
|
||||
};
|
||||
|
||||
// this strange code is needed to support respec() in tests
|
||||
@@ -939,10 +961,12 @@ impl Endpoint {
|
||||
|
||||
pub async fn reconfigure(
|
||||
&self,
|
||||
mut pageservers: Vec<(Host, u16)>,
|
||||
pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
safekeepers: Option<Vec<NodeId>>,
|
||||
) -> Result<()> {
|
||||
anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
|
||||
|
||||
let (mut spec, compute_ctl_config) = {
|
||||
let config_path = self.endpoint_path().join("config.json");
|
||||
let file = std::fs::File::open(config_path)?;
|
||||
@@ -954,25 +978,7 @@ impl Endpoint {
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
|
||||
// If we weren't given explicit pageservers, query the storage controller
|
||||
if pageservers.is_empty() {
|
||||
let storage_controller = StorageController::from_env(&self.env);
|
||||
let locate_result = storage_controller.tenant_locate(self.tenant_id).await?;
|
||||
pageservers = locate_result
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Storage controller reported bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
}
|
||||
|
||||
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
|
||||
assert!(!pageserver_connstr.is_empty());
|
||||
spec.pageserver_connstring = Some(pageserver_connstr);
|
||||
if stripe_size.is_some() {
|
||||
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
|
||||
|
||||
@@ -278,8 +278,10 @@ pub struct PageServerConf {
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
pub listen_https_addr: Option<String>,
|
||||
pub listen_grpc_addr: Option<String>,
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
pub grpc_auth_type: AuthType,
|
||||
pub no_sync: bool,
|
||||
}
|
||||
|
||||
@@ -290,8 +292,10 @@ impl Default for PageServerConf {
|
||||
listen_pg_addr: String::new(),
|
||||
listen_http_addr: String::new(),
|
||||
listen_https_addr: None,
|
||||
listen_grpc_addr: None,
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
grpc_auth_type: AuthType::Trust,
|
||||
no_sync: false,
|
||||
}
|
||||
}
|
||||
@@ -306,8 +310,10 @@ pub struct NeonLocalInitPageserverConf {
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
pub listen_https_addr: Option<String>,
|
||||
pub listen_grpc_addr: Option<String>,
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
pub grpc_auth_type: AuthType,
|
||||
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
|
||||
pub no_sync: bool,
|
||||
#[serde(flatten)]
|
||||
@@ -321,8 +327,10 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
listen_https_addr,
|
||||
listen_grpc_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
grpc_auth_type,
|
||||
no_sync,
|
||||
other: _,
|
||||
} = conf;
|
||||
@@ -331,7 +339,9 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
|
||||
listen_pg_addr: listen_pg_addr.clone(),
|
||||
listen_http_addr: listen_http_addr.clone(),
|
||||
listen_https_addr: listen_https_addr.clone(),
|
||||
listen_grpc_addr: listen_grpc_addr.clone(),
|
||||
pg_auth_type: *pg_auth_type,
|
||||
grpc_auth_type: *grpc_auth_type,
|
||||
http_auth_type: *http_auth_type,
|
||||
no_sync: *no_sync,
|
||||
}
|
||||
@@ -707,8 +717,10 @@ impl LocalEnv {
|
||||
listen_pg_addr: String,
|
||||
listen_http_addr: String,
|
||||
listen_https_addr: Option<String>,
|
||||
listen_grpc_addr: Option<String>,
|
||||
pg_auth_type: AuthType,
|
||||
http_auth_type: AuthType,
|
||||
grpc_auth_type: AuthType,
|
||||
#[serde(default)]
|
||||
no_sync: bool,
|
||||
}
|
||||
@@ -732,8 +744,10 @@ impl LocalEnv {
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
listen_https_addr,
|
||||
listen_grpc_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
grpc_auth_type,
|
||||
no_sync,
|
||||
} = config_toml;
|
||||
let IdentityTomlSubset {
|
||||
@@ -750,8 +764,10 @@ impl LocalEnv {
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
listen_https_addr,
|
||||
listen_grpc_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
grpc_auth_type,
|
||||
no_sync,
|
||||
};
|
||||
pageservers.push(conf);
|
||||
|
||||
@@ -129,7 +129,9 @@ impl PageServerNode {
|
||||
));
|
||||
}
|
||||
|
||||
if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust {
|
||||
if [conf.http_auth_type, conf.pg_auth_type, conf.grpc_auth_type]
|
||||
.contains(&AuthType::NeonJWT)
|
||||
{
|
||||
// Keys are generated in the toplevel repo dir, pageservers' workdirs
|
||||
// are one level below that, so refer to keys with ../
|
||||
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
|
||||
@@ -263,6 +265,14 @@ impl PageServerNode {
|
||||
None => None,
|
||||
};
|
||||
|
||||
let mut grpc_host = None;
|
||||
let mut grpc_port = None;
|
||||
if let Some(grpc_addr) = &self.conf.listen_grpc_addr {
|
||||
let (_, port) = parse_host_port(grpc_addr).expect("Unable to parse listen_grpc_addr");
|
||||
grpc_host = Some("localhost".to_string());
|
||||
grpc_port = Some(port.unwrap_or(51051));
|
||||
}
|
||||
|
||||
// Intentionally hand-craft JSON: this acts as an implicit format compat test
|
||||
// in case the pageserver-side structure is edited, and reflects the real life
|
||||
// situation: the metadata is written by some other script.
|
||||
@@ -271,6 +281,8 @@ impl PageServerNode {
|
||||
serde_json::to_vec(&pageserver_api::config::NodeMetadata {
|
||||
postgres_host: "localhost".to_string(),
|
||||
postgres_port: self.pg_connection_config.port(),
|
||||
grpc_host,
|
||||
grpc_port,
|
||||
http_host: "localhost".to_string(),
|
||||
http_port,
|
||||
https_port,
|
||||
@@ -511,11 +523,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'timeline_offloading' as bool")?,
|
||||
wal_receiver_protocol_override: settings
|
||||
.remove("wal_receiver_protocol_override")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `wal_receiver_protocol_override` from json")?,
|
||||
rel_size_v2_enabled: settings
|
||||
.remove("rel_size_v2_enabled")
|
||||
.map(|x| x.parse::<bool>())
|
||||
@@ -546,6 +553,16 @@ impl PageServerNode {
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("Falied to parse 'sampling_ratio'")?,
|
||||
relsize_snapshot_cache_capacity: settings
|
||||
.remove("relsize snapshot cache capacity")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Falied to parse 'relsize_snapshot_cache_capacity' as integer")?,
|
||||
basebackup_cache_enabled: settings
|
||||
.remove("basebackup_cache_enabled")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'basebackup_cache_enabled' as bool")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
|
||||
@@ -37,6 +37,11 @@ enum Command {
|
||||
#[arg(long)]
|
||||
listen_pg_port: u16,
|
||||
|
||||
#[arg(long)]
|
||||
listen_grpc_addr: Option<String>,
|
||||
#[arg(long)]
|
||||
listen_grpc_port: Option<u16>,
|
||||
|
||||
#[arg(long)]
|
||||
listen_http_addr: String,
|
||||
#[arg(long)]
|
||||
@@ -410,6 +415,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
node_id,
|
||||
listen_pg_addr,
|
||||
listen_pg_port,
|
||||
listen_grpc_addr,
|
||||
listen_grpc_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
listen_https_port,
|
||||
@@ -423,6 +430,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
node_id,
|
||||
listen_pg_addr,
|
||||
listen_pg_port,
|
||||
listen_grpc_addr,
|
||||
listen_grpc_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
listen_https_port,
|
||||
|
||||
@@ -13,6 +13,6 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
|
||||
jq \
|
||||
netcat-openbsd
|
||||
#This is required for the pg_hintplan test
|
||||
RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw
|
||||
RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src/ && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src
|
||||
|
||||
USER postgres
|
||||
|
||||
@@ -1,28 +1,36 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
set -eux
|
||||
|
||||
# Generate a random tenant or timeline ID
|
||||
#
|
||||
# Takes a variable name as argument. The result is stored in that variable.
|
||||
generate_id() {
|
||||
local -n resvar=$1
|
||||
printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
|
||||
local -n resvar=${1}
|
||||
printf -v resvar '%08x%08x%08x%08x' ${SRANDOM} ${SRANDOM} ${SRANDOM} ${SRANDOM}
|
||||
}
|
||||
|
||||
PG_VERSION=${PG_VERSION:-14}
|
||||
|
||||
CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
|
||||
CONFIG_FILE=/tmp/config.json
|
||||
readonly CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
|
||||
readonly CONFIG_FILE=/tmp/config.json
|
||||
|
||||
# Test that the first library path that the dynamic loader looks in is the path
|
||||
# that we use for custom compiled software
|
||||
first_path="$(ldconfig --verbose 2>/dev/null \
|
||||
| grep --invert-match ^$'\t' \
|
||||
| cut --delimiter=: --fields=1 \
|
||||
| head --lines=1)"
|
||||
test "${first_path}" = '/usr/local/lib'
|
||||
|
||||
echo "Waiting pageserver become ready."
|
||||
while ! nc -z pageserver 6400; do
|
||||
sleep 1;
|
||||
sleep 1
|
||||
done
|
||||
echo "Page server is ready."
|
||||
|
||||
cp ${CONFIG_FILE_ORG} ${CONFIG_FILE}
|
||||
cp "${CONFIG_FILE_ORG}" "${CONFIG_FILE}"
|
||||
|
||||
if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
|
||||
if [[ -n "${TENANT_ID:-}" && -n "${TIMELINE_ID:-}" ]]; then
|
||||
tenant_id=${TENANT_ID}
|
||||
timeline_id=${TIMELINE_ID}
|
||||
else
|
||||
@@ -33,7 +41,7 @@ else
|
||||
"http://pageserver:9898/v1/tenant"
|
||||
)
|
||||
tenant_id=$(curl "${PARAMS[@]}" | jq -r .[0].id)
|
||||
if [ -z "${tenant_id}" ] || [ "${tenant_id}" = null ]; then
|
||||
if [[ -z "${tenant_id}" || "${tenant_id}" = null ]]; then
|
||||
echo "Create a tenant"
|
||||
generate_id tenant_id
|
||||
PARAMS=(
|
||||
@@ -43,7 +51,7 @@ else
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/location_config"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
printf '%s\n' "${result}" | jq .
|
||||
fi
|
||||
|
||||
echo "Check if a timeline present"
|
||||
@@ -53,7 +61,7 @@ else
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
|
||||
)
|
||||
timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
|
||||
if [ -z "${timeline_id}" ] || [ "${timeline_id}" = null ]; then
|
||||
if [[ -z "${timeline_id}" || "${timeline_id}" = null ]]; then
|
||||
generate_id timeline_id
|
||||
PARAMS=(
|
||||
-sbf
|
||||
@@ -63,7 +71,7 @@ else
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
printf '%s\n' "${result}" | jq .
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -74,10 +82,10 @@ else
|
||||
fi
|
||||
echo "Adding pgx_ulid"
|
||||
shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE})
|
||||
sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE}
|
||||
sed -i "s|${shared_libraries}|${shared_libraries},${ulid_extension}|" ${CONFIG_FILE}
|
||||
echo "Overwrite tenant id and timeline id in spec file"
|
||||
sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE}
|
||||
sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}
|
||||
sed -i "s|TENANT_ID|${tenant_id}|" ${CONFIG_FILE}
|
||||
sed -i "s|TIMELINE_ID|${timeline_id}|" ${CONFIG_FILE}
|
||||
|
||||
cat ${CONFIG_FILE}
|
||||
|
||||
@@ -85,5 +93,5 @@ echo "Start compute node"
|
||||
/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
|
||||
-C "postgresql://cloud_admin@localhost:55433/postgres" \
|
||||
-b /usr/local/bin/postgres \
|
||||
--compute-id "compute-$RANDOM" \
|
||||
--config "$CONFIG_FILE"
|
||||
--compute-id "compute-${RANDOM}" \
|
||||
--config "${CONFIG_FILE}"
|
||||
|
||||
@@ -186,13 +186,14 @@ services:
|
||||
|
||||
neon-test-extensions:
|
||||
profiles: ["test-extensions"]
|
||||
image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}}
|
||||
image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-${PG_VERSION:-16}}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}}
|
||||
environment:
|
||||
- PGPASSWORD=cloud_admin
|
||||
- PGUSER=${PGUSER:-cloud_admin}
|
||||
- PGPASSWORD=${PGPASSWORD:-cloud_admin}
|
||||
entrypoint:
|
||||
- "/bin/bash"
|
||||
- "-c"
|
||||
command:
|
||||
- sleep 1800
|
||||
- sleep 3600
|
||||
depends_on:
|
||||
- compute
|
||||
|
||||
@@ -54,6 +54,15 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
|
||||
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
|
||||
echo Adding dummy config
|
||||
docker compose exec compute touch /var/db/postgres/compute/compute_ctl_temp_override.conf
|
||||
# Prepare for the PostGIS test
|
||||
docker compose exec compute mkdir -p /tmp/pgis_reg/pgis_reg_tmp
|
||||
TMPDIR=$(mktemp -d)
|
||||
docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${TMPDIR}"
|
||||
docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${TMPDIR}"
|
||||
docker compose exec compute mkdir -p /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install
|
||||
docker compose cp "${TMPDIR}/test" compute:/ext-src/postgis-src/raster/test
|
||||
docker compose cp "${TMPDIR}/00-regress-install" compute:/ext-src/postgis-src/regress
|
||||
rm -rf "${TMPDIR}"
|
||||
# The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
|
||||
TMPDIR=$(mktemp -d)
|
||||
docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${TMPDIR}/data"
|
||||
@@ -68,7 +77,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
|
||||
docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch"
|
||||
# We are running tests now
|
||||
rm -f testout.txt testout_contrib.txt
|
||||
docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
|
||||
docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
|
||||
neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
|
||||
docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
|
||||
neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
|
||||
|
||||
8
docker-compose/ext-src/alter_db.sh
Executable file
@@ -0,0 +1,8 @@
|
||||
#!/bin/bash
|
||||
# We need these settings to get the expected output results.
|
||||
# We cannot use the environment variables e.g. PGTZ due to
|
||||
# https://github.com/neondatabase/neon/issues/1287
|
||||
export DATABASE=${1:-contrib_regression}
|
||||
psql -c "ALTER DATABASE ${DATABASE} SET neon.allow_unstable_extensions='on'" \
|
||||
-c "ALTER DATABASE ${DATABASE} SET DateStyle='Postgres,MDY'" \
|
||||
-c "ALTER DATABASE ${DATABASE} SET TimeZone='America/Los_Angeles'" \
|
||||
16
docker-compose/ext-src/h3-pg-src/neon-test.sh
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env bash
|
||||
set -ex
|
||||
cd "$(dirname "${0}")"
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
dropdb --if-exists contrib_regression
|
||||
createdb contrib_regression
|
||||
cd h3_postgis/test
|
||||
psql -d contrib_regression -c "CREATE EXTENSION postgis" -c "CREATE EXTENSION postgis_raster" -c "CREATE EXTENSION h3" -c "CREATE EXTENSION h3_postgis"
|
||||
TESTS=$(echo sql/* | sed 's|sql/||g; s|\.sql||g')
|
||||
${PG_REGRESS} --use-existing --dbname contrib_regression ${TESTS}
|
||||
cd ../../h3/test
|
||||
TESTS=$(echo sql/* | sed 's|sql/||g; s|\.sql||g')
|
||||
dropdb --if-exists contrib_regression
|
||||
createdb contrib_regression
|
||||
psql -d contrib_regression -c "CREATE EXTENSION h3"
|
||||
${PG_REGRESS} --use-existing --dbname contrib_regression ${TESTS}
|
||||
7
docker-compose/ext-src/h3-pg-src/test-upgrade.sh
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
cd h3/test
|
||||
TESTS=$(echo sql/* | sed 's|sql/||g; s|\.sql||g')
|
||||
${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression ${TESTS}
|
||||
6
docker-compose/ext-src/online_advisor-src/neon-test.sh
Executable file
@@ -0,0 +1,6 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname "${0}")"
|
||||
if [ -f Makefile ]; then
|
||||
make installcheck
|
||||
fi
|
||||
9
docker-compose/ext-src/online_advisor-src/regular-test.sh
Executable file
@@ -0,0 +1,9 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
[ -f Makefile ] || exit 0
|
||||
dropdb --if-exist contrib_regression
|
||||
createdb contrib_regression
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
TESTS=$(echo sql/* | sed 's|sql/||g; s|\.sql||g')
|
||||
${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression ${TESTS}
|
||||
@@ -18,6 +18,7 @@ TESTS=${TESTS/row_level_security/}
|
||||
TESTS=${TESTS/sqli_connection/}
|
||||
dropdb --if-exist contrib_regression
|
||||
createdb contrib_regression
|
||||
. ../alter_db.sh
|
||||
psql -v ON_ERROR_STOP=1 -f test/fixtures.sql -d contrib_regression
|
||||
${REGRESS} --use-existing --dbname=contrib_regression --inputdir=${TESTDIR} ${TESTS}
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ set -ex
|
||||
cd "$(dirname "${0}")"
|
||||
dropdb --if-exist contrib_regression
|
||||
createdb contrib_regression
|
||||
. ../alter_db.sh
|
||||
psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag"
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --load-extension=vector --load-extension=rag --dbname=contrib_regression basic_functions text_processing api_keys chunking_functions document_processing embedding_api_functions voyageai_functions
|
||||
|
||||
@@ -20,5 +20,6 @@ installcheck: regression-test
|
||||
regression-test:
|
||||
dropdb --if-exists contrib_regression
|
||||
createdb contrib_regression
|
||||
../alter_db.sh
|
||||
psql -d contrib_regression -c "CREATE EXTENSION $(EXTNAME)"
|
||||
$(PG_REGRESS) --inputdir=. --outputdir=. --use-existing --dbname=contrib_regression $(REGRESS)
|
||||
|
||||
@@ -3,6 +3,7 @@ set -ex
|
||||
cd "$(dirname ${0})"
|
||||
dropdb --if-exist contrib_regression
|
||||
createdb contrib_regression
|
||||
. ../alter_db.sh
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
REGRESS="$(make -n installcheck | awk '{print substr($0,index($0,"init-extension"));}')"
|
||||
REGRESS="${REGRESS/startup_perms/}"
|
||||
|
||||
70
docker-compose/ext-src/postgis-src/README-Neon.md
Normal file
@@ -0,0 +1,70 @@
|
||||
# PostGIS Testing in Neon
|
||||
|
||||
This directory contains configuration files and patches for running PostGIS tests in the Neon database environment.
|
||||
|
||||
## Overview
|
||||
|
||||
PostGIS is a spatial database extension for PostgreSQL that adds support for geographic objects. Testing PostGIS compatibility ensures that Neon's modifications to PostgreSQL don't break compatibility with this critical extension.
|
||||
|
||||
## PostGIS Versions
|
||||
|
||||
- PostgreSQL v17: PostGIS 3.5.0
|
||||
- PostgreSQL v14/v15/v16: PostGIS 3.3.3
|
||||
|
||||
## Test Configuration
|
||||
|
||||
The test setup includes:
|
||||
|
||||
- `postgis-no-upgrade-test.patch`: Disables upgrade tests by removing the upgrade test section from regress/runtest.mk
|
||||
- `postgis-regular-v16.patch`: Version-specific patch for PostgreSQL v16
|
||||
- `postgis-regular-v17.patch`: Version-specific patch for PostgreSQL v17
|
||||
- `regular-test.sh`: Script to run PostGIS tests as a regular user
|
||||
- `neon-test.sh`: Script to handle version-specific test configurations
|
||||
- `raster_outdb_template.sql`: Template for raster tests with explicit file paths
|
||||
|
||||
## Excluded Tests
|
||||
|
||||
**Important Note:** The test exclusions listed below are specifically for regular-user tests against staging instances. These exclusions are necessary because staging instances run with limited privileges and cannot perform operations requiring superuser access. Docker-compose based tests are not affected by these exclusions.
|
||||
|
||||
### Tests Requiring Superuser Permissions
|
||||
|
||||
These tests cannot be run as a regular user:
|
||||
- `estimatedextent`
|
||||
- `regress/core/legacy`
|
||||
- `regress/core/typmod`
|
||||
- `regress/loader/TestSkipANALYZE`
|
||||
- `regress/loader/TestANALYZE`
|
||||
|
||||
### Tests Requiring Filesystem Access
|
||||
|
||||
These tests need direct filesystem access that is only possible for superusers:
|
||||
- `loader/load_outdb`
|
||||
|
||||
### Tests with Flaky Results
|
||||
|
||||
These tests have assumptions that don't always hold true:
|
||||
- `regress/core/computed_columns` - Assumes computed columns always outperform alternatives, which is not consistently true
|
||||
|
||||
### Tests Requiring Tunable Parameter Modifications
|
||||
|
||||
These tests attempt to modify the `postgis.gdal_enabled_drivers` parameter, which is only accessible to superusers:
|
||||
- `raster/test/regress/rt_wkb`
|
||||
- `raster/test/regress/rt_addband`
|
||||
- `raster/test/regress/rt_setbandpath`
|
||||
- `raster/test/regress/rt_fromgdalraster`
|
||||
- `raster/test/regress/rt_asgdalraster`
|
||||
- `raster/test/regress/rt_astiff`
|
||||
- `raster/test/regress/rt_asjpeg`
|
||||
- `raster/test/regress/rt_aspng`
|
||||
- `raster/test/regress/permitted_gdal_drivers`
|
||||
- Loader tests: `BasicOutDB`, `Tiled10x10`, `Tiled10x10Copy`, `Tiled8x8`, `TiledAuto`, `TiledAutoSkipNoData`, `TiledAutoCopyn`
|
||||
|
||||
### Topology Tests (v17 only)
|
||||
- `populate_topology_layer`
|
||||
- `renametopogeometrycolumn`
|
||||
|
||||
## Other Modifications
|
||||
|
||||
- Binary.sql tests are modified to use explicit file paths
|
||||
- Server-side SQL COPY commands (which require superuser privileges) are converted to client-side `\copy` commands
|
||||
- Upgrade tests are disabled
|
||||
9
docker-compose/ext-src/postgis-src/neon-test.sh
Executable file
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash
|
||||
set -ex
|
||||
cd "$(dirname "$0")"
|
||||
if [[ ${PG_VERSION} = v17 ]]; then
|
||||
sed -i '/computed_columns/d' regress/core/tests.mk
|
||||
fi
|
||||
patch -p1 <postgis-no-upgrade-test.patch
|
||||
trap 'echo Cleaning up; patch -R -p1 <postgis-no-upgrade-test.patch' EXIT
|
||||
make installcheck-base
|
||||
@@ -0,0 +1,21 @@
|
||||
diff --git a/regress/runtest.mk b/regress/runtest.mk
|
||||
index c051f03..010e493 100644
|
||||
--- a/regress/runtest.mk
|
||||
+++ b/regress/runtest.mk
|
||||
@@ -24,16 +24,6 @@ check-regress:
|
||||
|
||||
POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(RUNTESTFLAGS_INTERNAL) $(TESTS)
|
||||
|
||||
- @if echo "$(RUNTESTFLAGS)" | grep -vq -- --upgrade; then \
|
||||
- echo "Running upgrade test as RUNTESTFLAGS did not contain that"; \
|
||||
- POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl \
|
||||
- --upgrade \
|
||||
- $(RUNTESTFLAGS) \
|
||||
- $(RUNTESTFLAGS_INTERNAL) \
|
||||
- $(TESTS); \
|
||||
- else \
|
||||
- echo "Skipping upgrade test as RUNTESTFLAGS already requested upgrades"; \
|
||||
- fi
|
||||
|
||||
check-long:
|
||||
$(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(TESTS) $(TESTS_SLOW)
|
||||
198
docker-compose/ext-src/postgis-src/postgis-regular-v16.patch
Normal file
@@ -0,0 +1,198 @@
|
||||
diff --git a/raster/test/regress/tests.mk b/raster/test/regress/tests.mk
|
||||
index 00918e1..7e2b6cd 100644
|
||||
--- a/raster/test/regress/tests.mk
|
||||
+++ b/raster/test/regress/tests.mk
|
||||
@@ -17,9 +17,7 @@ override RUNTESTFLAGS_INTERNAL := \
|
||||
$(RUNTESTFLAGS_INTERNAL) \
|
||||
--after-upgrade-script $(top_srcdir)/raster/test/regress/hooks/hook-after-upgrade-raster.sql
|
||||
|
||||
-RASTER_TEST_FIRST = \
|
||||
- $(top_srcdir)/raster/test/regress/check_gdal \
|
||||
- $(top_srcdir)/raster/test/regress/loader/load_outdb
|
||||
+RASTER_TEST_FIRST =
|
||||
|
||||
RASTER_TEST_LAST = \
|
||||
$(top_srcdir)/raster/test/regress/clean
|
||||
@@ -33,9 +31,7 @@ RASTER_TEST_IO = \
|
||||
|
||||
RASTER_TEST_BASIC_FUNC = \
|
||||
$(top_srcdir)/raster/test/regress/rt_bytea \
|
||||
- $(top_srcdir)/raster/test/regress/rt_wkb \
|
||||
$(top_srcdir)/raster/test/regress/box3d \
|
||||
- $(top_srcdir)/raster/test/regress/rt_addband \
|
||||
$(top_srcdir)/raster/test/regress/rt_band \
|
||||
$(top_srcdir)/raster/test/regress/rt_tile
|
||||
|
||||
@@ -73,16 +69,10 @@ RASTER_TEST_BANDPROPS = \
|
||||
$(top_srcdir)/raster/test/regress/rt_neighborhood \
|
||||
$(top_srcdir)/raster/test/regress/rt_nearestvalue \
|
||||
$(top_srcdir)/raster/test/regress/rt_pixelofvalue \
|
||||
- $(top_srcdir)/raster/test/regress/rt_polygon \
|
||||
- $(top_srcdir)/raster/test/regress/rt_setbandpath
|
||||
+ $(top_srcdir)/raster/test/regress/rt_polygon
|
||||
|
||||
RASTER_TEST_UTILITY = \
|
||||
$(top_srcdir)/raster/test/regress/rt_utility \
|
||||
- $(top_srcdir)/raster/test/regress/rt_fromgdalraster \
|
||||
- $(top_srcdir)/raster/test/regress/rt_asgdalraster \
|
||||
- $(top_srcdir)/raster/test/regress/rt_astiff \
|
||||
- $(top_srcdir)/raster/test/regress/rt_asjpeg \
|
||||
- $(top_srcdir)/raster/test/regress/rt_aspng \
|
||||
$(top_srcdir)/raster/test/regress/rt_reclass \
|
||||
$(top_srcdir)/raster/test/regress/rt_gdalwarp \
|
||||
$(top_srcdir)/raster/test/regress/rt_gdalcontour \
|
||||
@@ -120,21 +110,13 @@ RASTER_TEST_SREL = \
|
||||
|
||||
RASTER_TEST_BUGS = \
|
||||
$(top_srcdir)/raster/test/regress/bug_test_car5 \
|
||||
- $(top_srcdir)/raster/test/regress/permitted_gdal_drivers \
|
||||
$(top_srcdir)/raster/test/regress/tickets
|
||||
|
||||
RASTER_TEST_LOADER = \
|
||||
$(top_srcdir)/raster/test/regress/loader/Basic \
|
||||
$(top_srcdir)/raster/test/regress/loader/Projected \
|
||||
$(top_srcdir)/raster/test/regress/loader/BasicCopy \
|
||||
- $(top_srcdir)/raster/test/regress/loader/BasicFilename \
|
||||
- $(top_srcdir)/raster/test/regress/loader/BasicOutDB \
|
||||
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10 \
|
||||
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10Copy \
|
||||
- $(top_srcdir)/raster/test/regress/loader/Tiled8x8 \
|
||||
- $(top_srcdir)/raster/test/regress/loader/TiledAuto \
|
||||
- $(top_srcdir)/raster/test/regress/loader/TiledAutoSkipNoData \
|
||||
- $(top_srcdir)/raster/test/regress/loader/TiledAutoCopyn
|
||||
+ $(top_srcdir)/raster/test/regress/loader/BasicFilename
|
||||
|
||||
RASTER_TESTS := $(RASTER_TEST_FIRST) \
|
||||
$(RASTER_TEST_METADATA) $(RASTER_TEST_IO) $(RASTER_TEST_BASIC_FUNC) \
|
||||
diff --git a/regress/core/binary.sql b/regress/core/binary.sql
|
||||
index 7a36b65..ad78fc7 100644
|
||||
--- a/regress/core/binary.sql
|
||||
+++ b/regress/core/binary.sql
|
||||
@@ -1,4 +1,5 @@
|
||||
SET client_min_messages TO warning;
|
||||
+
|
||||
CREATE SCHEMA tm;
|
||||
|
||||
CREATE TABLE tm.geoms (id serial, g geometry);
|
||||
@@ -31,24 +32,39 @@ SELECT st_force4d(g) FROM tm.geoms WHERE id < 15 ORDER BY id;
|
||||
INSERT INTO tm.geoms(g)
|
||||
SELECT st_setsrid(g,4326) FROM tm.geoms ORDER BY id;
|
||||
|
||||
-COPY tm.geoms TO :tmpfile WITH BINARY;
|
||||
+-- define temp file path
|
||||
+\set tmpfile '/tmp/postgis_binary_test.dat'
|
||||
+
|
||||
+-- export
|
||||
+\set command '\\copy tm.geoms TO ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+-- import
|
||||
CREATE TABLE tm.geoms_in AS SELECT * FROM tm.geoms LIMIT 0;
|
||||
-COPY tm.geoms_in FROM :tmpfile WITH BINARY;
|
||||
-SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o WHERE i.id = o.id
|
||||
- AND ST_OrderingEquals(i.g, o.g);
|
||||
+\set command '\\copy tm.geoms_in FROM ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o
|
||||
+WHERE i.id = o.id AND ST_OrderingEquals(i.g, o.g);
|
||||
|
||||
CREATE TABLE tm.geogs AS SELECT id,g::geography FROM tm.geoms
|
||||
WHERE geometrytype(g) NOT LIKE '%CURVE%'
|
||||
AND geometrytype(g) NOT LIKE '%CIRCULAR%'
|
||||
AND geometrytype(g) NOT LIKE '%SURFACE%'
|
||||
AND geometrytype(g) NOT LIKE 'TRIANGLE%'
|
||||
- AND geometrytype(g) NOT LIKE 'TIN%'
|
||||
-;
|
||||
+ AND geometrytype(g) NOT LIKE 'TIN%';
|
||||
|
||||
-COPY tm.geogs TO :tmpfile WITH BINARY;
|
||||
+-- export
|
||||
+\set command '\\copy tm.geogs TO ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+-- import
|
||||
CREATE TABLE tm.geogs_in AS SELECT * FROM tm.geogs LIMIT 0;
|
||||
-COPY tm.geogs_in FROM :tmpfile WITH BINARY;
|
||||
-SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o WHERE i.id = o.id
|
||||
- AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
|
||||
+\set command '\\copy tm.geogs_in FROM ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o
|
||||
+WHERE i.id = o.id AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
|
||||
|
||||
DROP SCHEMA tm CASCADE;
|
||||
+
|
||||
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
|
||||
index 3abd7bc..94903c3 100644
|
||||
--- a/regress/core/tests.mk
|
||||
+++ b/regress/core/tests.mk
|
||||
@@ -23,7 +23,6 @@ current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
RUNTESTFLAGS_INTERNAL += \
|
||||
--before-upgrade-script $(top_srcdir)/regress/hooks/hook-before-upgrade.sql \
|
||||
--after-upgrade-script $(top_srcdir)/regress/hooks/hook-after-upgrade.sql \
|
||||
- --after-create-script $(top_srcdir)/regress/hooks/hook-after-create.sql \
|
||||
--before-uninstall-script $(top_srcdir)/regress/hooks/hook-before-uninstall.sql
|
||||
|
||||
TESTS += \
|
||||
@@ -40,7 +39,6 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/dumppoints \
|
||||
$(top_srcdir)/regress/core/dumpsegments \
|
||||
$(top_srcdir)/regress/core/empty \
|
||||
- $(top_srcdir)/regress/core/estimatedextent \
|
||||
$(top_srcdir)/regress/core/forcecurve \
|
||||
$(top_srcdir)/regress/core/flatgeobuf \
|
||||
$(top_srcdir)/regress/core/geography \
|
||||
@@ -55,7 +53,6 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/out_marc21 \
|
||||
$(top_srcdir)/regress/core/in_encodedpolyline \
|
||||
$(top_srcdir)/regress/core/iscollection \
|
||||
- $(top_srcdir)/regress/core/legacy \
|
||||
$(top_srcdir)/regress/core/letters \
|
||||
$(top_srcdir)/regress/core/long_xact \
|
||||
$(top_srcdir)/regress/core/lwgeom_regress \
|
||||
@@ -112,7 +109,6 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/temporal_knn \
|
||||
$(top_srcdir)/regress/core/tickets \
|
||||
$(top_srcdir)/regress/core/twkb \
|
||||
- $(top_srcdir)/regress/core/typmod \
|
||||
$(top_srcdir)/regress/core/wkb \
|
||||
$(top_srcdir)/regress/core/wkt \
|
||||
$(top_srcdir)/regress/core/wmsservers \
|
||||
@@ -144,11 +140,6 @@ TESTS_SLOW = \
|
||||
$(top_srcdir)/regress/core/concave_hull_hard \
|
||||
$(top_srcdir)/regress/core/knn_recheck
|
||||
|
||||
-ifeq ($(shell expr "$(POSTGIS_PGSQL_VERSION)" ">=" 120),1)
|
||||
- TESTS += \
|
||||
- $(top_srcdir)/regress/core/computed_columns
|
||||
-endif
|
||||
-
|
||||
ifeq ($(shell expr "$(POSTGIS_GEOS_VERSION)" ">=" 30700),1)
|
||||
# GEOS-3.7 adds:
|
||||
# ST_FrechetDistance
|
||||
diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk
|
||||
index 1fc77ac..c3cb9de 100644
|
||||
--- a/regress/loader/tests.mk
|
||||
+++ b/regress/loader/tests.mk
|
||||
@@ -38,7 +38,5 @@ TESTS += \
|
||||
$(top_srcdir)/regress/loader/Latin1 \
|
||||
$(top_srcdir)/regress/loader/Latin1-implicit \
|
||||
$(top_srcdir)/regress/loader/mfile \
|
||||
- $(top_srcdir)/regress/loader/TestSkipANALYZE \
|
||||
- $(top_srcdir)/regress/loader/TestANALYZE \
|
||||
$(top_srcdir)/regress/loader/CharNoWidth
|
||||
|
||||
diff --git a/regress/run_test.pl b/regress/run_test.pl
|
||||
index 0ec5b2d..1c331f4 100755
|
||||
--- a/regress/run_test.pl
|
||||
+++ b/regress/run_test.pl
|
||||
@@ -147,7 +147,6 @@ $ENV{"LANG"} = "C";
|
||||
# Add locale info to the psql options
|
||||
# Add pg12 precision suppression
|
||||
my $PGOPTIONS = $ENV{"PGOPTIONS"};
|
||||
-$PGOPTIONS .= " -c lc_messages=C";
|
||||
$PGOPTIONS .= " -c client_min_messages=NOTICE";
|
||||
$PGOPTIONS .= " -c extra_float_digits=0";
|
||||
$ENV{"PGOPTIONS"} = $PGOPTIONS;
|
||||
218
docker-compose/ext-src/postgis-src/postgis-regular-v17.patch
Normal file
@@ -0,0 +1,218 @@
|
||||
diff --git a/raster/test/regress/tests.mk b/raster/test/regress/tests.mk
|
||||
index 00918e1..7e2b6cd 100644
|
||||
--- a/raster/test/regress/tests.mk
|
||||
+++ b/raster/test/regress/tests.mk
|
||||
@@ -17,9 +17,7 @@ override RUNTESTFLAGS_INTERNAL := \
|
||||
$(RUNTESTFLAGS_INTERNAL) \
|
||||
--after-upgrade-script $(top_srcdir)/raster/test/regress/hooks/hook-after-upgrade-raster.sql
|
||||
|
||||
-RASTER_TEST_FIRST = \
|
||||
- $(top_srcdir)/raster/test/regress/check_gdal \
|
||||
- $(top_srcdir)/raster/test/regress/loader/load_outdb
|
||||
+RASTER_TEST_FIRST =
|
||||
|
||||
RASTER_TEST_LAST = \
|
||||
$(top_srcdir)/raster/test/regress/clean
|
||||
@@ -33,9 +31,7 @@ RASTER_TEST_IO = \
|
||||
|
||||
RASTER_TEST_BASIC_FUNC = \
|
||||
$(top_srcdir)/raster/test/regress/rt_bytea \
|
||||
- $(top_srcdir)/raster/test/regress/rt_wkb \
|
||||
$(top_srcdir)/raster/test/regress/box3d \
|
||||
- $(top_srcdir)/raster/test/regress/rt_addband \
|
||||
$(top_srcdir)/raster/test/regress/rt_band \
|
||||
$(top_srcdir)/raster/test/regress/rt_tile
|
||||
|
||||
@@ -73,16 +69,10 @@ RASTER_TEST_BANDPROPS = \
|
||||
$(top_srcdir)/raster/test/regress/rt_neighborhood \
|
||||
$(top_srcdir)/raster/test/regress/rt_nearestvalue \
|
||||
$(top_srcdir)/raster/test/regress/rt_pixelofvalue \
|
||||
- $(top_srcdir)/raster/test/regress/rt_polygon \
|
||||
- $(top_srcdir)/raster/test/regress/rt_setbandpath
|
||||
+ $(top_srcdir)/raster/test/regress/rt_polygon
|
||||
|
||||
RASTER_TEST_UTILITY = \
|
||||
$(top_srcdir)/raster/test/regress/rt_utility \
|
||||
- $(top_srcdir)/raster/test/regress/rt_fromgdalraster \
|
||||
- $(top_srcdir)/raster/test/regress/rt_asgdalraster \
|
||||
- $(top_srcdir)/raster/test/regress/rt_astiff \
|
||||
- $(top_srcdir)/raster/test/regress/rt_asjpeg \
|
||||
- $(top_srcdir)/raster/test/regress/rt_aspng \
|
||||
$(top_srcdir)/raster/test/regress/rt_reclass \
|
||||
$(top_srcdir)/raster/test/regress/rt_gdalwarp \
|
||||
$(top_srcdir)/raster/test/regress/rt_gdalcontour \
|
||||
@@ -120,21 +110,13 @@ RASTER_TEST_SREL = \
|
||||
|
||||
RASTER_TEST_BUGS = \
|
||||
$(top_srcdir)/raster/test/regress/bug_test_car5 \
|
||||
- $(top_srcdir)/raster/test/regress/permitted_gdal_drivers \
|
||||
$(top_srcdir)/raster/test/regress/tickets
|
||||
|
||||
RASTER_TEST_LOADER = \
|
||||
$(top_srcdir)/raster/test/regress/loader/Basic \
|
||||
$(top_srcdir)/raster/test/regress/loader/Projected \
|
||||
$(top_srcdir)/raster/test/regress/loader/BasicCopy \
|
||||
- $(top_srcdir)/raster/test/regress/loader/BasicFilename \
|
||||
- $(top_srcdir)/raster/test/regress/loader/BasicOutDB \
|
||||
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10 \
|
||||
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10Copy \
|
||||
- $(top_srcdir)/raster/test/regress/loader/Tiled8x8 \
|
||||
- $(top_srcdir)/raster/test/regress/loader/TiledAuto \
|
||||
- $(top_srcdir)/raster/test/regress/loader/TiledAutoSkipNoData \
|
||||
- $(top_srcdir)/raster/test/regress/loader/TiledAutoCopyn
|
||||
+ $(top_srcdir)/raster/test/regress/loader/BasicFilename
|
||||
|
||||
RASTER_TESTS := $(RASTER_TEST_FIRST) \
|
||||
$(RASTER_TEST_METADATA) $(RASTER_TEST_IO) $(RASTER_TEST_BASIC_FUNC) \
|
||||
diff --git a/regress/core/binary.sql b/regress/core/binary.sql
|
||||
index 7a36b65..ad78fc7 100644
|
||||
--- a/regress/core/binary.sql
|
||||
+++ b/regress/core/binary.sql
|
||||
@@ -1,4 +1,5 @@
|
||||
SET client_min_messages TO warning;
|
||||
+
|
||||
CREATE SCHEMA tm;
|
||||
|
||||
CREATE TABLE tm.geoms (id serial, g geometry);
|
||||
@@ -31,24 +32,39 @@ SELECT st_force4d(g) FROM tm.geoms WHERE id < 15 ORDER BY id;
|
||||
INSERT INTO tm.geoms(g)
|
||||
SELECT st_setsrid(g,4326) FROM tm.geoms ORDER BY id;
|
||||
|
||||
-COPY tm.geoms TO :tmpfile WITH BINARY;
|
||||
+-- define temp file path
|
||||
+\set tmpfile '/tmp/postgis_binary_test.dat'
|
||||
+
|
||||
+-- export
|
||||
+\set command '\\copy tm.geoms TO ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+-- import
|
||||
CREATE TABLE tm.geoms_in AS SELECT * FROM tm.geoms LIMIT 0;
|
||||
-COPY tm.geoms_in FROM :tmpfile WITH BINARY;
|
||||
-SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o WHERE i.id = o.id
|
||||
- AND ST_OrderingEquals(i.g, o.g);
|
||||
+\set command '\\copy tm.geoms_in FROM ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o
|
||||
+WHERE i.id = o.id AND ST_OrderingEquals(i.g, o.g);
|
||||
|
||||
CREATE TABLE tm.geogs AS SELECT id,g::geography FROM tm.geoms
|
||||
WHERE geometrytype(g) NOT LIKE '%CURVE%'
|
||||
AND geometrytype(g) NOT LIKE '%CIRCULAR%'
|
||||
AND geometrytype(g) NOT LIKE '%SURFACE%'
|
||||
AND geometrytype(g) NOT LIKE 'TRIANGLE%'
|
||||
- AND geometrytype(g) NOT LIKE 'TIN%'
|
||||
-;
|
||||
+ AND geometrytype(g) NOT LIKE 'TIN%';
|
||||
|
||||
-COPY tm.geogs TO :tmpfile WITH BINARY;
|
||||
+-- export
|
||||
+\set command '\\copy tm.geogs TO ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+-- import
|
||||
CREATE TABLE tm.geogs_in AS SELECT * FROM tm.geogs LIMIT 0;
|
||||
-COPY tm.geogs_in FROM :tmpfile WITH BINARY;
|
||||
-SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o WHERE i.id = o.id
|
||||
- AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
|
||||
+\set command '\\copy tm.geogs_in FROM ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o
|
||||
+WHERE i.id = o.id AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
|
||||
|
||||
DROP SCHEMA tm CASCADE;
|
||||
+
|
||||
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
|
||||
index 9e05244..a63a3e1 100644
|
||||
--- a/regress/core/tests.mk
|
||||
+++ b/regress/core/tests.mk
|
||||
@@ -16,14 +16,13 @@ POSTGIS_PGSQL_VERSION=170
|
||||
POSTGIS_GEOS_VERSION=31101
|
||||
HAVE_JSON=yes
|
||||
HAVE_SPGIST=yes
|
||||
-INTERRUPTTESTS=yes
|
||||
+INTERRUPTTESTS=no
|
||||
|
||||
current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
|
||||
RUNTESTFLAGS_INTERNAL += \
|
||||
--before-upgrade-script $(top_srcdir)/regress/hooks/hook-before-upgrade.sql \
|
||||
--after-upgrade-script $(top_srcdir)/regress/hooks/hook-after-upgrade.sql \
|
||||
- --after-create-script $(top_srcdir)/regress/hooks/hook-after-create.sql \
|
||||
--before-uninstall-script $(top_srcdir)/regress/hooks/hook-before-uninstall.sql
|
||||
|
||||
TESTS += \
|
||||
@@ -40,7 +39,6 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/dumppoints \
|
||||
$(top_srcdir)/regress/core/dumpsegments \
|
||||
$(top_srcdir)/regress/core/empty \
|
||||
- $(top_srcdir)/regress/core/estimatedextent \
|
||||
$(top_srcdir)/regress/core/forcecurve \
|
||||
$(top_srcdir)/regress/core/flatgeobuf \
|
||||
$(top_srcdir)/regress/core/frechet \
|
||||
@@ -60,7 +58,6 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/out_marc21 \
|
||||
$(top_srcdir)/regress/core/in_encodedpolyline \
|
||||
$(top_srcdir)/regress/core/iscollection \
|
||||
- $(top_srcdir)/regress/core/legacy \
|
||||
$(top_srcdir)/regress/core/letters \
|
||||
$(top_srcdir)/regress/core/lwgeom_regress \
|
||||
$(top_srcdir)/regress/core/measures \
|
||||
@@ -119,7 +116,6 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/temporal_knn \
|
||||
$(top_srcdir)/regress/core/tickets \
|
||||
$(top_srcdir)/regress/core/twkb \
|
||||
- $(top_srcdir)/regress/core/typmod \
|
||||
$(top_srcdir)/regress/core/wkb \
|
||||
$(top_srcdir)/regress/core/wkt \
|
||||
$(top_srcdir)/regress/core/wmsservers \
|
||||
@@ -143,8 +139,7 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/oriented_envelope \
|
||||
$(top_srcdir)/regress/core/point_coordinates \
|
||||
$(top_srcdir)/regress/core/out_geojson \
|
||||
- $(top_srcdir)/regress/core/wrapx \
|
||||
- $(top_srcdir)/regress/core/computed_columns
|
||||
+ $(top_srcdir)/regress/core/wrapx
|
||||
|
||||
# Slow slow tests
|
||||
TESTS_SLOW = \
|
||||
diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk
|
||||
index ac4f8ad..4bad4fc 100644
|
||||
--- a/regress/loader/tests.mk
|
||||
+++ b/regress/loader/tests.mk
|
||||
@@ -38,7 +38,5 @@ TESTS += \
|
||||
$(top_srcdir)/regress/loader/Latin1 \
|
||||
$(top_srcdir)/regress/loader/Latin1-implicit \
|
||||
$(top_srcdir)/regress/loader/mfile \
|
||||
- $(top_srcdir)/regress/loader/TestSkipANALYZE \
|
||||
- $(top_srcdir)/regress/loader/TestANALYZE \
|
||||
$(top_srcdir)/regress/loader/CharNoWidth \
|
||||
|
||||
diff --git a/regress/run_test.pl b/regress/run_test.pl
|
||||
index cac4b2e..4c7c82b 100755
|
||||
--- a/regress/run_test.pl
|
||||
+++ b/regress/run_test.pl
|
||||
@@ -238,7 +238,6 @@ $ENV{"LANG"} = "C";
|
||||
# Add locale info to the psql options
|
||||
# Add pg12 precision suppression
|
||||
my $PGOPTIONS = $ENV{"PGOPTIONS"};
|
||||
-$PGOPTIONS .= " -c lc_messages=C";
|
||||
$PGOPTIONS .= " -c client_min_messages=NOTICE";
|
||||
$PGOPTIONS .= " -c extra_float_digits=0";
|
||||
$ENV{"PGOPTIONS"} = $PGOPTIONS;
|
||||
diff --git a/topology/test/tests.mk b/topology/test/tests.mk
|
||||
index cbe2633..2c7c18f 100644
|
||||
--- a/topology/test/tests.mk
|
||||
+++ b/topology/test/tests.mk
|
||||
@@ -46,9 +46,7 @@ TESTS += \
|
||||
$(top_srcdir)/topology/test/regress/legacy_query.sql \
|
||||
$(top_srcdir)/topology/test/regress/legacy_validate.sql \
|
||||
$(top_srcdir)/topology/test/regress/polygonize.sql \
|
||||
- $(top_srcdir)/topology/test/regress/populate_topology_layer.sql \
|
||||
$(top_srcdir)/topology/test/regress/removeunusedprimitives.sql \
|
||||
- $(top_srcdir)/topology/test/regress/renametopogeometrycolumn.sql \
|
||||
$(top_srcdir)/topology/test/regress/renametopology.sql \
|
||||
$(top_srcdir)/topology/test/regress/share_sequences.sql \
|
||||
$(top_srcdir)/topology/test/regress/sqlmm.sql \
|
||||
46
docker-compose/ext-src/postgis-src/raster_outdb_template.sql
Normal file
17
docker-compose/ext-src/postgis-src/regular-test.sh
Executable file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
set -ex
|
||||
cd "$(dirname "${0}")"
|
||||
dropdb --if-exist contrib_regression
|
||||
createdb contrib_regression
|
||||
psql -d contrib_regression -c "ALTER DATABASE contrib_regression SET TimeZone='UTC'" \
|
||||
-c "ALTER DATABASE contrib_regression SET DateStyle='ISO, MDY'" \
|
||||
-c "CREATE EXTENSION postgis SCHEMA public" \
|
||||
-c "CREATE EXTENSION postgis_topology" \
|
||||
-c "CREATE EXTENSION postgis_tiger_geocoder CASCADE" \
|
||||
-c "CREATE EXTENSION postgis_raster SCHEMA public" \
|
||||
-c "CREATE EXTENSION postgis_sfcgal SCHEMA public"
|
||||
patch -p1 <postgis-no-upgrade-test.patch
|
||||
patch -p1 <"postgis-regular-${PG_VERSION}.patch"
|
||||
psql -d contrib_regression -f raster_outdb_template.sql
|
||||
trap 'patch -R -p1 <postgis-no-upgrade-test.patch && patch -R -p1 <"postgis-regular-${PG_VERSION}.patch"' EXIT
|
||||
POSTGIS_REGRESS_DB=contrib_regression RUNTESTFLAGS=--nocreate make installcheck-base
|
||||
@@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress
|
||||
installcheck:
|
||||
dropdb --if-exists contrib_regression
|
||||
createdb contrib_regression
|
||||
../alter_db.sh
|
||||
psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_bge_small_en_v15"
|
||||
$(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS)
|
||||
@@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress
|
||||
installcheck:
|
||||
dropdb --if-exists contrib_regression
|
||||
createdb contrib_regression
|
||||
../alter_db.sh
|
||||
psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_jina_reranker_v1_tiny_en"
|
||||
$(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS)
|
||||
|
||||
@@ -3,5 +3,6 @@ set -ex
|
||||
cd "$(dirname ${0})"
|
||||
dropdb --if-exist contrib_regression
|
||||
createdb contrib_regression
|
||||
. ../alter_db.sh
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --dbname=contrib_regression rum rum_hash ruminv timestamp orderby orderby_hash altorder altorder_hash limits int2 int4 int8 float4 float8 money oid time timetz date interval macaddr inet cidr text varchar char bytea bit varbit numeric rum_weight expr array
|
||||
@@ -5,3 +5,4 @@ listen_http_addr='0.0.0.0:9898'
|
||||
remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
|
||||
control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address
|
||||
control_plane_emergency_mode=true
|
||||
virtual_file_io_mode="buffered" # the CI runners where we run the docker compose tests have slow disks
|
||||
|
||||
@@ -63,5 +63,9 @@ done
|
||||
for d in ${FAILED}; do
|
||||
cat "$(find $d -name regression.diffs)"
|
||||
done
|
||||
for postgis_diff in /tmp/pgis_reg/*_diff; do
|
||||
echo "${postgis_diff}:"
|
||||
cat "${postgis_diff}"
|
||||
done
|
||||
echo "${FAILED}"
|
||||
exit 1
|
||||
|
||||
@@ -82,7 +82,8 @@ EXTENSIONS='[
|
||||
{"extname": "pg_ivm", "extdir": "pg_ivm-src"},
|
||||
{"extname": "pgjwt", "extdir": "pgjwt-src"},
|
||||
{"extname": "pgtap", "extdir": "pgtap-src"},
|
||||
{"extname": "pg_repack", "extdir": "pg_repack-src"}
|
||||
{"extname": "pg_repack", "extdir": "pg_repack-src"},
|
||||
{"extname": "h3", "extdir": "h3-pg-src"}
|
||||
]'
|
||||
EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
|
||||
COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
|
||||
|
||||
@@ -7,6 +7,8 @@ Author: Christian Schwarz
|
||||
|
||||
A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver.
|
||||
|
||||
**EDIT**: the implementation of this feature is described in [Vlad's (internal) tech talk](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link).
|
||||
|
||||
# Motivation
|
||||
|
||||
During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space.
|
||||
|
||||
194
docs/rfcs/043-bottom-most-gc-compaction.md
Normal file
@@ -0,0 +1,194 @@
|
||||
# Bottommost Garbage-Collection Compaction
|
||||
|
||||
## Summary
|
||||
|
||||
The goal of this doc is to propose a way to reliably collect garbages below the GC horizon. This process is called bottom-most garbage-collect-compaction, and is part of the broader legacy-enhanced compaction that we plan to implement in the future.
|
||||
|
||||
## Motivation
|
||||
|
||||
The current GC algorithm will wait until the covering via image layers before collecting the garbages of a key region. Relying on image layer generation to generate covering images is not reliable. There are prior arts to generate feedbacks from the GC algorithm to the image generation process to accelerate garbage collection, but it slows down the system and creates write amplification.
|
||||
|
||||
# Basic Idea
|
||||
|
||||

|
||||
|
||||
The idea of bottom-most compaction is simple: we rewrite all layers that are below or intersect with the GC horizon to produce a flat level of image layers at the GC horizon and deltas above the GC horizon. In this process,
|
||||
|
||||
- All images and deltas ≤ GC horizon LSN will be dropped. This process collects garbages.
|
||||
- We produce images for all keys involved in the compaction process at the GC horizon.
|
||||
|
||||
Therefore, it can precisely collect all garbages below the horizon, and reduce the space amplification, i.e., in the staircase pattern (test_gc_feedback).
|
||||
|
||||

|
||||
|
||||
The staircase pattern in test_gc_feedback in the original compaction algorithm. The goal is to collect garbage below the red horizontal line.
|
||||
|
||||
# Branches
|
||||
|
||||
With branches, the bottom-most compaction should retain a snapshot of the keyspace at the `retain_lsn` so that the child branch can access data at the branch point. This requires some modifications to the basic bottom-most compaction algorithm that we sketched above.
|
||||
|
||||

|
||||
|
||||
## Single Timeline w/ Snapshots: handle `retain_lsn`
|
||||
|
||||
First let’s look into the case where we create branches over the main branch but don’t write any data to them (aka “snapshots”).
|
||||
|
||||
The bottom-most compaction algorithm collects all deltas and images of a key and can make decisions on what data to retain. Given that we have a single key’s history as below:
|
||||
|
||||
```
|
||||
LSN 0x10 -> A
|
||||
LSN 0x20 -> append B
|
||||
retain_lsn: 0x20
|
||||
LSN 0x30 -> append C
|
||||
LSN 0x40 -> append D
|
||||
retain_lsn: 0x40
|
||||
LSN 0x50 -> append E
|
||||
GC horizon: 0x50
|
||||
LSN 0x60 -> append F
|
||||
```
|
||||
|
||||
The algorithm will produce:
|
||||
|
||||
```
|
||||
LSN 0x20 -> AB
|
||||
(drop all history below the earliest retain_lsn)
|
||||
LSN 0x40 -> ABCD
|
||||
(assume the cost of replaying 2 deltas is higher than storing the full image, we generate an image here)
|
||||
LSN 0x50 -> append E
|
||||
(replay one delta is cheap)
|
||||
LSN 0x60 -> append F
|
||||
(keep everything as-is above the GC horizon)
|
||||
```
|
||||
|
||||

|
||||
|
||||
What happens is that we balance the space taken by each retain_lsn and the cost of replaying deltas during the bottom-most compaction process. This is controlled by a threshold. If `count(deltas) < $threshold`, the deltas will be retained. Otherwise, an image will be generated and the deltas will be dropped.
|
||||
|
||||
In the example above, the `$threshold` is 2.
|
||||
|
||||
## Child Branches with data: pull + partial images
|
||||
|
||||
In the previous section we have shown how bottom-most compaction respects `retain_lsn` so that all data that was readable at branch creation remains readable. But branches can have data on their own, and that data can fall out of the branch’s PITR window. So, this section explains how we deal with that.
|
||||
|
||||
We will run the same bottom-most compaction for these branches, to ensure the space amplification on the child branch is reasonable.
|
||||
|
||||
```
|
||||
branch_lsn: 0x20
|
||||
LSN 0x30 -> append P
|
||||
LSN 0x40 -> append Q
|
||||
LSN 0x50 -> append R
|
||||
GC horizon: 0x50
|
||||
LSN 0x60 -> append S
|
||||
```
|
||||
|
||||
Note that bottom-most compaction happens on a per-timeline basis. When it processes this key, it only reads the history from LSN 0x30 without a base image. Therefore, on child branches, the bottom-most compaction process will make image creation decisions based on the same `count(deltas) < $threshold` criteria, and if it decides to create an image, the base image will be retrieved from the ancestor branch.
|
||||
|
||||
```
|
||||
branch_lsn: 0x20
|
||||
LSN 0x50 -> ABPQR
|
||||
(we pull the image at LSN 0x20 from the ancestor branch to get AB, and then apply append PQ to the page; we replace the record at 0x40 with an image and drop the delta)
|
||||
GC horizon: 0x50
|
||||
LSN 0x60 -> append S
|
||||
```
|
||||
|
||||

|
||||
|
||||
Note that for child branches, we do not create image layers for the images when bottom-most compaction runs. Instead, we drop the 0x30/0x40/0x50 delta records and directly place the image ABPQR@0x50 into the delta layer, which serves as a sparse image layer. For child branches, if we create image layers, we will need to put all keys in the range into the image layer. This causes space bloat and slow compactions. In this proposal, the compaction process will only compact and process keys modified inside the child branch.
|
||||
|
||||
# Result
|
||||
|
||||
Bottom-most compaction ensures all garbage under the GC horizon gets collected right away (compared with “eventually” in the current algorithm). Meanwhile, it generates images at each of the retain_lsn to ensure branch reads are fast. As we make per-key decisions on whether to generate an image or not, the theoretical lower bound of the storage space we need to retain for a branch is lower than before.
|
||||
|
||||
Before: min(sum(logs for each key), sum(image for each key)), for each partition — we always generate image layers on a key range
|
||||
|
||||
After: sum(min(logs for each key, image for each key))
|
||||
|
||||
# Compaction Trigger
|
||||
|
||||
The bottom-most compaction can be automatically triggered. The goal of the trigger is that it should ensure a constant factor for write amplification. Say that the user write 1GB of WAL into the system, we should write 1GB x C data to S3. The legacy compaction algorithm does not have such a constant factor C. The data we write to S3 is quadratic to the logical size of the database (see [A Theoretical View of Neon Storage](https://www.notion.so/A-Theoretical-View-of-Neon-Storage-8d7ad7555b0c41b2a3597fa780911194?pvs=21)).
|
||||
|
||||
We propose the following compaction trigger that generates a constant write amplification factor. Write amplification >= total writes to S3 / total user writes. We only analyze the write amplification caused by the bottom-most GC-compaction process, ignoring the legacy create image layers amplification.
|
||||
|
||||
Given that we have ***X*** bytes of the delta layers above the GC horizon, ***A*** bytes of the delta layers intersecting with the GC horizon, ***B*** bytes of the delta layers below the GC horizon, and ***C*** bytes of the image layers below the GC horizon.
|
||||
|
||||
The legacy GC + compaction loop will always keep ***A*** unchanged, reduce ***B and C*** when there are image layers covering the key range. This yields 0 write amplification (only file deletions) and extra ***B*** bytes of space.
|
||||
|
||||

|
||||
|
||||
The bottom-most compaction proposed here will split ***A*** into deltas above the GC horizon and below the GC horizon. Everything below the GC horizon will be image layers after the compaction (not considering branches). Therefore, this yields ***A+C*** extra write traffic each iteration, plus 0 extra space.
|
||||
|
||||

|
||||
|
||||
Also considering read amplification (below the GC horizon). When a read request reaches the GC horizon, the read amplification will be (A+B+C)/C=1+(A+B)/C. Reducing ***A*** and ***B*** can help reduce the read amplification below the GC horizon.
|
||||
|
||||
The metrics-based trigger will wait until a point that space amplification is not that large and write amplification is not that large before the compaction gets triggered. The trigger is defined as **(A+B)/C ≥ 1 (or some other ratio)**.
|
||||
|
||||
To reason about this trigger, consider the two cases:
|
||||
|
||||
**Data Ingestion**
|
||||
|
||||
User keeps ingesting data into the database, which indicates that WAL size roughly equals to the database logical size. The compaction gets triggered only when the newly-written WAL roughly equals to the current bottom-most image size (=X). Therefore, it’s triggered when the database size gets doubled. This is a reasonable amount of work. Write amplification is 2X/X=1 for the X amount of data written.
|
||||
|
||||

|
||||
|
||||
**Updates/Deletion**
|
||||
|
||||
In this case, WAL size will be larger than the database logical size ***D***. The compaction gets triggered for every ***D*** bytes of WAL written. Therefore, for every ***D*** bytes of WAL, we rewrite the bottom-most layer, which produces an extra ***D*** bytes of write amplification. This incurs exactly 2x write amplification (by the write of D), 1.5x write amplification (if we count from the start of the process) and no space amplification.
|
||||
|
||||

|
||||
|
||||
Note that here I try to reason that write amplification is a constant (i.e., the data we write to S3 is proportional to the data the user write). The main problem with the current legacy compaction algorithm is that write amplification is proportional to the database size.
|
||||
|
||||
The next step is to optimize the write amplification above the GC horizon (i.e., change the image creation criteria, top-most compaction, or introduce tiered compaction), to ensure the write amplification of the whole system is a constant factor.
|
||||
|
||||
20GB layers → +20GB layers → delete 20GB, need 40GB temporary space
|
||||
|
||||
# Sub-Compactions
|
||||
|
||||
The gc-compaction algorithm may take a long time and we need to split the job into multiple sub-compaction jobs.
|
||||
|
||||

|
||||
|
||||
As in the figure, the auto-trigger schedules a compaction job covering the full keyspace below a specific LSN. In such case that we cannot finish compacting it in one run in a reasonable amount of time, the algorithm will vertically split it into multiple jobs (in this case, 5).
|
||||
|
||||
Each gc-compaction job will create one level of delta layers and one flat level of image layers for each LSN. Those layers will be automatically split based on size, which means that if the sub-compaction job produces 1GB of deltas, it will produce 4 * 256MB delta layers. For those layers that is not fully contained within the sub-compaction job rectangles, it will be rewritten to only contain the keys outside of the key range.
|
||||
|
||||
# Implementation
|
||||
|
||||
The main implementation of gc-compaction is in `compaction.rs`.
|
||||
|
||||
* `compact_with_gc`: The main loop of gc-compaction. It takes a rectangle range of the layer map and compact that specific range. It selects layers intersecting with the rectangle, downloads the layers, creates the k-merge iterator to read those layers in the key-lsn order, and decide which keys to keep or insert a reconstructed page. The process is the basic unit of a gc-compaction and is not interruptable. If the process gets preempted by L0 compaction, it has to be restarted from scratch. For layers overlaps with the rectangle but not fully inside, the main loop will also rewrite them so that the new layer (or two layers if both left and right ends are outside of the rectangle) has the same LSN range as the original one but only contain the keys outside of the compaction range.
|
||||
* `gc_compaction_split_jobs`: Splits a big gc-compaction job into sub-compactions based on heuristics in the layer map. The function looks at the layer map and splits the compaction job based on the size of the layers so that each compaction job only pulls ~4GB of layer files.
|
||||
* `generate_key_retention` and `KeyHistoryRetention`: Implements the algorithm described in the "basic idea" and "branch" chapter of this RFC. It takes a vector of history of a key (key-lsn-value) and decides which LSNs of the key to retain. If there are too many deltas between two retain_lsns, it will reconstruct the page and insert an image into the compaction result. Also, we implement `KeyHistoryRetention::verify` to ensure the generated result is not corrupted -- all retain_lsns and all LSNs above the gc-horizon should be accessible.
|
||||
* `GcCompactionQueue`: the automatic trigger implementation for gc-compaction. `GcCompactionQueue::iteration` is called at the end of the tenant compaction loop. It will then call `trigger_auto_compaction` to decide whether to trigger a gc-compaction job for this tenant. If yes, the compaction-job will be added to the compaction queue, and the queue will be slowly drained once there are no other compaction jobs running. gc-compaction has the lowest priority. If a sub-compaction job is not successful or gets preempted by L0 compaction (see limitations for reasons why a compaction job would fail), it will _not_ be retried.
|
||||
* Changes to `index_part.json`: we added a `last_completed_lsn` field to the index part for the auto-trigger to decide when to trigger a compaction.
|
||||
* Changes to the read path: when gc-compaction updates the layer map, all reads need to wait. See `gc_compaction_layer_update_lock` and comments in the code path for more information.
|
||||
|
||||
Gc-compaction can also be scheduled over the HTTP API. Example:
|
||||
|
||||
```
|
||||
curl 'localhost:9898/v1/tenant/:tenant_id/timeline/:timeline_id/compact?enhanced_gc_bottom_most_compaction=true&dry_run=true' -X PUT -H "Content-Type: application/json" -d '{"scheduled": true, "compact_key_range": { "start": "000000067F0000A0000002A1CF0100000000", "end": "000000067F0000A0000002A1D70100000000" } }'
|
||||
```
|
||||
|
||||
The `dry_run` mode can be specified in the query string so that the compaction will go through all layers to estimate how much space can be saved without writing the compaction result into the layer map.
|
||||
|
||||
The auto-trigger is controlled by tenant-level flag `gc_compaction_enabled`. If this is set to false, no gc-compaction will be automatically scheduled on this tenant (but manual trigger still works).
|
||||
|
||||
# Next Steps
|
||||
|
||||
There are still some limitations of gc-compaction itself that needs to be resolved and tested,
|
||||
|
||||
- gc-compaction is currently only automatically triggered on root branches. We have not tested gc-compaction on child branches in staging.
|
||||
- gc-compaction will skip aux key regions because of the possible conflict with the assumption of aux file tombstones.
|
||||
- gc-compaction does not consider keyspaces at retain_lsns and only look at keys in the layers. This also causes us giving up some sub-compaction jobs because a key might have part of its history available due to traditional GC removing part of the history.
|
||||
- We limit gc-compaction to run over shards <= 150GB to avoid gc-compaction taking too much time blocking other compaction jobs. The sub-compaction split algorithm needs to be improved to be able to split vertically and horizontally. Also, we need to move the download layer process out of the compaction loop so that we don't block other compaction jobs for too long.
|
||||
- The compaction trigger always schedules gc-compaction from the lowest LSN to the gc-horizon. Currently we do not schedule compaction jobs that only selects layers in the middle. Allowing this could potentially reduce the number of layers read/write throughout the process.
|
||||
- gc-compaction will give up if there are too many layers to rewrite or if there are not enough disk space for the compaction.
|
||||
- gc-compaction sometimes fails with "no key produced during compaction", which means that all existing keys within the compaction range can be collected; but we don't have a way to write this information back to the layer map -- we cannot generate an empty image layer.
|
||||
- We limit the maximum size of deltas for a single key to 512MB. If above this size, gc-compaction will give up. This can be resolved by changing `generate_key_retention` to be a stream instead of requiring to collect all the key history.
|
||||
|
||||
In the future,
|
||||
|
||||
- Top-most compaction: ensure we always have an image coverage for the latest data (or near the latest data), so that reads will be fast at the latest LSN.
|
||||
- Tiered compaction on deltas: ensure read from any LSN is fast.
|
||||
- Per-timeline compaction → tenant-wide compaction?
|
||||
362
docs/rfcs/2025-04-30-direct-io-for-pageserver.md
Normal file
@@ -0,0 +1,362 @@
|
||||
# Direct IO For Pageserver
|
||||
|
||||
Date: Apr 30, 2025
|
||||
|
||||
## Summary
|
||||
|
||||
This document is a retroactive RFC. It
|
||||
- provides some background on what direct IO is,
|
||||
- motivates why Pageserver should be using it for its IO, and
|
||||
- describes how we changed Pageserver to use it.
|
||||
|
||||
The [initial proposal](https://github.com/neondatabase/neon/pull/8240) that kicked off the work can be found in this closed GitHub PR.
|
||||
|
||||
People primarily involved in this project were:
|
||||
- Yuchen Liang <yuchen@neon.tech>
|
||||
- Vlad Lazar <vlad@neon.tech>
|
||||
- Christian Schwarz <christian@neon.tech>
|
||||
|
||||
## Timeline
|
||||
|
||||
For posterity, here is the rough timeline of the development work that got us to where we are today.
|
||||
|
||||
- Jan 2024: [integrate `tokio-epoll-uring`](https://github.com/neondatabase/neon/pull/5824) along with owned buffers API
|
||||
- March 2024: `tokio-epoll-uring` enabled in all regions in buffered IO mode
|
||||
- Feb 2024 to June 2024: PS PageCache Bypass For Data Blocks
|
||||
- Feb 2024: [Vectored Get Implementation](https://github.com/neondatabase/neon/pull/6576) bypasses delta & image layer blocks for page requests
|
||||
- Apr to June 2024: [Epic: bypass PageCache for use data blocks](https://github.com/neondatabase/neon/issues/7386) addresses remaining users
|
||||
- Aug to Nov 2024: direct IO: first code; preliminaries; read path coding; BufferedWriter; benchmarks show perf regressions too high, no-go.
|
||||
- Nov 2024 to Jan 2025: address perf regressions by developing page_service pipelining (aka batching) and concurrent IO ([Epic](https://github.com/neondatabase/neon/issues/9376))
|
||||
- Feb to March 2024: rollout batching, then concurrent+direct IO => read path and InMemoryLayer is now direct IO
|
||||
- Apr 2025: develop & roll out direct IO for the write path
|
||||
|
||||
## Background: Terminology & Glossary
|
||||
|
||||
**kernel page cache**: the Linux kernel's page cache is a write-back cache for filesystem contents.
|
||||
The cached unit is memory-page-sized & aligned chunks of the files that are being cached (typically 4k).
|
||||
The cache lives in kernel memory and is not directly accessible through userspace.
|
||||
|
||||
**Buffered IO**: an application's read/write system calls go through the kernel page cache.
|
||||
For example, a 10 byte sized read or write to offset 5000 in a file will load the file contents
|
||||
at offset `[4096,8192)` into a free page in the kernel page cache. If necessary, it will evict
|
||||
a page to make room (cf eviction). Then, the kernel performs a memory-to-memory copy of 10 bytes
|
||||
from/to the offset `4` (`5000 = 4096 + 4`) within the cached page. If it's a write, the kernel keeps
|
||||
track of the fact that the page is now "dirty" in some ancillary structure.
|
||||
|
||||
**Writeback**: a buffered read/write syscall returns after the memory-to-memory copy. The modifications
|
||||
made by e.g. write system calls are not even *issued* to disk, let alone durable. Instead, the kernel
|
||||
asynchronously writes back dirtied pages based on a variety of conditions. For us, the most relevant
|
||||
ones are a) explicit request by userspace (`fsync`) and b) memory pressure.
|
||||
|
||||
**Memory pressure**: the kernel page cache is a best effort service and a user of spare memory capacity.
|
||||
If there is no free memory, the kernel page allocator will take pages used by page cache to satisfy allocations.
|
||||
Before reusing a page like that, the page has to be written back (writeback, see above).
|
||||
The far-reaching consequence of this is that **any allocation of anonymous memory can do IO** if the only
|
||||
way to get that memory is by eviction & re-using a dirty page cache page.
|
||||
Notably, this includes a simple `malloc` in userspace, because eventually that boils down to `mmap(..., MAP_ANON, ...)`.
|
||||
I refer to this effect as the "malloc latency backscatter" caused by buffered IO.
|
||||
|
||||
**Direct IO** allows application's read/write system calls to bypass the kernel page cache. The filesystem
|
||||
is still involved because it is ultimately in charge of mapping the concept of files & offsets within them
|
||||
to sectors on block devices. Typically, the filesystem poses size and alignment requirements for memory buffers
|
||||
and file offsets (statx `Dio_mem_align` / `Dio_offset_align`), see [this gist](https://gist.github.com/problame/1c35cac41b7cd617779f8aae50f97155).
|
||||
The IO operations will fail at runtime with EINVAL if the alignment requirements are not met.
|
||||
|
||||
**"buffered" vs "direct"**: the central distinction between buffered and direct IO is about who allocates and
|
||||
fills the IO buffers, and who controls when exactly the IOs are issued. In buffered IO, it's the syscall handlers,
|
||||
kernel page cache, and memory management subsystems (cf "writeback"). In direct IO, all of it is done by
|
||||
the application.
|
||||
It takes more effort by the application to program with direct instead of buffered IO.
|
||||
The return is precise control over and a clear distinction between consumption/modification of memory vs disk.
|
||||
|
||||
**Pageserver PageCache**: Pageserver has an additional `PageCache` (referred to as PS PageCache from here on, as opposed to "kernel page cache").
|
||||
Its caching unit is 8KiB blocks of the layer files written by Pageserver.
|
||||
A miss in PageCache is filled by reading from the filesystem, through the `VirtualFile` abstraction layer.
|
||||
The default size is tiny (64MiB), very much like Postgres's `shared_buffers`.
|
||||
We ran production at 128MiB for a long time but gradually moved it up to 2GiB over the past ~year.
|
||||
|
||||
**VirtualFile** is Pageserver's abstraction for file IO, very similar to the facility in Postgres that bears the same name.
|
||||
Its historical purpose appears to be working around open file descriptor limitations, which is practically irrelevant on Linux.
|
||||
However, the facility in Pageserver is useful as an intermediary layer for metrics and abstracts over the different kinds of
|
||||
IO engines that Pageserver supports (`std-fs` vs `tokio-epoll-uring`).
|
||||
|
||||
## Background: History Of Caching In Pageserver
|
||||
|
||||
For multiple years, Pageserver's `PageCache` was on the path of all read _and write_ IO.
|
||||
It performed write-back to the kernel using buffered IO.
|
||||
|
||||
We converted it into a read-only cache of immutable data in [PR 4994](https://github.com/neondatabase/neon/pull/4994).
|
||||
|
||||
The introduction of `tokio-epoll-uring` required converting the code base to used owned IO buffers.
|
||||
The `PageCache` pages are usable as owned IO buffers.
|
||||
|
||||
We then started bypassing PageCache for user data blocks.
|
||||
Data blocks are the 8k blocks of data in layer files that hold the multiple `Value`s, as opposed to the disk btree index blocks that tell us which values exist in a file at what offsets.
|
||||
The disk btree embedded in delta & image layers remains `PageCache`'d.
|
||||
Epics for that work were:
|
||||
- Vectored `Timeline::get` (cf RFC 30) skipped delta and image layer data block `PageCache`ing outright.
|
||||
- Epic https://github.com/neondatabase/neon/issues/7386 took care of the remaining users for data blocks:
|
||||
- Materialized page cache (cached materialized pages; shown to be ~0% hit rate in practice)
|
||||
- InMemoryLayer
|
||||
- Compaction
|
||||
|
||||
The outcome of the above:
|
||||
1. All data blocks are always read through the `VirtualFile` APIs, hitting the kernel buffered read path (=> kernel page cache).
|
||||
2. Indirect blocks (=disk btree blocks) would be cached in the PS `PageCache`.
|
||||
|
||||
In production we size the PS `PageCache` to be 2GiB.
|
||||
Thus drives hit rate up to ~99.95% and the eviction rate / replacement rates down to less than 200/second on a 1-minute average, on the busiest machines.
|
||||
High baseline replacement rates are treated as a signal of resource exhaustion (page cache insufficient to host working set of the PS).
|
||||
The response to this is to migrate tenants away, or increase PS `PageCache` size.
|
||||
It is currently manual but could be automated, e.g., in Storage Controller.
|
||||
|
||||
In the future, we may eliminate the `PageCache` even for indirect blocks.
|
||||
For example with an LRU cache that has as unit the entire disk btree content
|
||||
instead of individual blocks.
|
||||
|
||||
## High-Level Design
|
||||
|
||||
So, before work on this project started, all data block reads and the entire write path of Pageserver were using kernel-buffered IO, i.e., the kernel page cache.
|
||||
We now want to get the kernel page cache out of the picture by using direct IO for all interaction with the filesystem.
|
||||
This achieves the following system properties:
|
||||
|
||||
**Predictable VirtualFile latencies**
|
||||
* With buffered IO, reads are sometimes fast, sometimes slow, depending on kernel page cache hit/miss.
|
||||
* With buffered IO, appends when writing out new layer files during ingest or compaction are sometimes fast, sometimes slow because of write-back backpressure.
|
||||
* With buffered IO, the "malloc backscatter" phenomenon pointed out in the Glossary section is not something we actively observe.
|
||||
But we do have occasional spikes in Dirty memory amount and Memory PSI graphs, so it may already be affecting to some degree.
|
||||
* By switching to direct IO, above operations will have the (predictable) device latency -- always.
|
||||
Reads and appends always go to disk.
|
||||
And malloc will not have to write back dirty data.
|
||||
|
||||
**Explicitness & Tangibility of resource usage**
|
||||
* In a multi-tenant system, it is generally desirable and valuable to be *explicit* about the main resources we use for each tenant.
|
||||
* By using direct IO, we become explicit about the resources *disk IOPs* and *memory capacity* in a way that was previously being conflated through the kernel page cache, outside our immediate control.
|
||||
* We will be able to build per-tenant observability of resource usage ("what tenant is causing the actual IOs that are sent to the disk?").
|
||||
* We will be able to build accounting & QoS by implementing an IO scheduler that is tenant aware. The kernel is not tenant-aware and can't do that.
|
||||
|
||||
**CPU Efficiency**
|
||||
* The involvement of the kernel page cache means one additional memory-to-memory copy on read and write path.
|
||||
* Direct IO will eliminate that memory-to-memory copy, if we can make the userspace buffers used for the IO calls satisfy direct IO alignment requirements.
|
||||
|
||||
The **trade-off** is that we no longer get the theoretical benefits of the kernel page cache. These are:
|
||||
- read latency improvements for repeat reads of the same data ("locality of reference")
|
||||
- asterisk: only if that state is still cache-resident by time of next access
|
||||
- write throughput by having kernel page cache batch small VFS writes into bigger disk writes
|
||||
- asterisk: only if memory pressure is low enough that the kernel can afford to delay writeback
|
||||
|
||||
We are **happy to make this trade-off**:
|
||||
- Because of the advantages listed above.
|
||||
- Because we empirically have enough DRAM on Pageservers to serve metadata (=index blocks) from PS PageCache.
|
||||
(At just 2GiB PS PageCache size, we average a 99.95% hit rate).
|
||||
So, the latency of going to disk is only for data block reads, not the index traversal.
|
||||
- Because **the kernel page cache is ineffective** at high tenant density anyway (#tenants/pageserver instance).
|
||||
And because dense packing of tenants will always be desirable to drive COGS down, we should design the system for it.
|
||||
(See the appendix for a more detailed explanation why this is).
|
||||
- So, we accept that some reads that used to be fast by circumstance will have higher but **predictable** latency than before.
|
||||
|
||||
### Desired End State
|
||||
|
||||
The desired end state of the project is as follows, and with some asterisks, we have achieved it.
|
||||
|
||||
All IOs of the Pageserver data path use direct IO, thereby bypassing the kernel page cache.
|
||||
|
||||
In particular, the "data path" includes
|
||||
- the wal ingest path
|
||||
- compaction
|
||||
- anything on the `Timeline::get` / `Timeline::get_vectored` path.
|
||||
|
||||
The production Pageserver config is tuned such that virtually all non-data blocks are cached in the PS PageCache.
|
||||
Hit rate target is 99.95%.
|
||||
|
||||
There are no regressions to ingest latency.
|
||||
|
||||
The total "wait-for-disk time" contribution to random getpage request latency is `O(1 read IOP latency)`.
|
||||
We accomplish that by having a near 100% PS PageCache hit rate so that layer index traversal effectively never needs not wait for IO.
|
||||
Thereby, it can issue all the data blocks as it traverses the index, and only wait at the end of it (concurrent IO).
|
||||
|
||||
The amortized "wait-for-disk time" contribution of this direct IO proposal to a series of sequential getpage requests is `1/32 * read IOP latency` for each getpage request.
|
||||
We accomplish this by server-side batching of up to 32 reads into a single `Timeline::get_vectored` call.
|
||||
(This is an ideal world where our batches are full - that's not the case in prod today because of lack of queue depth).
|
||||
|
||||
## Design & Implementation
|
||||
|
||||
### Prerequisites
|
||||
|
||||
A lot of prerequisite work had to happen to enable use of direct IO.
|
||||
|
||||
To meet the "wait-for-disk time" requirements from the DoD, we implement for the read path:
|
||||
- page_service level server-side batching (config field `page_service_pipelining`)
|
||||
- concurrent IO (config field `get_vectored_concurrent_io`)
|
||||
The work for both of these these was tracked [in the epic](https://github.com/neondatabase/neon/issues/9376).
|
||||
Server-side batching will likely be obsoleted by the [#proj-compute-communicator](https://github.com/neondatabase/neon/pull/10799).
|
||||
The Concurrent IO work is described in retroactive RFC `2025-04-30-pageserver-concurrent-io-on-read-path.md`.
|
||||
The implementation is relatively brittle and needs further investment, see the `Future Work` section in that RFC.
|
||||
|
||||
For the write path, and especially WAL ingest, we need to hide write latency.
|
||||
We accomplish this by implementing a (`BufferedWriter`) type that does double-buffering: flushes of the filled
|
||||
buffer happen in a sidecar tokio task while new writes fill a new buffer.
|
||||
We refactor InMemoryLayer as well as BlobWriter (=> delta and image layer writers) to use this new `BufferedWriter`.
|
||||
The most comprehensive write-up of this work is in [the PR description](https://github.com/neondatabase/neon/pull/11558).
|
||||
|
||||
### Ensuring Adherence to Alignment Requirements
|
||||
|
||||
Direct IO puts requirements on
|
||||
- memory buffer alignment
|
||||
- io size (=memory buffer size)
|
||||
- file offset alignment
|
||||
|
||||
The requirements are specific to a combination of filesystem/block-device/architecture(hardware page size!).
|
||||
|
||||
In Neon production environments we currently use ext4 with Linux 6.1.X on AWS and Azure storage-optimized instances (locally attached NVMe).
|
||||
Instead of dynamic discovery using `statx`, we statically hard-code 512 bytes as the buffer/offset alignment and size-multiple.
|
||||
We made this decision because:
|
||||
- a) it is compatible with all the environments we need to run in
|
||||
- b) our primary workload can be small-random-read-heavy (we do merge adjacent reads if possible, but the worst case is that all `Value`s that needs to be read are far apart)
|
||||
- c) 512-byte tail latency on the production instance types is much better than 4k (p99.9: 3x lower, p99.99 5x lower).
|
||||
- d) hard-coding at compile-time allows us to use the Rust type system to enforce the use of only aligned IO buffers, eliminating a source of runtime errors typically associated with direct IO.
|
||||
|
||||
This was [discussed here](https://neondb.slack.com/archives/C07BZ38E6SD/p1725036790965549?thread_ts=1725026845.455259&cid=C07BZ38E6SD).
|
||||
|
||||
The new `IoBufAligned` / `IoBufAlignedMut` marker traits indicate that a given buffer meets memory alignment requirements.
|
||||
All `VirtualFile` APIs and several software layers built on top of them only accept buffers that implement those traits.
|
||||
Implementors of the marker traits are:
|
||||
- `IoBuffer` / `IoBufferMut`: used for most reads and writes
|
||||
- `PageWriteGuardBuf`: for filling PS PageCache pages (index blocks!)
|
||||
|
||||
The alignment requirement is infectious; it permeates bottom-up throughout the code base.
|
||||
We stop the infection at roughly the same layers in the code base where we stopped permeating the
|
||||
use of owned-buffers-style API for tokio-epoll-uring. The way the stopping works is by introducing
|
||||
a memory-to-memory copy from/to some unaligned memory location on the stack/current/heap.
|
||||
The places where we currently stop permeating are sort of arbitrary. For example, it would probably
|
||||
make sense to replace more usage of `Bytes` that we know holds 8k pages with 8k-sized `IoBuffer`s.
|
||||
|
||||
The `IoBufAligned` / `IoBufAlignedMut` types do not protect us from the following types of runtime errors:
|
||||
- non-adherence to file offset alignment requirements
|
||||
- non-adherence to io size requirements
|
||||
|
||||
The following higher-level constructs ensure we meet the requirements:
|
||||
- read path: the `ChunkedVectoredReadBuilder` and `mod vectored_dio_read` ensure reads happen at aligned offsets and in appropriate size multiples.
|
||||
- write path: `BufferedWriter` only writes in multiples of the capacity, at offsets that are `start_offset+N*capacity`; see its doc comment.
|
||||
|
||||
Note that these types are used always, regardless of whether direct IO is enabled or not.
|
||||
There are some cases where this adds unnecessary overhead to buffered IO (e.g. all memcpy's inflated to multiples of 512).
|
||||
But we could not identify meaningful impact in practice when we shipped these changes while we were still using buffered IO.
|
||||
|
||||
### Configuration / Feature Flagging
|
||||
|
||||
In the previous section we described how all users of VirtualFile were changed to always adhere to direct IO alignment and size-multiple requirements.
|
||||
To actually enable direct IO, all we need to do is set the `O_DIRECT` flag in `open` syscalls / io_uring operations.
|
||||
|
||||
We set `O_DIRECT` based on:
|
||||
- the VirtualFile API used to create/open the VirtualFile instance
|
||||
- the `virtual_file_io_mode` configuration flag
|
||||
- the OpenOptions `read` and/or `write` flags.
|
||||
|
||||
The VirtualFile APIs suffixed with `_v2` are the only ones that _may_ open with `O_DIRECT` depending on the other two factors in above list.
|
||||
Other APIs never use `O_DIRECT`.
|
||||
(The name is bad and should really be `_maybe_direct_io`.)
|
||||
|
||||
The reason for having new APIs is because all code used VirtualFile but implementation and rollout happened in consecutive phases (read path, InMemoryLayer, write path).
|
||||
At the VirtualFile level, context on whether an instance of VirtualFile is on read path, InMemoryLayer, or write path is not available.
|
||||
|
||||
The `_v2` APIs then check make the decision to set `O_DIRECT` based on the `virtual_file_io_mode` flag and the OpenOptions `read`/`write` flags.
|
||||
The result is the following runtime behavior:
|
||||
|
||||
|what|OpenOptions|`v_f_io_mode`<br/>=`buffered`|`v_f_io_mode`<br/>=`direct`|`v_f_io_mode`<br/>=`direct-rw`|
|
||||
|-|-|-|-|-|
|
||||
|`DeltaLayerInner`|read|()|O_DIRECT|O_DIRECT|
|
||||
|`ImageLayerInner`|read|()|O_DIRECT|O_DIRECT|
|
||||
|`InMemoryLayer`|read + write|()|()*|O_DIRECT|
|
||||
|`DeltaLayerWriter`| write | () | () | O_DIRECT |
|
||||
|`ImageLayerWriter`| write | () | () | O_DIRECT |
|
||||
|`download_layer_file`|write |()|()|O_DIRECT|
|
||||
|
||||
The `InMemoryLayer` is marked with `*` because there was a period when it *did* use O_DIRECT under `=direct`.
|
||||
That period was when we implemented and shipped the first version of `BufferedWriter`.
|
||||
We used it in `InMemoryLayer` and `download_layer_file` but it was only sensitive to `v_f_io_mode` in `InMemoryLayer`.
|
||||
The introduction of `=direct-rw`, and the switch of the remaining write path to `BufferedWriter`, happened later,
|
||||
in https://github.com/neondatabase/neon/pull/11558.
|
||||
|
||||
Note that this way of feature flagging inside VirtualFile makes it less and less a general purpose POSIX file access abstraction.
|
||||
For example, with `=direct-rw` enabled, it is no longer possible to open a `VirtualFile` without `O_DIRECT`. It'll always be set.
|
||||
|
||||
## Correctness Validation
|
||||
|
||||
The correctness risks with this project were:
|
||||
- Memory safety issues in the `IoBuffer` / `IoBufferMut` implementation.
|
||||
These types expose an API that is largely identical to that of the `bytes` crate and/or Vec.
|
||||
- Runtime errors (=> downtime / unavailability) because of non-adherence to alignment/size-multiple requirements, resulting in EINVAL on the read path.
|
||||
|
||||
We sadly do not have infrastructure to run pageserver under `cargo miri`.
|
||||
So for memory safety issues, we relied on careful peer review.
|
||||
|
||||
We do assert the production-like alignment requirements in testing builds.
|
||||
However, these asserts were added retroactively.
|
||||
The actual validation before rollout happened in staging and pre-prod.
|
||||
We eventually enabled `=direct`/`=direct-rw` for Rust unit tests and the regression test suite.
|
||||
I cannot recall a single instance of staging/pre-prod/production errors caused by non-adherence to alignment/size-multiple requirements.
|
||||
Evidently developer testing was good enough.
|
||||
|
||||
## Performance Validation
|
||||
|
||||
The read path went through a lot of iterations of benchmarking in staging and pre-prod.
|
||||
The benchmarks in those environments demonstrated performance regressions early in the implementation.
|
||||
It was actually this performance testing that made us implement batching and concurrent IO to avoid unacceptable regressions.
|
||||
|
||||
The write path was much quicker to validate because `bench_ingest` covered all of the (less numerous) access patterns.
|
||||
|
||||
## Future Work
|
||||
|
||||
There is minor and major follow-up work that can be considered in the future.
|
||||
Check the (soon-to-be-closed) Epic https://github.com/neondatabase/neon/issues/8130's "Follow-Ups" section for a current list.
|
||||
|
||||
Read Path:
|
||||
- PS PageCache hit rate is crucial to unlock concurrent IO and reasonable latency for random reads generally.
|
||||
Instead of reactively sizing PS PageCache, we should estimate the required PS PageCache size
|
||||
and potentially also use that to drive placement decisions of shards from StorageController
|
||||
https://github.com/neondatabase/neon/issues/9288
|
||||
- ... unless we get rid of PS PageCache entirely and cache the index block in a more specialized cache.
|
||||
But even then, an estimation of the working set would be helpful to figure out caching strategy.
|
||||
|
||||
Write Path:
|
||||
- BlobWriter and its users could switch back to a borrowed API https://github.com/neondatabase/neon/issues/10129
|
||||
- ... unless we want to implement bypass mode for large writes https://github.com/neondatabase/neon/issues/10101
|
||||
- The `TempVirtualFile` introduced as part of this project could internalize more of the common usage pattern: https://github.com/neondatabase/neon/issues/11692
|
||||
- Reduce conditional compilation around `virtual_file_io_mode`: https://github.com/neondatabase/neon/issues/11676
|
||||
|
||||
Both:
|
||||
- A performance simulation mode that pads VirtualFile op latencies to typical NVMe latencies, even if the underlying storage is faster.
|
||||
This would avoid misleadingly good performance on developer systems and in benchmarks on systems that are less busy than production hosts.
|
||||
However, padding latencies at microsecond scale is non-trivial.
|
||||
|
||||
Misc:
|
||||
- We should finish trimming VirtualFile's scope to be truly limited to core data path read & write.
|
||||
Abstractions for reading & writing pageserver config, location config, heatmaps, etc, should use
|
||||
APIs in a different package (`VirtualFile::crashsafe_overwrite` and `VirtualFile::read_to_string`
|
||||
are good entrypoints for cleanup.) https://github.com/neondatabase/neon/issues/11809
|
||||
|
||||
# Appendix
|
||||
|
||||
## Why Kernel Page Cache Is Ineffective At Tenant High Density
|
||||
|
||||
In the Motivation section, we stated:
|
||||
|
||||
> - **The kernel page cache ineffective** at high tenant density anyways (#tenants/pageserver instance).
|
||||
|
||||
The reason is that the Pageserver workload sent from Computes is whatever is a Compute cache(s) miss.
|
||||
That's either sequential scans or random reads.
|
||||
A random read workload simply causes cache thrashing because a packed Pageserver NVMe drive (`im4gn.2xlarge`) has ~100x more capacity than DRAM available.
|
||||
It is complete waste to have the kernel page cache cache data blocks in this case.
|
||||
Sequential read workloads *can* benefit iff those pages have been updated recently (=no image layer yet) and together in time/LSN space.
|
||||
In such cases, the WAL records of those updates likely sit on the same delta layer block.
|
||||
When Compute does a sequential scan, it sends a series of single-page requests for these individual pages.
|
||||
When Pageserver processes the second request in such a series, it goes to the same delta layer block and have a kernel page cache hit.
|
||||
This dependence on kernel page cache for sequential scan performance is significant, but the solution is at a higher level than generic data block caching.
|
||||
We can either add a small per-connection LRU cache for such delta layer blocks.
|
||||
Or we can merge those sequential requests into a larger vectored get request, which is designed to never read a block twice.
|
||||
This amortizes the read latency for our delta layer block across the vectored get batch size (which currently is up to 32).
|
||||
|
||||
There are Pageserver-internal workloads that do sequential access (compaction, image layer generation), but these
|
||||
1. are not latency-critical and can do batched access outside of the `page_service` protocol constraints (image layer generation)
|
||||
2. don't actually need to reconstruct images and therefore can use totally different access methods (=> compaction can use k-way merge iterators with their own internal buffering / prefetching).
|
||||
251
docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md
Normal file
@@ -0,0 +1,251 @@
|
||||
# Concurrent IO for Pageserver Read Path
|
||||
|
||||
Date: May 6, 2025
|
||||
|
||||
## Summary
|
||||
|
||||
This document is a retroactive RFC on the Pageserver Concurrent IO work that happened in late 2024 / early 2025.
|
||||
|
||||
The gist of it is that Pageserver's `Timeline::get_vectored` now _issues_ the data block read operations against layer files
|
||||
_as it traverses the layer map_ and only _wait_ once, for all of them, after traversal is complete.
|
||||
|
||||
Assuming a good PS PageCache hits on the index blocks during traversal, this drives down the "wait-for-disk" time
|
||||
contribution down from `random_read_io_latency * O(number_of_values)` to `random_read_io_latency * O(1 + traversal)`.
|
||||
|
||||
The motivation for why this work had to happen when it happened was the switch of Pageserver to
|
||||
- not cache user data blocks in PS PageCache and
|
||||
- switch to use direct IO.
|
||||
More context on this are given in complimentary RFC `./rfcs/2025-04-30-direct-io-for-pageserver.md`.
|
||||
|
||||
### Refs
|
||||
|
||||
- Epic: https://github.com/neondatabase/neon/issues/9378
|
||||
- Prototyping happened during the Lisbon 2024 Offsite hackathon: https://github.com/neondatabase/neon/pull/9002
|
||||
- Main implementation PR with good description: https://github.com/neondatabase/neon/issues/9378
|
||||
|
||||
Design and implementation by:
|
||||
- Vlad Lazar <vlad@neon.tech>
|
||||
- Christian Schwarz <christian@neon.tech>
|
||||
|
||||
## Background & Motivation
|
||||
|
||||
The Pageserver read path (`Timeline::get_vectored`) consists of two high-level steps:
|
||||
- Retrieve the delta and image `Value`s required to reconstruct the requested Page@LSN (`Timeline::get_values_reconstruct_data`).
|
||||
- Pass these values to walredo to reconstruct the page images.
|
||||
|
||||
The read path used to be single-key but has been made multi-key some time ago.
|
||||
([Internal tech talk by Vlad](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link))
|
||||
However, for simplicity, most of this doc will explain things in terms of a single key being requested.
|
||||
|
||||
The `Value` retrieval step above can be broken down into the following functions:
|
||||
- **Traversal** of the layer map to figure out which `Value`s from which layer files are required for the page reconstruction.
|
||||
- **Read IO Planning**: planning of the read IOs that need to be issued to the layer files / filesystem / disk.
|
||||
The main job here is to coalesce the small value reads into larger filesystem-level read operations.
|
||||
This layer also takes care of direct IO alignment and size-multiple requirements (cf the RFC for details.)
|
||||
Check `struct VectoredReadPlanner` and `mod vectored_dio_read` for how it's done.
|
||||
- **Perform the read IO** using `tokio-epoll-uring`.
|
||||
|
||||
Before this project, above functions were sequentially interleaved, meaning:
|
||||
1. we would advance traversal, ...
|
||||
2. discover, that we need to read a value, ...
|
||||
3. read it from disk using `tokio-epoll-uring`, ...
|
||||
4. goto 1 unless we're done.
|
||||
|
||||
This meant that if N `Value`s need to be read to reconstruct a page,
|
||||
the time we spend waiting for disk will be we `random_read_io_latency * O(number_of_values)`.
|
||||
|
||||
## Design
|
||||
|
||||
The **traversal** and **read IO Planning** jobs still happen sequentially, layer by layer, as before.
|
||||
But instead of performing the read IOs inline, we submit the IOs to a concurrent tokio task for execution.
|
||||
After the last read from the last layer is submitted, we wait for the IOs to complete.
|
||||
|
||||
Assuming the filesystem / disk is able to actually process the submitted IOs without queuing,
|
||||
we arrive at _time spent waiting for disk_ ~ `random_read_io_latency * O(1 + traversal)`.
|
||||
|
||||
Note this whole RFC is concerned with the steady state where all layer files required for reconstruction are resident on local NVMe.
|
||||
Traversal will stall on on-demand layer download if a layer is not yet resident.
|
||||
It cannot proceed without the layer being resident beccause its next step depends on the contents of the layer index.
|
||||
|
||||
### Avoiding Waiting For IO During Traversal
|
||||
|
||||
The `traversal` component in above time-spent-waiting-for-disk estimation is dominant and needs to be minimized.
|
||||
|
||||
Before this project, traversal needed to perform IOs for the following:
|
||||
1. The time we are waiting on PS PageCache to page in the visited layers' disk btree index blocks.
|
||||
2. When visiting a delta layer, reading the data block that contains a `Value` for a requested key,
|
||||
to determine whether the `Value::will_init` the page and therefore traversal can stop for this key.
|
||||
|
||||
The solution for (1) is to raise the PS PageCache size such that the hit rate is practically 100%.
|
||||
(Check out the `Background: History Of Caching In Pageserver` section in the RFC on Direct IO for more details.)
|
||||
|
||||
The solution for (2) is source `will_init` from the disk btree index keys, which fortunately
|
||||
already encode this bit of information since the introduction of the current storage/layer format.
|
||||
|
||||
### Concurrent IOs, Submission & Completion
|
||||
|
||||
To separate IO submission from waiting for its completion,
|
||||
we introduce the notion of an `IoConcurrency` struct through which IOs are issued.
|
||||
|
||||
An IO is an opaque future that
|
||||
- captures the `tx` side of a `oneshot` channel
|
||||
- performs the read IO by calling `VirtualFile::read_exact_at().await`
|
||||
- sending the result into the `tx`
|
||||
|
||||
Issuing an IO means `Box`ing the future above and handing that `Box` over to the `IoConcurrency` struct.
|
||||
|
||||
The traversal code that submits the IO stores the the corresponding `oneshot::Receiver`
|
||||
in the `VectoredValueReconstructState`, in the the place where we previously stored
|
||||
the sequentially read `img` and `records` fields.
|
||||
|
||||
When we're done with traversal, we wait for all submitted IOs:
|
||||
for each key, there is a future that awaits all the `oneshot::Receiver`s
|
||||
for that key, and then calls into walredo to reconstruct the page image.
|
||||
Walredo is now invoked concurrently for each value instead of sequentially.
|
||||
Walredo itself remains unchanged.
|
||||
|
||||
The spawned IO futures are driven to completion by a sidecar tokio task that
|
||||
is separate from the task that performs all the layer visiting and spawning of IOs.
|
||||
That tasks receives the IO futures via an unbounded mpsc channel and
|
||||
drives them to completion inside a `FuturedUnordered`.
|
||||
|
||||
### Error handling, Panics, Cancellation-Safety
|
||||
|
||||
There are two error classes during reconstruct data retrieval:
|
||||
* traversal errors: index lookup, move to next layer, and the like
|
||||
* value read IO errors
|
||||
|
||||
A traversal error fails the entire `get_vectored` request, as before this PR.
|
||||
A value read error only fails reconstruction of that value.
|
||||
|
||||
Panics and dropping of the `get_vectored` future before it completes
|
||||
leaves the sidecar task running and does not cancel submitted IOs
|
||||
(see next section for details on sidecar task lifecycle).
|
||||
All of this is safe, but, today's preference in the team is to close out
|
||||
all resource usage explicitly if possible, rather than cancelling + forgetting
|
||||
about it on drop. So, there is warning if we drop a
|
||||
`VectoredValueReconstructState`/`ValuesReconstructState` that still has uncompleted IOs.
|
||||
|
||||
### Sidecar Task Lifecycle
|
||||
|
||||
The sidecar tokio task is spawned as part of the `IoConcurrency::spawn_from_conf` struct.
|
||||
The `IoConcurrency` object acts as a handle through which IO futures are submitted.
|
||||
|
||||
The spawned tokio task holds the `Timeline::gate` open.
|
||||
It is _not_ sensitive to `Timeline::cancel`, but instead to the `IoConcurrency` object being dropped.
|
||||
|
||||
Once the `IoConcurrency` struct is dropped, no new IO futures can come in
|
||||
but already submitted IO futures will be driven to completion regardless.
|
||||
We _could_ safely stop polling these futures because `tokio-epoll-uring` op futures are cancel-safe.
|
||||
But the underlying kernel and hardware resources are not magically freed up by that.
|
||||
So, again, in the interest of closing out all outstanding resource usage, we make timeline shutdown wait for sidecar tasks and their IOs to complete.
|
||||
Under normal conditions, this should be in the low hundreds of microseconds.
|
||||
|
||||
It is advisable to make the `IoConcurrency` as long-lived as possible to minimize the amount of
|
||||
tokio task churn (=> lower pressure on tokio). Generally this means creating it "high up" in the call stack.
|
||||
The pain with this is that the `IoConcurrency` reference needs to be propagated "down" to
|
||||
the (short-lived) functions/scope where we issue the IOs.
|
||||
We would like to use `RequestContext` for this propagation in the future (issue [here](https://github.com/neondatabase/neon/issues/10460)).
|
||||
For now, we just add another argument to the relevant code paths.
|
||||
|
||||
### Feature Gating
|
||||
|
||||
The `IoConcurrency` is an `enum` with two variants: `Sequential` and `SidecarTask`.
|
||||
|
||||
The behavior from before this project is available through `IoConcurrency::Sequential`,
|
||||
which awaits the IO futures in place, without "spawning" or "submitting" them anywhere.
|
||||
|
||||
The `get_vectored_concurrent_io` pageserver config variable determines the runtime value,
|
||||
**except** for the places that use `IoConcurrency::sequential` to get an `IoConcurrency` object.
|
||||
|
||||
### Alternatives Explored & Caveats Encountered
|
||||
|
||||
A few words on the rationale behind having a sidecar *task* and what
|
||||
alternatives were considered but abandoned.
|
||||
|
||||
#### Why We Need A Sidecar *Task* / Why Just `FuturesUnordered` Doesn't Work
|
||||
|
||||
We explored to not have a sidecar task, and instead have a `FuturesUnordered` per
|
||||
`Timeline::get_vectored`. We would queue all IO futures in it and poll it for the
|
||||
first time after traversal is complete (i.e., at `collect_pending_ios`).
|
||||
|
||||
The obvious disadvantage, but not showstopper, is that we wouldn't be submitting
|
||||
IOs until traversal is complete.
|
||||
|
||||
The showstopper however, is that deadlocks happen if we don't drive the
|
||||
IO futures to completion independently of the traversal task.
|
||||
The reason is that both the IO futures and the traversal task may hold _some_,
|
||||
_and_ try to acquire _more_, shared limited resources.
|
||||
For example, both the travseral task and IO future may try to acquire
|
||||
* a `VirtualFile` file descriptor cache slot async mutex (observed during impl)
|
||||
* a `tokio-epoll-uring` submission slot (observed during impl)
|
||||
* a `PageCache` slot (currently this is not the case but we may move more code into the IO futures in the future)
|
||||
|
||||
#### Why We Don't Do `tokio::task`-per-IO-future
|
||||
|
||||
Another option is to spawn a short-lived `tokio::task` for each IO future.
|
||||
We implemented and benchmarked it during development, but found little
|
||||
throughput improvement and moderate mean & tail latency degradation.
|
||||
Concerns about pressure on the tokio scheduler led us to abandon this variant.
|
||||
|
||||
## Future Work
|
||||
|
||||
In addition to what is listed here, also check the "Punted" list in the epic:
|
||||
https://github.com/neondatabase/neon/issues/9378
|
||||
|
||||
### Enable `Timeline::get`
|
||||
|
||||
The only major code path that still uses `IoConcurrency::sequential` is `Timeline::get`.
|
||||
The impact is that roughly the following parts of pageserver do not benefit yet:
|
||||
- parts of basebackup
|
||||
- reads performed by the ingest path
|
||||
- most internal operations that read metadata keys (e.g. `collect_keyspace`!)
|
||||
|
||||
The solution is to propagate `IoConcurrency` via `RequestContext`:https://github.com/neondatabase/neon/issues/10460
|
||||
|
||||
The tricky part is to figure out at which level of the code the `IoConcurrency` is spawned (and added to the RequestContext).
|
||||
|
||||
Also, propagation via `RequestContext` makes makes it harder to tell during development whether a given
|
||||
piece of code uses concurrent vs sequential mode: one has to recurisvely walk up the call tree to find the
|
||||
place that puts the `IoConcurrency` into the `RequestContext`.
|
||||
We'd have to use `::Sequential` as the conservative default value in a fresh `RequestContext`, and add some
|
||||
observability to weed out places that fail to enrich with a properly spanwed `IoConcurrency::spawn_from_conf`.
|
||||
|
||||
### Concurrent On-Demand Downloads enabled by Detached Indices
|
||||
|
||||
As stated earlier, traversal stalls on on-demand download because its next step depends on the contents of the layer index.
|
||||
Once we have separated indices from data blocks (=> https://github.com/neondatabase/neon/issues/11695)
|
||||
we will only need to stall if the index is not resident. The download of the data blocks can happen concurrently or in the background. For example:
|
||||
- Move the `Layer::get_or_maybe_download().await` inside the IO futures.
|
||||
This goes in the opposite direction of the next "future work" item below, but it's easy to do.
|
||||
- Serve the IO future directly from object storage and dispatch the layer download
|
||||
to some other actor, e.g., an actor that is responsible for both downloads & eviction.
|
||||
|
||||
### New `tokio-epoll-uring` API That Separates Submission & Wait-For-Completion
|
||||
|
||||
Instead of `$op().await` style API, it would be useful to have a different `tokio-epoll-uring` API
|
||||
that separates enqueuing (without necessarily `io_uring_enter`ing the kernel each time), submission,
|
||||
and then wait for completion.
|
||||
|
||||
The `$op().await` API is too opaque, so we _have_ to stuff it into a `FuturesUnordered`.
|
||||
|
||||
A split API as sketched above would allow traversal to ensure an IO operation is enqueued to the kernel/disk (and get back-pressure iff the io_uring squeue is full).
|
||||
While avoiding spending of CPU cycles on processing of completions while we're still traversing.
|
||||
|
||||
The idea gets muddied by the fact that we may self-deadlock if we submit too much without completing.
|
||||
So, the submission part of the split API needs to process completions if squeue is full.
|
||||
|
||||
In any way, this split API is precondition for the bigger issue with the design presented here,
|
||||
which we dicsuss in the next section.
|
||||
|
||||
### Opaque Futures Are Brittle
|
||||
|
||||
The use of opaque futures to represent submitted IOs is a clever hack to minimize changes & allow for near-perfect feature-gating.
|
||||
However, we take on **brittleness** because callers must guarantee that the submitted futures are independent.
|
||||
By our experience, it is non-trivial to identify or rule out the interdependencies.
|
||||
See the lengthy doc comment on the `IoConcurrency::spawn_io` method for more details.
|
||||
|
||||
The better interface and proper subsystem boundary is a _descriptive_ struct of what needs to be done ("read this range from this VirtualFile into this buffer")
|
||||
and get back a means to wait for completion.
|
||||
The subsystem can thereby reason by its own how operations may be related;
|
||||
unlike today, where the submitted opaque future can do just about anything.
|
||||
135
docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg
Normal file
@@ -0,0 +1,135 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="82 284 863 375" width="863" height="375">
|
||||
<defs/>
|
||||
<g id="01-basic-idea" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
|
||||
<title>01-basic-idea</title>
|
||||
<rect fill="white" x="82" y="284" width="863" height="375"/>
|
||||
<g id="01-basic-idea_Layer_1">
|
||||
<title>Layer 1</title>
|
||||
<g id="Graphic_2">
|
||||
<rect x="234" y="379.5" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="234" y="379.5" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_3">
|
||||
<rect x="453.5" y="379.5" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="453.5" y="379.5" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_4">
|
||||
<rect x="672.5" y="379.5" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="672.5" y="379.5" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_5">
|
||||
<rect x="234" y="288.5" width="127" height="77.5" fill="white"/>
|
||||
<rect x="234" y="288.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_6">
|
||||
<rect x="375" y="288.5" width="127" height="77.5" fill="white"/>
|
||||
<rect x="375" y="288.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_7">
|
||||
<rect x="516" y="288.5" width="127" height="77.5" fill="white"/>
|
||||
<rect x="516" y="288.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_8">
|
||||
<rect x="657" y="288.5" width="127" height="77.5" fill="white"/>
|
||||
<rect x="657" y="288.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_9">
|
||||
<rect x="798" y="288.5" width="78" height="77.5" fill="white"/>
|
||||
<rect x="798" y="288.5" width="78" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_11">
|
||||
<line x1="185.5" y1="326.75" x2="943.7734" y2="326.75" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_12">
|
||||
<text transform="translate(87 318.026)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_13">
|
||||
<text transform="translate(106.41 372.886)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.39" y="15" xml:space="preserve">Images </tspan>
|
||||
<tspan font-family="Helvetica Neue" font-size="10" fill="black" x="29132252e-19" y="28.447998" xml:space="preserve">at earlier LSN</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_14">
|
||||
<text transform="translate(121.92 289.578)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8739676e-19" y="15" xml:space="preserve">Deltas</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_15">
|
||||
<path d="M 517.125 423.5 L 553.375 423.5 L 553.375 482 L 571.5 482 L 535.25 512 L 499 482 L 517.125 482 Z" fill="white"/>
|
||||
<path d="M 517.125 423.5 L 553.375 423.5 L 553.375 482 L 571.5 482 L 535.25 512 L 499 482 L 517.125 482 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_26">
|
||||
<rect x="234" y="599.474" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="234" y="599.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_25">
|
||||
<rect x="453.5" y="599.474" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="453.5" y="599.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_24">
|
||||
<rect x="672.5" y="599.474" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="672.5" y="599.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_23">
|
||||
<rect x="234" y="533" width="127" height="52.974" fill="white"/>
|
||||
<rect x="234" y="533" width="127" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_22">
|
||||
<rect x="375" y="533" width="310.5" height="52.974" fill="white"/>
|
||||
<rect x="375" y="533" width="310.5" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_21">
|
||||
<rect x="702.5" y="533" width="173.5" height="52.974" fill="white"/>
|
||||
<rect x="702.5" y="533" width="173.5" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_18">
|
||||
<line x1="185.5" y1="607.724" x2="943.7734" y2="607.724" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_16">
|
||||
<text transform="translate(121.92 538)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8739676e-19" y="15" xml:space="preserve">Deltas</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_27">
|
||||
<text transform="translate(114.8 592.86)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="3488765e-18" y="15" xml:space="preserve">Images </tspan>
|
||||
<tspan font-family="Helvetica Neue" font-size="10" fill="black" x="4.01" y="28.447998" xml:space="preserve">at GC LSN</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_28">
|
||||
<rect x="243.06836" y="300" width="624.3633" height="17.5" fill="#c0ffc0"/>
|
||||
<text transform="translate(248.06836 301.068)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.52364" y="12" xml:space="preserve">Deltas above GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_30">
|
||||
<rect x="243.06836" y="335.5" width="624.3633" height="17.5" fill="#c0ffff"/>
|
||||
<text transform="translate(248.06836 336.568)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.89414" y="12" xml:space="preserve">Deltas below GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_32">
|
||||
<rect x="243.06836" y="550.737" width="624.3633" height="17.5" fill="#c0ffc0"/>
|
||||
<text transform="translate(248.06836 551.805)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.52364" y="12" xml:space="preserve">Deltas above GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_33">
|
||||
<rect x="304" y="630.474" width="485.5" height="28.447998" fill="#c0ffff"/>
|
||||
<text transform="translate(309 637.016)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="13" fill="black" x="63.095" y="12" xml:space="preserve">Deltas and image below GC Horizon gets garbage-collected</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_34">
|
||||
<text transform="translate(576.5 444.0325)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="0" y="11" xml:space="preserve">WAL replay of deltas+image below GC Horizon</tspan>
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="0" y="25.336" xml:space="preserve">Reshuffle deltas</tspan>
|
||||
</text>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 8.1 KiB |
141
docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg
Normal file
@@ -0,0 +1,141 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-104 215 863 335" width="863" height="335">
|
||||
<defs>
|
||||
<marker orient="auto" overflow="visible" markerUnits="strokeWidth" id="FilledArrow_Marker" stroke-linejoin="miter" stroke-miterlimit="10" viewBox="-1 -4 10 8" markerWidth="10" markerHeight="8" color="#7f8080">
|
||||
<g>
|
||||
<path d="M 8 0 L 0 -3 L 0 3 Z" fill="currentColor" stroke="currentColor" stroke-width="1"/>
|
||||
</g>
|
||||
</marker>
|
||||
</defs>
|
||||
<g id="03-retain-lsn" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
|
||||
<title>03-retain-lsn</title>
|
||||
<rect fill="white" x="-104" y="215" width="863" height="335"/>
|
||||
<g id="03-retain-lsn_Layer_1">
|
||||
<title>Layer 1</title>
|
||||
<g id="Graphic_28">
|
||||
<rect x="48" y="477" width="203.5" height="9.990005" fill="white"/>
|
||||
<rect x="48" y="477" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_27">
|
||||
<rect x="267.5" y="477" width="203.5" height="9.990005" fill="white"/>
|
||||
<rect x="267.5" y="477" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_26">
|
||||
<rect x="486.5" y="477" width="203.5" height="9.990005" fill="white"/>
|
||||
<rect x="486.5" y="477" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_20">
|
||||
<line x1="-.5" y1="387.172" x2="757.7734" y2="387.172" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_19">
|
||||
<text transform="translate(-99 378.448)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_31">
|
||||
<rect x="48.25" y="410" width="203.5" height="9.990005" fill="white"/>
|
||||
<rect x="48.25" y="410" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_30">
|
||||
<rect x="267.75" y="410" width="203.5" height="9.990005" fill="white"/>
|
||||
<rect x="267.75" y="410" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_29">
|
||||
<rect x="486.75" y="410" width="203.5" height="9.990005" fill="white"/>
|
||||
<rect x="486.75" y="410" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_34">
|
||||
<rect x="48.25" y="431.495" width="113.75" height="34" fill="white"/>
|
||||
<rect x="48.25" y="431.495" width="113.75" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_33">
|
||||
<rect x="172.5" y="431.495" width="203.5" height="34" fill="white"/>
|
||||
<rect x="172.5" y="431.495" width="203.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_32">
|
||||
<rect x="386.5" y="431.495" width="303.5" height="34" fill="white"/>
|
||||
<rect x="386.5" y="431.495" width="303.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_37">
|
||||
<rect x="48" y="498.495" width="203.5" height="9.990005" fill="white"/>
|
||||
<rect x="48" y="498.495" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_36">
|
||||
<rect x="267.5" y="498.495" width="203.5" height="9.990005" fill="white"/>
|
||||
<rect x="267.5" y="498.495" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_35">
|
||||
<rect x="486.5" y="498.495" width="203.5" height="9.990005" fill="white"/>
|
||||
<rect x="486.5" y="498.495" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_38">
|
||||
<line x1="-10.48" y1="535.5395" x2="39.318294" y2="508.24794" marker-end="url(#FilledArrow_Marker)" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_39">
|
||||
<text transform="translate(-96.984 526.3155)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="40500936e-20" y="15" xml:space="preserve">retain_lsn 1</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_41">
|
||||
<line x1="-10.48" y1="507.0915" x2="38.90236" y2="485.8992" marker-end="url(#FilledArrow_Marker)" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_40">
|
||||
<text transform="translate(-96.984 497.8675)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="40500936e-20" y="15" xml:space="preserve">retain_lsn 2</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_43">
|
||||
<line x1="-10.48" y1="478.6435" x2="39.44267" y2="453.01616" marker-end="url(#FilledArrow_Marker)" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_42">
|
||||
<text transform="translate(-96.984 469.4195)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="40500936e-20" y="15" xml:space="preserve">retain_lsn 3</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_45">
|
||||
<line x1="-10.48" y1="448.495" x2="39.65061" y2="419.90015" marker-end="url(#FilledArrow_Marker)" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_44">
|
||||
<text transform="translate(-96.984 439.271)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="40500936e-20" y="15" xml:space="preserve">retain_lsn 4</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_46">
|
||||
<rect x="335.46477" y="215.5" width="353.4299" height="125.495" fill="white"/>
|
||||
<rect x="335.46477" y="215.5" width="353.4299" height="125.495" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_48">
|
||||
<text transform="translate(549.3766 317.547)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="6536993e-19" y="15" xml:space="preserve">Dependent Branch</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_50">
|
||||
<text transform="translate(340.43824 317.547)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="40500936e-20" y="15" xml:space="preserve">retain_lsn 3</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_57">
|
||||
<line x1="323.90685" y1="248.8045" x2="714.9232" y2="248.8045" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_56">
|
||||
<text transform="translate(165.91346 240.0805)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="35811354e-19" y="15" xml:space="preserve">Branch GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_58">
|
||||
<rect x="493.9232" y="301.6405" width="107.45294" height="9.990005" fill="white"/>
|
||||
<rect x="493.9232" y="301.6405" width="107.45294" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_59">
|
||||
<text transform="translate(358.9232 277.276)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">Partial Image Coverage</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_60">
|
||||
<rect x="354.1732" y="301.6405" width="107.45294" height="9.990005" fill="white"/>
|
||||
<rect x="354.1732" y="301.6405" width="107.45294" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 8.4 KiB |
@@ -0,0 +1,187 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-235 426 864 366" width="864" height="366">
|
||||
<defs/>
|
||||
<g id="05-btmgc-parent" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
|
||||
<title>05-btmgc-parent</title>
|
||||
<rect fill="white" x="-235" y="426" width="864" height="366"/>
|
||||
<g id="05-btmgc-parent_Layer_1">
|
||||
<title>Layer 1</title>
|
||||
<g id="Graphic_23">
|
||||
<rect x="-83" y="510.15" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="-83" y="510.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-78 516.178)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="51.714" y="11" xml:space="preserve">Append C@0x30</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_22">
|
||||
<rect x="136.5" y="510.15" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="136.5" y="510.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_21">
|
||||
<rect x="355.5" y="510.15" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="355.5" y="510.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_20">
|
||||
<line x1="-100.448" y1="459.224" x2="626.77344" y2="459.224" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_19">
|
||||
<text transform="translate(-230 450.5)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_18">
|
||||
<rect x="-82.75" y="426.748" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="-82.75" y="426.748" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-77.75 432.776)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="52.602" y="11" xml:space="preserve">Append F@0x60</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_17">
|
||||
<rect x="136.75" y="426.748" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="136.75" y="426.748" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_16">
|
||||
<rect x="355.75" y="426.748" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="355.75" y="426.748" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_15">
|
||||
<rect x="-82.75" y="464.645" width="113.75" height="34" fill="white"/>
|
||||
<rect x="-82.75" y="464.645" width="113.75" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-77.75 467.309)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="7.505" y="11" xml:space="preserve">Append E@0x50</tspan>
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="6.947" y="25.336" xml:space="preserve">Append D@0x40</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_14">
|
||||
<rect x="41.5" y="464.645" width="203.5" height="34" fill="white"/>
|
||||
<rect x="41.5" y="464.645" width="203.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_13">
|
||||
<rect x="255.5" y="464.645" width="303.5" height="34" fill="white"/>
|
||||
<rect x="255.5" y="464.645" width="303.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_12">
|
||||
<rect x="-83" y="548.047" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="-83" y="548.047" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-78 554.075)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="26.796" y="11" xml:space="preserve">A@0x10, Append B@0x20</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_11">
|
||||
<rect x="136.5" y="548.047" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="136.5" y="548.047" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_10">
|
||||
<rect x="355.5" y="548.047" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="355.5" y="548.047" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_24">
|
||||
<line x1="-104" y1="542" x2="610.5" y2="542" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_25">
|
||||
<text transform="translate(-139.604 534.5)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_28">
|
||||
<text transform="translate(-139.604 452.556)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x50</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_30">
|
||||
<line x1="-100.448" y1="481.145" x2="614.052" y2="481.145" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_29">
|
||||
<text transform="translate(-139.604 473.449)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x40</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_48">
|
||||
<line x1="-99.448" y1="701.513" x2="627.77344" y2="701.513" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_47">
|
||||
<text transform="translate(-229 692.789)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_46">
|
||||
<rect x="-81.75" y="670.496" width="113.75" height="26.391998" fill="white"/>
|
||||
<rect x="-81.75" y="670.496" width="113.75" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-76.75 676.524)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="7.727" y="11" xml:space="preserve">Append F@0x60</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_43">
|
||||
<rect x="-81.75" y="708.393" width="113.75" height="34" fill="white"/>
|
||||
<rect x="-81.75" y="708.393" width="113.75" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-76.75 718.225)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="7.505" y="11" xml:space="preserve">Append E@0x50</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_37">
|
||||
<line x1="-101" y1="777.2665" x2="613.5" y2="777.2665" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_36">
|
||||
<text transform="translate(-138.604 769.7665)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_33">
|
||||
<text transform="translate(-138.604 694.845)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x50</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_32">
|
||||
<line x1="-99.448" y1="755.089" x2="615.052" y2="755.089" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_31">
|
||||
<text transform="translate(-138.604 747.393)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x40</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_40">
|
||||
<rect x="-82" y="770.909" width="203.5" height="14.107002" fill="white"/>
|
||||
<rect x="-82" y="770.909" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-77 770.7945)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="70.836" y="11" xml:space="preserve">AB@0x20</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_39">
|
||||
<rect x="137.5" y="770.909" width="203.5" height="14.107002" fill="white"/>
|
||||
<rect x="137.5" y="770.909" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_38">
|
||||
<rect x="356.5" y="770.909" width="203.5" height="14.107002" fill="white"/>
|
||||
<rect x="356.5" y="770.909" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_54">
|
||||
<rect x="-81.75" y="748.5355" width="203.5" height="14.107002" fill="white"/>
|
||||
<rect x="-81.75" y="748.5355" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-76.75 748.421)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="62.28" y="11" xml:space="preserve">ABCD@0x40</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_53">
|
||||
<rect x="137.75" y="748.5355" width="203.5" height="14.107002" fill="white"/>
|
||||
<rect x="137.75" y="748.5355" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_52">
|
||||
<rect x="356.75" y="748.5355" width="203.5" height="14.107002" fill="white"/>
|
||||
<rect x="356.75" y="748.5355" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_57">
|
||||
<path d="M 211.32422 585 L 265.17578 585 L 265.17578 611.332 L 287.84375 611.332 L 238.25 633.117 L 188.65625 611.332 L 211.32422 611.332 Z" fill="white"/>
|
||||
<path d="M 211.32422 585 L 265.17578 585 L 265.17578 611.332 L 287.84375 611.332 L 238.25 633.117 L 188.65625 611.332 L 211.32422 611.332 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_60">
|
||||
<rect x="359" y="692.858" width="203.5" height="14.107002" fill="white"/>
|
||||
<rect x="359" y="692.858" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_59">
|
||||
<rect x="41.5" y="693.858" width="303" height="14.107002" fill="white"/>
|
||||
<rect x="41.5" y="693.858" width="303" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 12 KiB |
@@ -0,0 +1,184 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-413 471 931 354" width="931" height="354">
|
||||
<defs/>
|
||||
<g id="06-btmgc-child" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
|
||||
<title>06-btmgc-child</title>
|
||||
<rect fill="white" x="-413" y="471" width="931" height="354"/>
|
||||
<g id="06-btmgc-child_Layer_1">
|
||||
<title>Layer 1</title>
|
||||
<g id="Graphic_47">
|
||||
<rect x="-412" y="594.402" width="928" height="28.447998" fill="white"/>
|
||||
<rect x="-412" y="594.402" width="928" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_46">
|
||||
<rect x="-205" y="555.552" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="-205" y="555.552" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-200 561.58)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="52.158" y="11" xml:space="preserve">Append P@0x30</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_45">
|
||||
<rect x="14.5" y="555.552" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="14.5" y="555.552" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_44">
|
||||
<rect x="233.5" y="555.552" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="233.5" y="555.552" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_43">
|
||||
<line x1="-222.448" y1="504.724" x2="504.77344" y2="504.724" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_42">
|
||||
<text transform="translate(-352 496)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_41">
|
||||
<rect x="-204.75" y="472.15" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="-204.75" y="472.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-199.75 478.178)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="52.158" y="11" xml:space="preserve">Append S@0x60</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_40">
|
||||
<rect x="14.75" y="472.15" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="14.75" y="472.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_39">
|
||||
<rect x="233.75" y="472.15" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="233.75" y="472.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_38">
|
||||
<rect x="-204.75" y="510.047" width="113.75" height="34" fill="white"/>
|
||||
<rect x="-204.75" y="510.047" width="113.75" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-199.75 512.711)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="7.061" y="11" xml:space="preserve">Append R@0x50</tspan>
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="6.611" y="25.336" xml:space="preserve">Append Q@0x40</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_37">
|
||||
<rect x="-80.5" y="510.047" width="203.5" height="34" fill="white"/>
|
||||
<rect x="-80.5" y="510.047" width="203.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_36">
|
||||
<rect x="133.5" y="510.047" width="303.5" height="34" fill="white"/>
|
||||
<rect x="133.5" y="510.047" width="303.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_33">
|
||||
<text transform="translate(-261.604 498.056)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x50</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_30">
|
||||
<line x1="-224" y1="607.9115" x2="490.5" y2="607.9115" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_29">
|
||||
<text transform="translate(-261.604 600.4115)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_28">
|
||||
<rect x="-205" y="601.554" width="203.5" height="14.107002" fill="white"/>
|
||||
<rect x="-205" y="601.554" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-200 601.4395)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="70.836" y="11" xml:space="preserve">AB@0x20</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_27">
|
||||
<rect x="14.5" y="601.554" width="203.5" height="14.107002" fill="white"/>
|
||||
<rect x="14.5" y="601.554" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_26">
|
||||
<rect x="233.5" y="601.554" width="203.5" height="14.107002" fill="white"/>
|
||||
<rect x="233.5" y="601.554" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_25">
|
||||
<text transform="translate(-407 599.1875)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">Ancestor Branch</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_24">
|
||||
<rect x="-411" y="795.46" width="928" height="28.447998" fill="white"/>
|
||||
<rect x="-411" y="795.46" width="928" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_20">
|
||||
<line x1="-221.448" y1="755.528" x2="505.77344" y2="755.528" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_19">
|
||||
<text transform="translate(-351 746.804)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_18">
|
||||
<rect x="-203.75" y="723.579" width="203.25" height="26.391998" fill="white"/>
|
||||
<rect x="-203.75" y="723.579" width="203.25" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-198.75 729.607)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" x="52.033" y="11" xml:space="preserve">Append S@0x60</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_10">
|
||||
<text transform="translate(-260.604 748.86)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x50</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_7">
|
||||
<line x1="-223" y1="808.9695" x2="491.5" y2="808.9695" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_6">
|
||||
<text transform="translate(-260.604 801.4695)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_5">
|
||||
<rect x="-204" y="802.612" width="203.5" height="14.107002" fill="white"/>
|
||||
<rect x="-204" y="802.612" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-199 802.4975)" fill="#b1001c">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="#b1001c" x="70.836" y="11" xml:space="preserve">AB</tspan>
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" y="11" xml:space="preserve">@0x20</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_4">
|
||||
<rect x="15.5" y="802.612" width="203.5" height="14.107002" fill="white"/>
|
||||
<rect x="15.5" y="802.612" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_3">
|
||||
<rect x="234.5" y="802.612" width="203.5" height="14.107002" fill="white"/>
|
||||
<rect x="234.5" y="802.612" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_2">
|
||||
<text transform="translate(-406 800.2455)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">Ancestor Branch</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_48">
|
||||
<path d="M 89.32422 639.081 L 143.17578 639.081 L 143.17578 665.413 L 165.84375 665.413 L 116.25 687.198 L 66.65625 665.413 L 89.32422 665.413 Z" fill="white"/>
|
||||
<path d="M 89.32422 639.081 L 143.17578 639.081 L 143.17578 665.413 L 165.84375 665.413 L 116.25 687.198 L 66.65625 665.413 L 89.32422 665.413 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_49">
|
||||
<rect x="-204" y="762.428" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="-204" y="762.428" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-199 768.456)" fill="#b1001c">
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="#b1001c" x="58.278" y="11" xml:space="preserve">AB</tspan>
|
||||
<tspan font-family="Helvetica Neue" font-size="12" fill="black" y="11" xml:space="preserve">PQR@0x50</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_59">
|
||||
<rect x="14.5" y="723.579" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="14.5" y="723.579" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_58">
|
||||
<rect x="233.5" y="723.579" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="233.5" y="723.579" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_63">
|
||||
<rect x="9" y="762.085" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="9" y="762.085" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_62">
|
||||
<rect x="225" y="762.085" width="213" height="26.391998" fill="white"/>
|
||||
<rect x="225" y="762.085" width="213" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 12 KiB |
@@ -0,0 +1,180 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-556 476 923 411" width="923" height="411">
|
||||
<defs/>
|
||||
<g id="07-btmgc-analysis-1" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
|
||||
<title>07-btmgc-analysis-1</title>
|
||||
<rect fill="white" x="-556" y="476" width="923" height="411"/>
|
||||
<g id="07-btmgc-analysis-1_Layer_1">
|
||||
<title>Layer 1</title>
|
||||
<g id="Graphic_85">
|
||||
<rect x="-404" y="609.062" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="-404" y="609.062" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_84">
|
||||
<rect x="-184.5" y="609.062" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="-184.5" y="609.062" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_83">
|
||||
<rect x="34.5" y="609.062" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="34.5" y="609.062" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_82">
|
||||
<rect x="-404" y="479.922" width="127" height="77.5" fill="white"/>
|
||||
<rect x="-404" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_81">
|
||||
<rect x="-263" y="479.922" width="127" height="77.5" fill="white"/>
|
||||
<rect x="-263" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_80">
|
||||
<rect x="-122" y="479.922" width="127" height="77.5" fill="white"/>
|
||||
<rect x="-122" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_79">
|
||||
<rect x="19" y="479.922" width="127" height="77.5" fill="white"/>
|
||||
<rect x="19" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_78">
|
||||
<rect x="160" y="479.922" width="78" height="77.5" fill="white"/>
|
||||
<rect x="160" y="479.922" width="78" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_77">
|
||||
<line x1="-452.5" y1="518.172" x2="251" y2="518.172" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_76">
|
||||
<text transform="translate(-551 509.448)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_75">
|
||||
<text transform="translate(-531.59 602.448)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.39" y="15" xml:space="preserve">Images </tspan>
|
||||
<tspan font-family="Helvetica Neue" font-size="10" fill="black" x="29132252e-19" y="28.447998" xml:space="preserve">at earlier LSN</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_74">
|
||||
<text transform="translate(-516.08 481)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8739676e-19" y="15" xml:space="preserve">Deltas</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_73">
|
||||
<path d="M -120.675 651.5 L -84.425 651.5 L -84.425 710 L -66.3 710 L -102.55 740 L -138.8 710 L -120.675 710 Z" fill="white"/>
|
||||
<path d="M -120.675 651.5 L -84.425 651.5 L -84.425 710 L -66.3 710 L -102.55 740 L -138.8 710 L -120.675 710 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_72">
|
||||
<rect x="-403.8" y="827.474" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="-403.8" y="827.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_71">
|
||||
<rect x="-184.3" y="827.474" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="-184.3" y="827.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_70">
|
||||
<rect x="34.7" y="827.474" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="34.7" y="827.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_69">
|
||||
<rect x="-403.8" y="761" width="127" height="52.974" fill="white"/>
|
||||
<rect x="-403.8" y="761" width="127" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_68">
|
||||
<rect x="-262.8" y="761" width="310.5" height="52.974" fill="white"/>
|
||||
<rect x="-262.8" y="761" width="310.5" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_67">
|
||||
<rect x="64.7" y="761" width="173.5" height="52.974" fill="white"/>
|
||||
<rect x="64.7" y="761" width="173.5" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_66">
|
||||
<line x1="-452.3" y1="835.724" x2="251.2" y2="835.724" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_65">
|
||||
<text transform="translate(-515.88 766)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8739676e-19" y="15" xml:space="preserve">Deltas</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_64">
|
||||
<text transform="translate(-523 820.86)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="3488765e-18" y="15" xml:space="preserve">Images </tspan>
|
||||
<tspan font-family="Helvetica Neue" font-size="10" fill="black" x="4.01" y="28.447998" xml:space="preserve">at GC LSN</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_63">
|
||||
<rect x="-394.93164" y="491.422" width="624.3633" height="17.5" fill="#c0ffc0"/>
|
||||
<text transform="translate(-389.93164 492.49)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.52364" y="12" xml:space="preserve">Deltas above GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_62">
|
||||
<rect x="-394.93164" y="526.922" width="624.3633" height="17.5" fill="#c0ffff"/>
|
||||
<text transform="translate(-389.93164 527.99)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.89414" y="12" xml:space="preserve">Deltas below GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_61">
|
||||
<rect x="-394.73164" y="778.737" width="624.3633" height="17.5" fill="#c0ffc0"/>
|
||||
<text transform="translate(-389.73164 779.805)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.52364" y="12" xml:space="preserve">Deltas above GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_60">
|
||||
<rect x="-333.8" y="858.474" width="485.5" height="28.447998" fill="#c0ffff"/>
|
||||
<text transform="translate(-328.8 865.016)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="13" fill="black" x="63.095" y="12" xml:space="preserve">Deltas and image below GC Horizon gets garbage-collected</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_86">
|
||||
<text transform="translate(263 499.724)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="32" fill="black" x="0" y="30" xml:space="preserve">size=A</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_87">
|
||||
<line x1="260.87012" y1="479.068" x2="360.71387" y2="479.068" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_88">
|
||||
<line x1="260.87012" y1="561" x2="360.71387" y2="561" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_89">
|
||||
<rect x="-403.8" y="569" width="161.8" height="28.447998" fill="white"/>
|
||||
<rect x="-403.8" y="569" width="161.8" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_90">
|
||||
<rect x="-229.5" y="569.018" width="277.2" height="28.447998" fill="white"/>
|
||||
<rect x="-229.5" y="569.018" width="277.2" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_91">
|
||||
<rect x="64.7" y="569.018" width="173.5" height="28.447998" fill="white"/>
|
||||
<rect x="64.7" y="569.018" width="173.5" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_92">
|
||||
<line x1="262" y1="602" x2="361.84375" y2="602" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_93">
|
||||
<line x1="263" y1="625.562" x2="362.84375" y2="625.562" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_94">
|
||||
<text transform="translate(264.53787 562.276)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="32" fill="black" x="14210855e-21" y="30" xml:space="preserve">size=B</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_95">
|
||||
<text transform="translate(285.12 599.5)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="24" fill="black" x="0" y="23" xml:space="preserve">size=C</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_98">
|
||||
<text transform="translate(264.53787 773.772)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="26" fill="black" x="8881784e-19" y="25" xml:space="preserve">A</tspan>
|
||||
<tspan font-family="Lucida Grande" font-size="26" fill="black" y="25" xml:space="preserve">↓</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_97">
|
||||
<text transform="translate(265.87013 815.5)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="26" fill="black" x="6536993e-19" y="25" xml:space="preserve">B</tspan>
|
||||
<tspan font-family="Lucida Grande" font-size="26" fill="black" y="25" xml:space="preserve">↓</tspan>
|
||||
</text>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 11 KiB |
@@ -0,0 +1,158 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-235 406 586 424" width="586" height="424">
|
||||
<defs/>
|
||||
<g id="08-optimization" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
|
||||
<title>08-optimization</title>
|
||||
<rect fill="white" x="-235" y="406" width="586" height="424"/>
|
||||
<g id="08-optimization_Layer_1">
|
||||
<title>Layer 1</title>
|
||||
<g id="Graphic_22">
|
||||
<rect x="-100.448" y="509.902" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="-100.448" y="509.902" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_21">
|
||||
<rect x="118.552" y="509.902" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="118.552" y="509.902" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_20">
|
||||
<line x1="-101.79572" y1="420.322" x2="349.5" y2="420.322" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_19">
|
||||
<text transform="translate(-230 411.598)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_17">
|
||||
<rect x="-100.198" y="426.5" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="-100.198" y="426.5" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_16">
|
||||
<rect x="118.802" y="426.5" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="118.802" y="426.5" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_14">
|
||||
<rect x="-100.198" y="464.397" width="108.25" height="34" fill="white"/>
|
||||
<rect x="-100.198" y="464.397" width="108.25" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_13">
|
||||
<rect x="18.552" y="464.397" width="303.5" height="34" fill="white"/>
|
||||
<rect x="18.552" y="464.397" width="303.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_11">
|
||||
<rect x="-100.448" y="547.799" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="-100.448" y="547.799" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_10">
|
||||
<rect x="118.552" y="547.799" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="118.552" y="547.799" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_24">
|
||||
<line x1="-104" y1="542" x2="339.4011" y2="542" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_25">
|
||||
<text transform="translate(-139.604 534.5)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_27">
|
||||
<line x1="-101.79572" y1="459.098" x2="341.6054" y2="459.098" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_26">
|
||||
<text transform="translate(-139.604 451.402)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x50</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_28">
|
||||
<text transform="translate(-139.604 413.654)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x60</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_30">
|
||||
<line x1="-101.79572" y1="481.145" x2="341.6054" y2="481.145" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_29">
|
||||
<text transform="translate(-139.604 473.449)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x40</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_77">
|
||||
<rect x="-100.448" y="765.19595" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="-100.448" y="765.19595" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_76">
|
||||
<rect x="118.552" y="765.19595" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="118.552" y="765.19595" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_75">
|
||||
<line x1="-101.79572" y1="637.317" x2="349.5" y2="637.317" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_74">
|
||||
<text transform="translate(-230 628.593)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_73">
|
||||
<rect x="-100.198" y="681.794" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="-100.198" y="681.794" width="203.5" height="26.391998" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_72">
|
||||
<rect x="118.802" y="681.794" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="118.802" y="681.794" width="203.5" height="26.391998" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_71">
|
||||
<rect x="-100.198" y="719.69096" width="108.25" height="34" fill="white"/>
|
||||
<rect x="-100.198" y="719.69096" width="108.25" height="34" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_70">
|
||||
<rect x="18.552" y="719.69096" width="303.5" height="34" fill="white"/>
|
||||
<rect x="18.552" y="719.69096" width="303.5" height="34" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_69">
|
||||
<rect x="-100.448" y="803.09295" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="-100.448" y="803.09295" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_68">
|
||||
<rect x="118.552" y="803.09295" width="203.5" height="26.391998" fill="white"/>
|
||||
<rect x="118.552" y="803.09295" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_67">
|
||||
<line x1="-104" y1="797.294" x2="339.4011" y2="797.294" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_66">
|
||||
<text transform="translate(-139.604 789.794)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_63">
|
||||
<text transform="translate(-139.604 630.649)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x70</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_62">
|
||||
<line x1="-101.79572" y1="736.439" x2="341.6054" y2="736.439" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_61">
|
||||
<text transform="translate(-139.604 728.743)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x40</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_79">
|
||||
<rect x="-100.198" y="644.393" width="168.198" height="26.391998" fill="white"/>
|
||||
<rect x="-100.198" y="644.393" width="168.198" height="26.391998" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_78">
|
||||
<rect x="80" y="644.393" width="242.302" height="26.391998" fill="white"/>
|
||||
<rect x="80" y="644.393" width="242.302" height="26.391998" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_81">
|
||||
<line x1="-101.79572" y1="714.139" x2="341.6054" y2="714.139" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="1.0,4.0" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_80">
|
||||
<text transform="translate(-139.604 706.443)" fill="#a5a5a5">
|
||||
<tspan font-family="Helvetica Neue" font-size="14" fill="#a5a5a5" x="0" y="13" xml:space="preserve">0x50</tspan>
|
||||
</text>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 9.4 KiB |
@@ -0,0 +1,184 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-562 479 876 429" width="876" height="429">
|
||||
<defs/>
|
||||
<g id="09-btmgc-analysis-2" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
|
||||
<title>09-btmgc-analysis-2</title>
|
||||
<rect fill="white" x="-562" y="479" width="876" height="429"/>
|
||||
<g id="09-btmgc-analysis-2_Layer_1">
|
||||
<title>Layer 1</title>
|
||||
<g id="Graphic_85">
|
||||
<rect x="-404" y="622.386" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="-404" y="622.386" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-399 621.912)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="90.974" y="15" xml:space="preserve">C</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_84">
|
||||
<rect x="-184.5" y="622.386" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="-184.5" y="622.386" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-179.5 621.912)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="90.974" y="15" xml:space="preserve">C</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_83">
|
||||
<rect x="34.5" y="622.386" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="34.5" y="622.386" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(39.5 621.912)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="90.974" y="15" xml:space="preserve">C</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_82">
|
||||
<rect x="-404" y="479.922" width="127" height="77.5" fill="white"/>
|
||||
<rect x="-404" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-399 509.448)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_81">
|
||||
<rect x="-263" y="479.922" width="127" height="77.5" fill="white"/>
|
||||
<rect x="-263" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-258 509.448)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_80">
|
||||
<rect x="-122" y="479.922" width="127" height="77.5" fill="white"/>
|
||||
<rect x="-122" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-117 509.448)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_79">
|
||||
<rect x="19" y="479.922" width="127" height="77.5" fill="white"/>
|
||||
<rect x="19" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(24 509.448)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_78">
|
||||
<rect x="160" y="479.922" width="78" height="77.5" fill="white"/>
|
||||
<rect x="160" y="479.922" width="78" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(165 509.448)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="28.816" y="15" xml:space="preserve">A</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_77">
|
||||
<line x1="-452.5" y1="518.172" x2="251" y2="518.172" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_76">
|
||||
<text transform="translate(-551 509.448)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_73">
|
||||
<path d="M -120.675 651.5 L -84.425 651.5 L -84.425 710 L -66.3 710 L -102.55 740 L -138.8 710 L -120.675 710 Z" fill="white"/>
|
||||
<path d="M -120.675 651.5 L -84.425 651.5 L -84.425 710 L -66.3 710 L -102.55 740 L -138.8 710 L -120.675 710 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_89">
|
||||
<rect x="-403.8" y="582.324" width="161.8" height="28.447998" fill="white"/>
|
||||
<rect x="-403.8" y="582.324" width="161.8" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-398.8 587.324)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="70.42" y="15" xml:space="preserve">B</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_90">
|
||||
<rect x="-229.5" y="582.342" width="277.2" height="28.447998" fill="white"/>
|
||||
<rect x="-229.5" y="582.342" width="277.2" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-224.5 587.342)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="128.12" y="15" xml:space="preserve">B</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_91">
|
||||
<rect x="64.7" y="582.342" width="173.5" height="28.447998" fill="white"/>
|
||||
<rect x="64.7" y="582.342" width="173.5" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(69.7 587.342)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="76.27" y="15" xml:space="preserve">B</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_97">
|
||||
<rect x="-403.8" y="564.842" width="490.8" height="12.157997" fill="white"/>
|
||||
<rect x="-403.8" y="564.842" width="490.8" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-398.8 561.697)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="234.624" y="15" xml:space="preserve">C</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_109">
|
||||
<rect x="28.6" y="889.964" width="203.5" height="17.5" fill="white"/>
|
||||
<rect x="28.6" y="889.964" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(33.6 889.49)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="90.974" y="15" xml:space="preserve">C</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_108">
|
||||
<rect x="-409.9" y="747.5" width="127" height="77.5" fill="white"/>
|
||||
<rect x="-409.9" y="747.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-404.9 777.026)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_107">
|
||||
<rect x="-268.9" y="747.5" width="127" height="77.5" fill="white"/>
|
||||
<rect x="-268.9" y="747.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-263.9 777.026)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_106">
|
||||
<rect x="-127.9" y="747.5" width="127" height="77.5" fill="white"/>
|
||||
<rect x="-127.9" y="747.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-122.9 777.026)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_105">
|
||||
<rect x="13.1" y="747.5" width="127" height="77.5" fill="white"/>
|
||||
<rect x="13.1" y="747.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(18.1 777.026)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_104">
|
||||
<rect x="154.1" y="747.5" width="78" height="77.5" fill="white"/>
|
||||
<rect x="154.1" y="747.5" width="78" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(159.1 777.026)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="28.816" y="15" xml:space="preserve">A</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_103">
|
||||
<line x1="-458.4" y1="785.75" x2="245.1" y2="785.75" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_102">
|
||||
<text transform="translate(-556.9 777.026)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_99">
|
||||
<rect x="58.8" y="849.92" width="173.5" height="28.447998" fill="white"/>
|
||||
<rect x="58.8" y="849.92" width="173.5" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(63.8 854.92)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="76.27" y="15" xml:space="preserve">B</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_98">
|
||||
<rect x="-409.7" y="832.42" width="490.8" height="12.157997" fill="white"/>
|
||||
<rect x="-409.7" y="832.42" width="490.8" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(-404.7 829.275)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="234.624" y="15" xml:space="preserve">C</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_112">
|
||||
<text transform="translate(273 797.5)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="26" fill="black" x="6536993e-19" y="25" xml:space="preserve">B</tspan>
|
||||
<tspan font-family="Lucida Grande" font-size="26" fill="black" y="25" xml:space="preserve">↓</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_113">
|
||||
<text transform="translate(273 833.974)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="26" fill="black" x="42277293e-20" y="25" xml:space="preserve">C</tspan>
|
||||
<tspan font-family="Lucida Grande" font-size="26" fill="black" y="25" xml:space="preserve">↓</tspan>
|
||||
</text>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 12 KiB |
@@ -0,0 +1,81 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-12 920 809 269" width="809" height="269">
|
||||
<defs/>
|
||||
<g id="10-btmgc-analysis-3" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
|
||||
<title>10-btmgc-analysis-3</title>
|
||||
<rect fill="white" x="-12" y="920" width="809" height="269"/>
|
||||
<g id="10-btmgc-analysis-3_Layer_1">
|
||||
<title>Layer 1</title>
|
||||
<g id="Graphic_13">
|
||||
<rect x="433.7" y="949" width="63.559346" height="77.5" fill="white"/>
|
||||
<rect x="433.7" y="949" width="63.559346" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(438.7 978.526)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.107673" y="15" xml:space="preserve">1/5 X</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_12">
|
||||
<rect x="503.7654" y="949" width="63.559346" height="77.5" fill="white"/>
|
||||
<rect x="503.7654" y="949" width="63.559346" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(508.7654 978.526)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.107673" y="15" xml:space="preserve">1/5 X</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_11">
|
||||
<rect x="574.8318" y="949" width="63.559346" height="77.5" fill="white"/>
|
||||
<rect x="574.8318" y="949" width="63.559346" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(579.8318 978.526)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.107673" y="15" xml:space="preserve">1/5 X</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_10">
|
||||
<rect x="645.3977" y="949" width="63.559346" height="77.5" fill="white"/>
|
||||
<rect x="645.3977" y="949" width="63.559346" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(650.3977 978.526)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.107673" y="15" xml:space="preserve">1/5 X</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_8">
|
||||
<line x1="92" y1="934.276" x2="795.5" y2="934.276" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_7">
|
||||
<text transform="translate(-6.500003 925.552)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_2">
|
||||
<rect x="113.2" y="1033.92" width="321.3" height="12.157997" fill="white"/>
|
||||
<rect x="113.2" y="1033.92" width="321.3" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(118.2 1030.775)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="150.762" y="15" xml:space="preserve">X</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_17">
|
||||
<path d="M 420.125 1062 L 456.375 1062 L 456.375 1120.5 L 474.5 1120.5 L 438.25 1150.5 L 402 1120.5 L 420.125 1120.5 Z" fill="white"/>
|
||||
<path d="M 420.125 1062 L 456.375 1062 L 456.375 1120.5 L 474.5 1120.5 L 438.25 1150.5 L 402 1120.5 L 420.125 1120.5 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_27">
|
||||
<line x1="93" y1="1164.224" x2="796.5" y2="1164.224" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_26">
|
||||
<text transform="translate(-5.5000034 1155.5)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_25">
|
||||
<rect x="114" y="1173.5" width="641.8" height="12.157997" fill="white"/>
|
||||
<rect x="114" y="1173.5" width="641.8" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(119 1170.355)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="306.564" y="15" xml:space="preserve">2X</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_33">
|
||||
<rect x="715.96355" y="949" width="63.559346" height="77.5" fill="white"/>
|
||||
<rect x="715.96355" y="949" width="63.559346" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(720.96355 978.526)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.107673" y="15" xml:space="preserve">1/5 X</tspan>
|
||||
</text>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 5.1 KiB |
@@ -0,0 +1,81 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-12 920 809 269" width="809" height="269">
|
||||
<defs/>
|
||||
<g id="11-btmgc-analysis-4" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
|
||||
<title>11-btmgc-analysis-4</title>
|
||||
<rect fill="white" x="-12" y="920" width="809" height="269"/>
|
||||
<g id="11-btmgc-analysis-4_Layer_1">
|
||||
<title>Layer 1</title>
|
||||
<g id="Graphic_13">
|
||||
<rect x="113" y="949" width="127" height="77.5" fill="white"/>
|
||||
<rect x="113" y="949" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(118 978.526)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="39.084" y="15" xml:space="preserve">1/5 D</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_12">
|
||||
<rect x="253" y="949" width="127" height="77.5" fill="white"/>
|
||||
<rect x="253" y="949" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(258 978.526)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="39.084" y="15" xml:space="preserve">1/5 D</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_11">
|
||||
<rect x="395" y="949" width="127" height="77.5" fill="white"/>
|
||||
<rect x="395" y="949" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(400 978.526)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="39.084" y="15" xml:space="preserve">1/5 D</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_10">
|
||||
<rect x="536" y="949" width="127" height="77.5" fill="white"/>
|
||||
<rect x="536" y="949" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(541 978.526)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="39.084" y="15" xml:space="preserve">1/5 D</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_9">
|
||||
<rect x="677" y="949" width="78" height="77.5" fill="white"/>
|
||||
<rect x="677" y="949" width="78" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(682 978.526)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="14.584" y="15" xml:space="preserve">1/5 D</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_8">
|
||||
<line x1="92" y1="934.276" x2="795.5" y2="934.276" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_7">
|
||||
<text transform="translate(-6.500003 925.552)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_2">
|
||||
<rect x="113.2" y="1033.92" width="641.8" height="12.157997" fill="white"/>
|
||||
<rect x="113.2" y="1033.92" width="641.8" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(118.2 1030.775)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="310.268" y="15" xml:space="preserve">D</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_17">
|
||||
<path d="M 420.125 1062 L 456.375 1062 L 456.375 1120.5 L 474.5 1120.5 L 438.25 1150.5 L 402 1120.5 L 420.125 1120.5 Z" fill="white"/>
|
||||
<path d="M 420.125 1062 L 456.375 1062 L 456.375 1120.5 L 474.5 1120.5 L 438.25 1150.5 L 402 1120.5 L 420.125 1120.5 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_27">
|
||||
<line x1="93" y1="1164.224" x2="796.5" y2="1164.224" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_26">
|
||||
<text transform="translate(-5.5000034 1155.5)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_25">
|
||||
<rect x="114" y="1173.5" width="641.8" height="12.157997" fill="white"/>
|
||||
<rect x="114" y="1173.5" width="641.8" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(119 1170.355)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="310.268" y="15" xml:space="preserve">D</tspan>
|
||||
</text>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 5.0 KiB |
|
After Width: | Height: | Size: 142 KiB |
176
docs/rfcs/images/036-bottom-most-gc-compaction/13-job-split.svg
Normal file
@@ -0,0 +1,176 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<svg version="1.1" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" viewBox="210 271 870 514" width="870" height="514">
|
||||
<defs/>
|
||||
<g id="gc-compaction-split" stroke-dasharray="none" fill-opacity="1" stroke="none" fill="none" stroke-opacity="1">
|
||||
<title>gc-compaction-split</title>
|
||||
<rect fill="white" x="210" y="271" width="870" height="514"/>
|
||||
<g id="gc-compaction-split_Layer_1">
|
||||
<title>Layer 1</title>
|
||||
<g id="Graphic_12">
|
||||
<rect x="241" y="272" width="213" height="50.5" fill="white"/>
|
||||
<rect x="241" y="272" width="213" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_11">
|
||||
<rect x="468.72266" y="272" width="213" height="50.5" fill="white"/>
|
||||
<rect x="468.72266" y="272" width="213" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_10">
|
||||
<rect x="695.72266" y="272" width="213" height="50.5" fill="white"/>
|
||||
<rect x="695.72266" y="272" width="213" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_9">
|
||||
<rect x="241" y="337.3711" width="303.5" height="50.5" fill="white"/>
|
||||
<rect x="241" y="337.3711" width="303.5" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_8">
|
||||
<rect x="556.2617" y="337.3711" width="352.46094" height="50.5" fill="white"/>
|
||||
<rect x="556.2617" y="337.3711" width="352.46094" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_7">
|
||||
<rect x="241" y="402.7422" width="667.72266" height="50.5" fill="white"/>
|
||||
<rect x="241" y="402.7422" width="667.72266" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_6">
|
||||
<line x1="211" y1="355.5" x2="947.4961" y2="355.5" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_5">
|
||||
<text transform="translate(952.4961 346.776)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">branch point</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_4">
|
||||
<line x1="212" y1="438.5182" x2="948.4961" y2="438.5182" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_3">
|
||||
<text transform="translate(953.4961 429.7942)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">last branch point</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_13">
|
||||
<rect x="241" y="272" width="127.99101" height="181.24219" fill="#3a8eed" fill-opacity=".5"/>
|
||||
<text transform="translate(246 353.3971)" fill="white">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="white" x="38.835502" y="15" xml:space="preserve">Job 1</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_57">
|
||||
<rect x="359" y="647.96484" width="551.72266" height="50.5" fill="white"/>
|
||||
<rect x="359" y="647.96484" width="551.72266" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_54">
|
||||
<rect x="359" y="517.22266" width="96" height="50.5" fill="white"/>
|
||||
<rect x="359" y="517.22266" width="96" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_53">
|
||||
<rect x="469.72266" y="517.22266" width="213" height="50.5" fill="white"/>
|
||||
<rect x="469.72266" y="517.22266" width="213" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_52">
|
||||
<rect x="696.72266" y="517.22266" width="213" height="50.5" fill="white"/>
|
||||
<rect x="696.72266" y="517.22266" width="213" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_51">
|
||||
<rect x="359" y="582.59375" width="186.5" height="50.5" fill="white"/>
|
||||
<rect x="359" y="582.59375" width="186.5" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_50">
|
||||
<rect x="557.2617" y="582.59375" width="352.46094" height="50.5" fill="white"/>
|
||||
<rect x="557.2617" y="582.59375" width="352.46094" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Line_49">
|
||||
<line x1="212" y1="600.72266" x2="948.4961" y2="600.72266" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_48">
|
||||
<text transform="translate(953.4961 591.99866)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">branch point</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Line_47">
|
||||
<line x1="213" y1="683.74084" x2="949.4961" y2="683.74084" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_46">
|
||||
<text transform="translate(954.4961 675.01685)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">last branch point</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_63">
|
||||
<rect x="376.72525" y="272" width="127.99101" height="181.24219" fill="#3a8eed" fill-opacity=".5"/>
|
||||
<text transform="translate(381.72525 353.3971)" fill="white">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="white" x="38.835502" y="15" xml:space="preserve">Job 2</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_64">
|
||||
<rect x="511.39405" y="272" width="127.99101" height="181.24219" fill="#3a8eed" fill-opacity=".5"/>
|
||||
<text transform="translate(516.39405 353.3971)" fill="white">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="white" x="38.835502" y="15" xml:space="preserve">Job 3</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_65">
|
||||
<rect x="646.06285" y="272" width="127.99101" height="181.24219" fill="#3a8eed" fill-opacity=".5"/>
|
||||
<text transform="translate(651.06285 353.3971)" fill="white">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="white" x="38.835502" y="15" xml:space="preserve">Job 4</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_66">
|
||||
<rect x="780.73165" y="272" width="127.99101" height="181.24219" fill="#3a8eed" fill-opacity=".5"/>
|
||||
<text transform="translate(785.73165 353.3971)" fill="white">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="white" x="38.835502" y="15" xml:space="preserve">Job 5</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_56">
|
||||
<rect x="243.5" y="517.22266" width="125.49101" height="181.24219" fill="#ccc"/>
|
||||
<rect x="243.5" y="517.22266" width="125.49101" height="181.24219" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_55">
|
||||
<rect x="243.5" y="673.46484" width="125.49101" height="17.5" fill="#6b7ca5"/>
|
||||
<rect x="243.5" y="673.46484" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_68">
|
||||
<rect x="379.22525" y="517.22266" width="125.49101" height="181.24219" fill="#ccc"/>
|
||||
<rect x="379.22525" y="517.22266" width="125.49101" height="181.24219" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_67">
|
||||
<rect x="379.22525" y="673.46484" width="125.49101" height="17.5" fill="#6b7ca5"/>
|
||||
<rect x="379.22525" y="673.46484" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_70">
|
||||
<rect x="514.22525" y="517.22266" width="125.49101" height="181.24219" fill="#ccc"/>
|
||||
<rect x="514.22525" y="517.22266" width="125.49101" height="181.24219" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_69">
|
||||
<rect x="514.22525" y="673.46484" width="125.49101" height="17.5" fill="#6b7ca5"/>
|
||||
<rect x="514.22525" y="673.46484" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_72">
|
||||
<rect x="649.22525" y="517.22266" width="125.49101" height="181.24219" fill="#ccc"/>
|
||||
<rect x="649.22525" y="517.22266" width="125.49101" height="181.24219" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_71">
|
||||
<rect x="649.22525" y="673.46484" width="125.49101" height="17.5" fill="#6b7ca5"/>
|
||||
<rect x="649.22525" y="673.46484" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_74">
|
||||
<rect x="785.23165" y="517.22266" width="125.49101" height="181.24219" fill="#ccc"/>
|
||||
<rect x="785.23165" y="517.22266" width="125.49101" height="181.24219" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_73">
|
||||
<rect x="785.23165" y="673.46484" width="125.49101" height="17.5" fill="#6b7ca5"/>
|
||||
<rect x="785.23165" y="673.46484" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
</g>
|
||||
<g id="Graphic_78">
|
||||
<rect x="241" y="731.3359" width="125.49101" height="27.26953" fill="#ccc"/>
|
||||
<rect x="241" y="731.3359" width="125.49101" height="27.26953" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(246 735.7467)" fill="black">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="black" x="17.297502" y="15" xml:space="preserve">Delta Layer</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g id="Graphic_79">
|
||||
<rect x="241" y="766.759" width="125.49101" height="17.5" fill="#6b7ca5"/>
|
||||
<rect x="241" y="766.759" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
|
||||
<text transform="translate(246 766.285)" fill="white">
|
||||
<tspan font-family="Helvetica Neue" font-size="16" fill="white" x="13.737502" y="15" xml:space="preserve">Image Layer</tspan>
|
||||
</text>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 11 KiB |
@@ -462,6 +462,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
if var(REAL_S3_ENV).is_ok() {
|
||||
assert!(body.contains("remote_storage_s3_deleted_objects_total"));
|
||||
}
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
assert!(body.contains("process_threads"));
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
|
||||
//! compute_ctl can fetch it by calling the control plane's API.
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
|
||||
use indexmap::IndexMap;
|
||||
use regex::Regex;
|
||||
@@ -178,9 +179,9 @@ pub struct ComputeSpec {
|
||||
/// JWT for authorizing requests to endpoint storage service
|
||||
pub endpoint_storage_token: Option<String>,
|
||||
|
||||
/// If true, download LFC state from endpoint_storage and pass it to Postgres on startup
|
||||
/// Download LFC state from endpoint_storage and pass it to Postgres on startup
|
||||
#[serde(default)]
|
||||
pub prewarm_lfc_on_startup: bool,
|
||||
pub autoprewarm: bool,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
@@ -192,6 +193,9 @@ pub enum ComputeFeature {
|
||||
/// track short-lived connections as user activity.
|
||||
ActivityMonitorExperimental,
|
||||
|
||||
/// Enable TLS functionality.
|
||||
TlsExperimental,
|
||||
|
||||
/// This is a special feature flag that is used to represent unknown feature flags.
|
||||
/// Basically all unknown to enum flags are represented as this one. See unit test
|
||||
/// `parse_unknown_features()` for more details.
|
||||
@@ -250,34 +254,44 @@ impl RemoteExtSpec {
|
||||
}
|
||||
|
||||
match self.extension_data.get(real_ext_name) {
|
||||
Some(_ext_data) => {
|
||||
// We have decided to use the Go naming convention due to Kubernetes.
|
||||
|
||||
let arch = match std::env::consts::ARCH {
|
||||
"x86_64" => "amd64",
|
||||
"aarch64" => "arm64",
|
||||
arch => arch,
|
||||
};
|
||||
|
||||
// Construct the path to the extension archive
|
||||
// BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
|
||||
//
|
||||
// Keep it in sync with path generation in
|
||||
// https://github.com/neondatabase/build-custom-extensions/tree/main
|
||||
let archive_path_str = format!(
|
||||
"{build_tag}/{arch}/{pg_major_version}/extensions/{real_ext_name}.tar.zst"
|
||||
);
|
||||
Ok((
|
||||
real_ext_name.to_string(),
|
||||
RemotePath::from_string(&archive_path_str)?,
|
||||
))
|
||||
}
|
||||
Some(_ext_data) => Ok((
|
||||
real_ext_name.to_string(),
|
||||
Self::build_remote_path(build_tag, pg_major_version, real_ext_name)?,
|
||||
)),
|
||||
None => Err(anyhow::anyhow!(
|
||||
"real_ext_name {} is not found",
|
||||
real_ext_name
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the architecture-specific portion of the remote extension path. We
|
||||
/// use the Go naming convention due to Kubernetes.
|
||||
fn get_arch() -> &'static str {
|
||||
match std::env::consts::ARCH {
|
||||
"x86_64" => "amd64",
|
||||
"aarch64" => "arm64",
|
||||
arch => arch,
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a [`RemotePath`] for an extension.
|
||||
fn build_remote_path(
|
||||
build_tag: &str,
|
||||
pg_major_version: &str,
|
||||
ext_name: &str,
|
||||
) -> anyhow::Result<RemotePath> {
|
||||
let arch = Self::get_arch();
|
||||
|
||||
// Construct the path to the extension archive
|
||||
// BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
|
||||
//
|
||||
// Keep it in sync with path generation in
|
||||
// https://github.com/neondatabase/build-custom-extensions/tree/main
|
||||
RemotePath::from_string(&format!(
|
||||
"{build_tag}/{arch}/{pg_major_version}/extensions/{ext_name}.tar.zst"
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
|
||||
@@ -306,6 +320,12 @@ impl ComputeMode {
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for ComputeMode {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.to_type_str())
|
||||
}
|
||||
}
|
||||
|
||||
/// Log level for audit logging
|
||||
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
|
||||
pub enum ComputeAudit {
|
||||
@@ -518,6 +538,37 @@ mod tests {
|
||||
.expect("Library should be found");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn remote_extension_path() {
|
||||
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
|
||||
"public_extensions": ["ext"],
|
||||
"custom_extensions": [],
|
||||
"library_index": {
|
||||
"extlib": "ext",
|
||||
},
|
||||
"extension_data": {
|
||||
"ext": {
|
||||
"control_data": {
|
||||
"ext.control": ""
|
||||
},
|
||||
"archive_path": ""
|
||||
}
|
||||
},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let (_ext_name, ext_path) = rspec
|
||||
.get_ext("ext", false, "latest", "v17")
|
||||
.expect("Extension should be found");
|
||||
// Starting with a forward slash would have consequences for the
|
||||
// Url::join() that occurs when downloading a remote extension.
|
||||
assert!(!ext_path.to_string().starts_with("/"));
|
||||
assert_eq!(
|
||||
ext_path,
|
||||
RemoteExtSpec::build_remote_path("latest", "v17", "ext").unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_spec_file() {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
|
||||
@@ -85,7 +85,7 @@
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "prewarm_lfc_on_startup",
|
||||
"name": "autoprewarm",
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
|
||||
@@ -107,7 +107,7 @@ impl<const N: usize> MetricType for HyperLogLogState<N> {
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogState<N> {
|
||||
pub fn measure(&self, item: &impl Hash) {
|
||||
pub fn measure(&self, item: &(impl Hash + ?Sized)) {
|
||||
// changing the hasher will break compatibility with previous measurements.
|
||||
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
|
||||
}
|
||||
|
||||
@@ -27,6 +27,7 @@ pub use prometheus::{
|
||||
|
||||
pub mod launch_timestamp;
|
||||
mod wrappers;
|
||||
pub use prometheus;
|
||||
pub use wrappers::{CountedReader, CountedWriter};
|
||||
mod hll;
|
||||
pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
|
||||
|
||||
18
libs/neon-shmem/Cargo.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
[package]
|
||||
name = "neon-shmem"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
thiserror.workspace = true
|
||||
nix.workspace = true
|
||||
spin.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.9.1"
|
||||
rand_distr = "0.5.1"
|
||||
|
||||
[target.'cfg(target_os = "macos")'.dependencies]
|
||||
tempfile = "3.14.0"
|
||||
366
libs/neon-shmem/src/hash.rs
Normal file
@@ -0,0 +1,366 @@
|
||||
//! Hash table implementation on top of 'shmem'
|
||||
//!
|
||||
//! Features required in the long run by the communicator project:
|
||||
//!
|
||||
//! [X] Accessible from both Postgres processes and rust threads in the communicator process
|
||||
//! [X] Low latency
|
||||
//! [ ] Scalable to lots of concurrent accesses (currently uses a single spinlock)
|
||||
//! [ ] Resizable
|
||||
|
||||
use std::fmt::Debug;
|
||||
use std::hash::Hash;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::ops::Deref;
|
||||
|
||||
use crate::shmem::ShmemHandle;
|
||||
|
||||
use spin;
|
||||
|
||||
mod core;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use core::CoreHashMap;
|
||||
|
||||
pub enum UpdateAction<V> {
|
||||
Nothing,
|
||||
Insert(V),
|
||||
Remove,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct OutOfMemoryError();
|
||||
|
||||
pub struct HashMapInit<'a, K, V> {
|
||||
// Hash table can be allocated in a fixed memory area, or in a resizeable ShmemHandle.
|
||||
shmem_handle: Option<ShmemHandle>,
|
||||
shared_ptr: *mut HashMapShared<'a, K, V>,
|
||||
}
|
||||
|
||||
pub struct HashMapAccess<'a, K, V> {
|
||||
shmem_handle: Option<ShmemHandle>,
|
||||
shared_ptr: *mut HashMapShared<'a, K, V>,
|
||||
}
|
||||
|
||||
unsafe impl<'a, K: Sync, V: Sync> Sync for HashMapAccess<'a, K, V> {}
|
||||
unsafe impl<'a, K: Send, V: Send> Send for HashMapAccess<'a, K, V> {}
|
||||
|
||||
impl<'a, K, V> HashMapInit<'a, K, V> {
|
||||
pub fn attach_writer(self) -> HashMapAccess<'a, K, V> {
|
||||
HashMapAccess {
|
||||
shmem_handle: self.shmem_handle,
|
||||
shared_ptr: self.shared_ptr,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn attach_reader(self) -> HashMapAccess<'a, K, V> {
|
||||
// no difference to attach_writer currently
|
||||
self.attach_writer()
|
||||
}
|
||||
}
|
||||
|
||||
// This is stored in the shared memory area
|
||||
struct HashMapShared<'a, K, V> {
|
||||
inner: spin::RwLock<CoreHashMap<'a, K, V>>,
|
||||
}
|
||||
|
||||
impl<'a, K, V> HashMapInit<'a, K, V>
|
||||
where
|
||||
K: Clone + Hash + Eq,
|
||||
{
|
||||
pub fn estimate_size(num_buckets: u32) -> usize {
|
||||
// add some margin to cover alignment etc.
|
||||
CoreHashMap::<K, V>::estimate_size(num_buckets) + size_of::<HashMapShared<K, V>>() + 1000
|
||||
}
|
||||
|
||||
pub fn init_in_fixed_area(
|
||||
num_buckets: u32,
|
||||
area: &'a mut [MaybeUninit<u8>],
|
||||
) -> HashMapInit<'a, K, V> {
|
||||
Self::init_common(num_buckets, None, area.as_mut_ptr().cast(), area.len())
|
||||
}
|
||||
|
||||
/// Initialize a new hash map in the given shared memory area
|
||||
pub fn init_in_shmem(num_buckets: u32, mut shmem: ShmemHandle) -> HashMapInit<'a, K, V> {
|
||||
let size = Self::estimate_size(num_buckets);
|
||||
shmem
|
||||
.set_size(size)
|
||||
.expect("could not resize shared memory area");
|
||||
|
||||
let ptr = unsafe { shmem.data_ptr.as_mut() };
|
||||
Self::init_common(num_buckets, Some(shmem), ptr, size)
|
||||
}
|
||||
|
||||
fn init_common(
|
||||
num_buckets: u32,
|
||||
shmem_handle: Option<ShmemHandle>,
|
||||
area_ptr: *mut u8,
|
||||
area_len: usize,
|
||||
) -> HashMapInit<'a, K, V> {
|
||||
// carve out HashMapShared from the area. This does not include the hashmap's dictionary
|
||||
// and buckets.
|
||||
let mut ptr: *mut u8 = area_ptr;
|
||||
ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
|
||||
let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
|
||||
ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };
|
||||
|
||||
// the rest of the space is given to the hash map's dictionary and buckets
|
||||
let remaining_area = unsafe {
|
||||
std::slice::from_raw_parts_mut(ptr, area_len - ptr.offset_from(area_ptr) as usize)
|
||||
};
|
||||
|
||||
let hashmap = CoreHashMap::new(num_buckets, remaining_area);
|
||||
unsafe {
|
||||
std::ptr::write(
|
||||
shared_ptr,
|
||||
HashMapShared {
|
||||
inner: spin::RwLock::new(hashmap),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
HashMapInit {
|
||||
shmem_handle: shmem_handle,
|
||||
shared_ptr,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, K, V> HashMapAccess<'a, K, V>
|
||||
where
|
||||
K: Clone + Hash + Eq,
|
||||
{
|
||||
pub fn get<'e>(&'e self, key: &K) -> Option<ValueReadGuard<'e, K, V>> {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
let lock_guard = map.inner.read();
|
||||
|
||||
match lock_guard.get(key) {
|
||||
None => None,
|
||||
Some(val_ref) => {
|
||||
let val_ptr = std::ptr::from_ref(val_ref);
|
||||
Some(ValueReadGuard {
|
||||
_lock_guard: lock_guard,
|
||||
value: val_ptr,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert a value
|
||||
pub fn insert(&self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
|
||||
let mut success = None;
|
||||
|
||||
self.update_with_fn(key, |existing| {
|
||||
if let Some(_) = existing {
|
||||
success = Some(false);
|
||||
UpdateAction::Nothing
|
||||
} else {
|
||||
success = Some(true);
|
||||
UpdateAction::Insert(value)
|
||||
}
|
||||
})?;
|
||||
Ok(success.expect("value_fn not called"))
|
||||
}
|
||||
|
||||
/// Remove value. Returns true if it existed
|
||||
pub fn remove(&self, key: &K) -> bool {
|
||||
let mut result = false;
|
||||
self.update_with_fn(key, |existing| match existing {
|
||||
Some(_) => {
|
||||
result = true;
|
||||
UpdateAction::Remove
|
||||
}
|
||||
None => UpdateAction::Nothing,
|
||||
})
|
||||
.expect("out of memory while removing");
|
||||
result
|
||||
}
|
||||
|
||||
/// Update key using the given function. All the other modifying operations are based on this.
|
||||
pub fn update_with_fn<F>(&self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
let mut lock_guard = map.inner.write();
|
||||
|
||||
let old_val = lock_guard.get(key);
|
||||
let action = value_fn(old_val);
|
||||
match (old_val, action) {
|
||||
(_, UpdateAction::Nothing) => {}
|
||||
(_, UpdateAction::Insert(new_val)) => {
|
||||
let _ = lock_guard.insert(key, new_val);
|
||||
}
|
||||
(None, UpdateAction::Remove) => panic!("Remove action with no old value"),
|
||||
(Some(_), UpdateAction::Remove) => {
|
||||
let _ = lock_guard.remove(key);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update key using the given function. All the other modifying operations are based on this.
|
||||
pub fn update_with_fn_at_bucket<F>(
|
||||
&self,
|
||||
pos: usize,
|
||||
value_fn: F,
|
||||
) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
let mut lock_guard = map.inner.write();
|
||||
|
||||
let old_val = lock_guard.get_bucket(pos);
|
||||
let action = value_fn(old_val.map(|(_k, v)| v));
|
||||
match (old_val, action) {
|
||||
(_, UpdateAction::Nothing) => {}
|
||||
(_, UpdateAction::Insert(_new_val)) => panic!("cannot insert without key"),
|
||||
(None, UpdateAction::Remove) => panic!("Remove action with no old value"),
|
||||
(Some((key, _value)), UpdateAction::Remove) => {
|
||||
let key = key.clone();
|
||||
let _ = lock_guard.remove(&key);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_num_buckets(&self) -> usize {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
map.inner.read().get_num_buckets()
|
||||
}
|
||||
|
||||
/// Return the key and value stored in bucket with given index. This can be used to
|
||||
/// iterate through the hash map. (An Iterator might be nicer. The communicator's
|
||||
/// clock algorithm needs to _slowly_ iterate through all buckets with its clock hand,
|
||||
/// without holding a lock. If we switch to an Iterator, it must not hold the lock.)
|
||||
pub fn get_bucket<'e>(&'e self, pos: usize) -> Option<ValueReadGuard<'e, K, V>> {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
let lock_guard = map.inner.read();
|
||||
|
||||
match lock_guard.get_bucket(pos) {
|
||||
None => None,
|
||||
Some((_key, val_ref)) => {
|
||||
let val_ptr = std::ptr::from_ref(val_ref);
|
||||
Some(ValueReadGuard {
|
||||
_lock_guard: lock_guard,
|
||||
value: val_ptr,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// for metrics
|
||||
pub fn get_num_buckets_in_use(&self) -> usize {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
map.inner.read().buckets_in_use as usize
|
||||
}
|
||||
|
||||
/// Grow
|
||||
///
|
||||
/// 1. grow the underlying shared memory area
|
||||
/// 2. Initialize new buckets. This overwrites the current dictionary
|
||||
/// 3. Recalculate the dictionary
|
||||
pub fn grow(&self, num_buckets: u32) -> Result<(), crate::shmem::Error> {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
let mut lock_guard = map.inner.write();
|
||||
let inner = &mut *lock_guard;
|
||||
let old_num_buckets = inner.buckets.len() as u32;
|
||||
|
||||
if num_buckets < old_num_buckets {
|
||||
panic!("grow called with a smaller number of buckets");
|
||||
}
|
||||
if num_buckets == old_num_buckets {
|
||||
return Ok(());
|
||||
}
|
||||
let shmem_handle = self
|
||||
.shmem_handle
|
||||
.as_ref()
|
||||
.expect("grow called on a fixed-size hash table");
|
||||
|
||||
let size_bytes = HashMapInit::<K, V>::estimate_size(num_buckets);
|
||||
shmem_handle.set_size(size_bytes)?;
|
||||
let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
|
||||
|
||||
// Initialize new buckets. The new buckets are linked to the free list. NB: This overwrites
|
||||
// the dictionary!
|
||||
let buckets_ptr = inner.buckets.as_mut_ptr();
|
||||
unsafe {
|
||||
for i in old_num_buckets..num_buckets {
|
||||
let bucket_ptr = buckets_ptr.add(i as usize);
|
||||
bucket_ptr.write(core::Bucket {
|
||||
hash: 0,
|
||||
next: if i < num_buckets {
|
||||
i as u32 + 1
|
||||
} else {
|
||||
inner.free_head
|
||||
},
|
||||
inner: None,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Recalculate the dictionary
|
||||
let buckets;
|
||||
let dictionary;
|
||||
unsafe {
|
||||
let buckets_end_ptr = buckets_ptr.add(num_buckets as usize);
|
||||
let dictionary_ptr: *mut u32 = buckets_end_ptr
|
||||
.byte_add(buckets_end_ptr.align_offset(align_of::<u32>()))
|
||||
.cast();
|
||||
let dictionary_size: usize =
|
||||
end_ptr.byte_offset_from(buckets_end_ptr) as usize / size_of::<u32>();
|
||||
|
||||
buckets = std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize);
|
||||
dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size);
|
||||
}
|
||||
for i in 0..dictionary.len() {
|
||||
dictionary[i] = core::INVALID_POS;
|
||||
}
|
||||
|
||||
for i in 0..old_num_buckets as usize {
|
||||
if buckets[i].inner.is_none() {
|
||||
continue;
|
||||
}
|
||||
let pos: usize = (buckets[i].hash % dictionary.len() as u64) as usize;
|
||||
buckets[i].next = dictionary[pos];
|
||||
dictionary[pos] = i as u32;
|
||||
}
|
||||
|
||||
// Finally, update the CoreHashMap struct
|
||||
inner.dictionary = dictionary;
|
||||
inner.buckets = buckets;
|
||||
inner.free_head = old_num_buckets;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// TODO: Shrinking is a multi-step process that requires co-operation from the caller
|
||||
//
|
||||
// 1. The caller must first call begin_shrink(). That forbids allocation of higher-numbered
|
||||
// buckets.
|
||||
//
|
||||
// 2. Next, the caller must evict all entries in higher-numbered buckets.
|
||||
//
|
||||
// 3. Finally, call finish_shrink(). This recomputes the dictionary and shrinks the underlying
|
||||
// shmem area
|
||||
}
|
||||
|
||||
pub struct ValueReadGuard<'a, K, V> {
|
||||
_lock_guard: spin::RwLockReadGuard<'a, CoreHashMap<'a, K, V>>,
|
||||
value: *const V,
|
||||
}
|
||||
|
||||
impl<'a, K, V> Deref for ValueReadGuard<'a, K, V> {
|
||||
type Target = V;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
// SAFETY: The `lock_guard` ensures that the underlying map (and thus the value pointed to
|
||||
// by `value`) remains valid for the lifetime `'a`. The `value` has been obtained from a
|
||||
// valid reference within the map.
|
||||
unsafe { &*self.value }
|
||||
}
|
||||
}
|
||||
233
libs/neon-shmem/src/hash/core.rs
Normal file
@@ -0,0 +1,233 @@
|
||||
//! Simple hash table with chaining
|
||||
//!
|
||||
//! # Resizing
|
||||
//!
|
||||
|
||||
use std::hash::{DefaultHasher, Hash, Hasher};
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
pub(crate) const INVALID_POS: u32 = u32::MAX;
|
||||
|
||||
// Bucket
|
||||
pub(crate) struct Bucket<K, V> {
|
||||
pub(crate) hash: u64,
|
||||
pub(crate) next: u32,
|
||||
pub(crate) inner: Option<(K, V)>,
|
||||
}
|
||||
|
||||
pub(crate) struct CoreHashMap<'a, K, V> {
|
||||
pub(crate) dictionary: &'a mut [u32],
|
||||
pub(crate) buckets: &'a mut [Bucket<K, V>],
|
||||
pub(crate) free_head: u32,
|
||||
|
||||
// metrics
|
||||
pub(crate) buckets_in_use: u32,
|
||||
}
|
||||
|
||||
pub struct FullError();
|
||||
|
||||
impl<'a, K, V> CoreHashMap<'a, K, V>
|
||||
where
|
||||
K: Clone + Hash + Eq,
|
||||
{
|
||||
const FILL_FACTOR: f32 = 0.60;
|
||||
|
||||
pub fn estimate_size(num_buckets: u32) -> usize {
|
||||
let mut size = 0;
|
||||
|
||||
// buckets
|
||||
size += size_of::<Bucket<K, V>>() * num_buckets as usize;
|
||||
|
||||
// dictionary
|
||||
size += (f32::ceil((size_of::<u32>() * num_buckets as usize) as f32 / Self::FILL_FACTOR))
|
||||
as usize;
|
||||
|
||||
size
|
||||
}
|
||||
|
||||
pub fn new(num_buckets: u32, area: &'a mut [u8]) -> CoreHashMap<'a, K, V> {
|
||||
let len = area.len();
|
||||
|
||||
let mut ptr: *mut u8 = area.as_mut_ptr();
|
||||
let end_ptr: *mut u8 = unsafe { area.as_mut_ptr().add(len) };
|
||||
|
||||
// carve out the buckets
|
||||
ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<Bucket<K, V>>())) };
|
||||
let buckets_ptr = ptr;
|
||||
ptr = unsafe { ptr.add(size_of::<Bucket<K, V>>() * num_buckets as usize) };
|
||||
|
||||
// use remaining space for the dictionary
|
||||
ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<u32>())) };
|
||||
let dictionary_ptr = ptr;
|
||||
|
||||
assert!(ptr.addr() < end_ptr.addr());
|
||||
let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::<u32>() as isize };
|
||||
assert!(dictionary_size > 0);
|
||||
|
||||
// Initialize the buckets
|
||||
let buckets = {
|
||||
let buckets_ptr: *mut MaybeUninit<Bucket<K, V>> = buckets_ptr.cast();
|
||||
let buckets =
|
||||
unsafe { std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize) };
|
||||
for i in 0..buckets.len() {
|
||||
buckets[i].write(Bucket {
|
||||
hash: 0,
|
||||
next: if i < buckets.len() - 1 {
|
||||
i as u32 + 1
|
||||
} else {
|
||||
INVALID_POS
|
||||
},
|
||||
inner: None,
|
||||
});
|
||||
}
|
||||
// TODO: use std::slice::assume_init_mut() once it stabilizes
|
||||
unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), num_buckets as usize) }
|
||||
};
|
||||
|
||||
// Initialize the dictionary
|
||||
let dictionary = {
|
||||
let dictionary_ptr: *mut MaybeUninit<u32> = dictionary_ptr.cast();
|
||||
let dictionary =
|
||||
unsafe { std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size as usize) };
|
||||
|
||||
for i in 0..dictionary.len() {
|
||||
dictionary[i].write(INVALID_POS);
|
||||
}
|
||||
// TODO: use std::slice::assume_init_mut() once it stabilizes
|
||||
unsafe {
|
||||
std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize)
|
||||
}
|
||||
};
|
||||
|
||||
CoreHashMap {
|
||||
dictionary,
|
||||
buckets,
|
||||
free_head: 0,
|
||||
buckets_in_use: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, key: &K) -> Option<&V> {
|
||||
let mut hasher = DefaultHasher::new();
|
||||
key.hash(&mut hasher);
|
||||
let hash = hasher.finish();
|
||||
|
||||
let mut next = self.dictionary[hash as usize % self.dictionary.len()];
|
||||
loop {
|
||||
if next == INVALID_POS {
|
||||
return None;
|
||||
}
|
||||
|
||||
let bucket = &self.buckets[next as usize];
|
||||
let (bucket_key, bucket_value) = bucket.inner.as_ref().expect("entry is in use");
|
||||
if bucket_key == key {
|
||||
return Some(&bucket_value);
|
||||
}
|
||||
next = bucket.next;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, key: &K, value: V) -> Result<(), FullError> {
|
||||
let mut hasher = DefaultHasher::new();
|
||||
key.hash(&mut hasher);
|
||||
let hash = hasher.finish();
|
||||
|
||||
let first = self.dictionary[hash as usize % self.dictionary.len()];
|
||||
if first == INVALID_POS {
|
||||
// no existing entry
|
||||
let pos = self.alloc_bucket(key.clone(), value, hash)?;
|
||||
if pos == INVALID_POS {
|
||||
return Err(FullError());
|
||||
}
|
||||
self.dictionary[hash as usize % self.dictionary.len()] = pos;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut next = first;
|
||||
loop {
|
||||
let bucket = &mut self.buckets[next as usize];
|
||||
let (bucket_key, bucket_value) = bucket.inner.as_mut().expect("entry is in use");
|
||||
if bucket_key == key {
|
||||
// found existing entry, update its value
|
||||
*bucket_value = value;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if bucket.next == INVALID_POS {
|
||||
// No existing entry found. Append to the chain
|
||||
let pos = self.alloc_bucket(key.clone(), value, hash)?;
|
||||
if pos == INVALID_POS {
|
||||
return Err(FullError());
|
||||
}
|
||||
self.buckets[next as usize].next = pos;
|
||||
return Ok(());
|
||||
}
|
||||
next = bucket.next;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, key: &K) -> Result<(), FullError> {
|
||||
let mut hasher = DefaultHasher::new();
|
||||
key.hash(&mut hasher);
|
||||
let hash = hasher.finish();
|
||||
|
||||
let mut next = self.dictionary[hash as usize % self.dictionary.len()];
|
||||
let mut prev_pos: u32 = INVALID_POS;
|
||||
loop {
|
||||
if next == INVALID_POS {
|
||||
// no existing entry
|
||||
return Ok(());
|
||||
}
|
||||
let bucket = &mut self.buckets[next as usize];
|
||||
let (bucket_key, _) = bucket.inner.as_mut().expect("entry is in use");
|
||||
if bucket_key == key {
|
||||
// found existing entry, unlink it from the chain
|
||||
if prev_pos == INVALID_POS {
|
||||
self.dictionary[hash as usize % self.dictionary.len()] = bucket.next;
|
||||
} else {
|
||||
self.buckets[prev_pos as usize].next = bucket.next;
|
||||
}
|
||||
|
||||
// and add it to the freelist
|
||||
let bucket = &mut self.buckets[next as usize];
|
||||
bucket.hash = 0;
|
||||
bucket.inner = None;
|
||||
bucket.next = self.free_head;
|
||||
self.free_head = next;
|
||||
self.buckets_in_use -= 1;
|
||||
return Ok(());
|
||||
}
|
||||
prev_pos = next;
|
||||
next = bucket.next;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_num_buckets(&self) -> usize {
|
||||
self.buckets.len()
|
||||
}
|
||||
|
||||
pub fn get_bucket(&self, pos: usize) -> Option<&(K, V)> {
|
||||
if pos >= self.buckets.len() {
|
||||
return None;
|
||||
}
|
||||
|
||||
self.buckets[pos].inner.as_ref()
|
||||
}
|
||||
|
||||
fn alloc_bucket(&mut self, key: K, value: V, hash: u64) -> Result<u32, FullError> {
|
||||
let pos = self.free_head;
|
||||
if pos == INVALID_POS {
|
||||
return Err(FullError());
|
||||
}
|
||||
|
||||
let bucket = &mut self.buckets[pos as usize];
|
||||
self.free_head = bucket.next;
|
||||
self.buckets_in_use += 1;
|
||||
|
||||
bucket.hash = hash;
|
||||
bucket.next = INVALID_POS;
|
||||
bucket.inner = Some((key, value));
|
||||
|
||||
return Ok(pos);
|
||||
}
|
||||
}
|
||||
220
libs/neon-shmem/src/hash/tests.rs
Normal file
@@ -0,0 +1,220 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use crate::hash::HashMapAccess;
|
||||
use crate::hash::HashMapInit;
|
||||
use crate::hash::UpdateAction;
|
||||
use crate::shmem::ShmemHandle;
|
||||
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::{Rng, RngCore};
|
||||
use rand_distr::Zipf;
|
||||
|
||||
const TEST_KEY_LEN: usize = 16;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
|
||||
struct TestKey([u8; TEST_KEY_LEN]);
|
||||
|
||||
impl From<&TestKey> for u128 {
|
||||
fn from(val: &TestKey) -> u128 {
|
||||
u128::from_be_bytes(val.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u128> for TestKey {
|
||||
fn from(val: u128) -> TestKey {
|
||||
TestKey(val.to_be_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u8]> for TestKey {
|
||||
fn from(bytes: &'a [u8]) -> TestKey {
|
||||
TestKey(bytes.try_into().unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
|
||||
const MAX_MEM_SIZE: usize = 10000000;
|
||||
let shmem = ShmemHandle::new("test_inserts", 0, MAX_MEM_SIZE).unwrap();
|
||||
|
||||
let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(100000, shmem);
|
||||
let w = init_struct.attach_writer();
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let res = w.insert(&(*k).into(), idx);
|
||||
assert!(res.is_ok());
|
||||
}
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let x = w.get(&(*k).into());
|
||||
let value = x.as_deref().copied();
|
||||
assert_eq!(value, Some(idx));
|
||||
}
|
||||
|
||||
//eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dense() {
|
||||
// This exercises splitting a node with prefix
|
||||
let keys: &[u128] = &[0, 1, 2, 3, 256];
|
||||
test_inserts(keys);
|
||||
|
||||
// Dense keys
|
||||
let mut keys: Vec<u128> = (0..10000).collect();
|
||||
test_inserts(&keys);
|
||||
|
||||
// Do the same in random orders
|
||||
for _ in 1..10 {
|
||||
keys.shuffle(&mut rand::rng());
|
||||
test_inserts(&keys);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sparse() {
|
||||
// sparse keys
|
||||
let mut keys: Vec<TestKey> = Vec::new();
|
||||
let mut used_keys = HashSet::new();
|
||||
for _ in 0..10000 {
|
||||
loop {
|
||||
let key = rand::random::<u128>();
|
||||
if used_keys.get(&key).is_some() {
|
||||
continue;
|
||||
}
|
||||
used_keys.insert(key);
|
||||
keys.push(key.into());
|
||||
break;
|
||||
}
|
||||
}
|
||||
test_inserts(&keys);
|
||||
}
|
||||
|
||||
struct TestValue(AtomicUsize);
|
||||
|
||||
impl TestValue {
|
||||
fn new(val: usize) -> TestValue {
|
||||
TestValue(AtomicUsize::new(val))
|
||||
}
|
||||
|
||||
fn load(&self) -> usize {
|
||||
self.0.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for TestValue {
|
||||
fn clone(&self) -> TestValue {
|
||||
TestValue::new(self.load())
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for TestValue {
|
||||
fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
write!(fmt, "{:?}", self.load())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct TestOp(TestKey, Option<usize>);
|
||||
|
||||
fn apply_op(
|
||||
op: &TestOp,
|
||||
sut: &HashMapAccess<TestKey, TestValue>,
|
||||
shadow: &mut BTreeMap<TestKey, usize>,
|
||||
) {
|
||||
eprintln!("applying op: {op:?}");
|
||||
|
||||
// apply the change to the shadow tree first
|
||||
let shadow_existing = if let Some(v) = op.1 {
|
||||
shadow.insert(op.0, v)
|
||||
} else {
|
||||
shadow.remove(&op.0)
|
||||
};
|
||||
|
||||
// apply to Art tree
|
||||
sut.update_with_fn(&op.0, |existing| {
|
||||
assert_eq!(existing.map(TestValue::load), shadow_existing);
|
||||
|
||||
match (existing, op.1) {
|
||||
(None, None) => UpdateAction::Nothing,
|
||||
(None, Some(new_val)) => UpdateAction::Insert(TestValue::new(new_val)),
|
||||
(Some(_old_val), None) => UpdateAction::Remove,
|
||||
(Some(old_val), Some(new_val)) => {
|
||||
old_val.0.store(new_val, Ordering::Relaxed);
|
||||
UpdateAction::Nothing
|
||||
}
|
||||
}
|
||||
})
|
||||
.expect("out of memory");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn random_ops() {
|
||||
const MAX_MEM_SIZE: usize = 10000000;
|
||||
let shmem = ShmemHandle::new("test_inserts", 0, MAX_MEM_SIZE).unwrap();
|
||||
|
||||
let init_struct = HashMapInit::<TestKey, TestValue>::init_in_shmem(100000, shmem);
|
||||
let writer = init_struct.attach_writer();
|
||||
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
|
||||
let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
|
||||
let mut rng = rand::rng();
|
||||
for i in 0..100000 {
|
||||
let key: TestKey = (rng.sample(distribution) as u128).into();
|
||||
|
||||
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
|
||||
|
||||
apply_op(&op, &writer, &mut shadow);
|
||||
|
||||
if i % 1000 == 0 {
|
||||
eprintln!("{i} ops processed");
|
||||
//eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
//test_iter(&tree_writer, &shadow);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_grow() {
|
||||
const MEM_SIZE: usize = 10000000;
|
||||
let shmem = ShmemHandle::new("test_grow", 0, MEM_SIZE).unwrap();
|
||||
|
||||
let init_struct = HashMapInit::<TestKey, TestValue>::init_in_shmem(1000, shmem);
|
||||
let writer = init_struct.attach_writer();
|
||||
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
|
||||
let mut rng = rand::rng();
|
||||
for i in 0..10000 {
|
||||
let key: TestKey = ((rng.next_u32() % 1000) as u128).into();
|
||||
|
||||
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
|
||||
|
||||
apply_op(&op, &writer, &mut shadow);
|
||||
|
||||
if i % 1000 == 0 {
|
||||
eprintln!("{i} ops processed");
|
||||
//eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
//test_iter(&tree_writer, &shadow);
|
||||
}
|
||||
}
|
||||
|
||||
writer.grow(1500).unwrap();
|
||||
|
||||
for i in 0..10000 {
|
||||
let key: TestKey = ((rng.next_u32() % 1500) as u128).into();
|
||||
|
||||
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
|
||||
|
||||
apply_op(&op, &writer, &mut shadow);
|
||||
|
||||
if i % 1000 == 0 {
|
||||
eprintln!("{i} ops processed");
|
||||
//eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
//test_iter(&tree_writer, &shadow);
|
||||
}
|
||||
}
|
||||
}
|
||||
4
libs/neon-shmem/src/lib.rs
Normal file
@@ -0,0 +1,4 @@
|
||||
//! Shared memory utilities for neon communicator
|
||||
|
||||
pub mod hash;
|
||||
pub mod shmem;
|
||||
418
libs/neon-shmem/src/shmem.rs
Normal file
@@ -0,0 +1,418 @@
|
||||
//! Dynamically resizable contiguous chunk of shared memory
|
||||
|
||||
use std::num::NonZeroUsize;
|
||||
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::mman::MapFlags;
|
||||
use nix::sys::mman::ProtFlags;
|
||||
use nix::sys::mman::mmap as nix_mmap;
|
||||
use nix::sys::mman::munmap as nix_munmap;
|
||||
use nix::unistd::ftruncate as nix_ftruncate;
|
||||
|
||||
/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
|
||||
/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
|
||||
/// specified at creation.
|
||||
///
|
||||
/// The area is backed by an anonymous file created with memfd_create(). The full address space for
|
||||
/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
|
||||
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
|
||||
/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
|
||||
/// future.
|
||||
pub struct ShmemHandle {
|
||||
/// memfd file descriptor
|
||||
fd: OwnedFd,
|
||||
|
||||
max_size: usize,
|
||||
|
||||
// Pointer to the beginning of the shared memory area. The header is stored there.
|
||||
shared_ptr: NonNull<SharedStruct>,
|
||||
|
||||
// Pointer to the beginning of the user data
|
||||
pub data_ptr: NonNull<u8>,
|
||||
}
|
||||
|
||||
/// This is stored at the beginning in the shared memory area.
|
||||
struct SharedStruct {
|
||||
max_size: usize,
|
||||
|
||||
/// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
|
||||
current_size: AtomicUsize,
|
||||
}
|
||||
|
||||
const RESIZE_IN_PROGRESS: usize = 1 << 63;
|
||||
|
||||
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
|
||||
|
||||
/// Error type returned by the ShmemHandle functions.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("{msg}: {errno}")]
|
||||
pub struct Error {
|
||||
pub msg: String,
|
||||
pub errno: Errno,
|
||||
}
|
||||
|
||||
impl Error {
|
||||
fn new(msg: &str, errno: Errno) -> Error {
|
||||
Error {
|
||||
msg: msg.to_string(),
|
||||
errno,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShmemHandle {
|
||||
/// Create a new shared memory area. To communicate between processes, the processes need to be
|
||||
/// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
|
||||
///
|
||||
/// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
|
||||
/// processes can continue using it, however.
|
||||
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
|
||||
// create the backing anonymous file.
|
||||
let fd = create_backing_file(name)?;
|
||||
|
||||
Self::new_with_fd(fd, initial_size, max_size)
|
||||
}
|
||||
|
||||
fn new_with_fd(
|
||||
fd: OwnedFd,
|
||||
initial_size: usize,
|
||||
max_size: usize,
|
||||
) -> Result<ShmemHandle, Error> {
|
||||
// We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
|
||||
// is a little larger than this because of the SharedStruct header. Make the upper limit
|
||||
// somewhat smaller than that, because with anything close to that, you'll run out of
|
||||
// memory anyway.
|
||||
if max_size >= 1 << 48 {
|
||||
panic!("max size {} too large", max_size);
|
||||
}
|
||||
if initial_size > max_size {
|
||||
panic!("initial size {initial_size} larger than max size {max_size}");
|
||||
}
|
||||
|
||||
// The actual initial / max size is the one given by the caller, plus the size of
|
||||
// 'SharedStruct'.
|
||||
let initial_size = HEADER_SIZE + initial_size;
|
||||
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
|
||||
|
||||
// Reserve address space for it with mmap
|
||||
//
|
||||
// TODO: Use MAP_HUGETLB if possible
|
||||
let start_ptr = unsafe {
|
||||
nix_mmap(
|
||||
None,
|
||||
max_size,
|
||||
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
|
||||
MapFlags::MAP_SHARED,
|
||||
&fd,
|
||||
0,
|
||||
)
|
||||
}
|
||||
.map_err(|e| Error::new("mmap failed: {e}", e))?;
|
||||
|
||||
// Reserve space for the initial size
|
||||
enlarge_file(fd.as_fd(), initial_size as u64)?;
|
||||
|
||||
// Initialize the header
|
||||
let shared: NonNull<SharedStruct> = start_ptr.cast();
|
||||
unsafe {
|
||||
shared.write(SharedStruct {
|
||||
max_size: max_size.into(),
|
||||
current_size: AtomicUsize::new(initial_size),
|
||||
})
|
||||
};
|
||||
|
||||
// The user data begins after the header
|
||||
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
|
||||
|
||||
Ok(ShmemHandle {
|
||||
fd,
|
||||
max_size: max_size.into(),
|
||||
shared_ptr: shared,
|
||||
data_ptr,
|
||||
})
|
||||
}
|
||||
|
||||
// return reference to the header
|
||||
fn shared(&self) -> &SharedStruct {
|
||||
unsafe { self.shared_ptr.as_ref() }
|
||||
}
|
||||
|
||||
/// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
|
||||
/// when creating the area.
|
||||
///
|
||||
/// This may only be called from one process/thread concurrently. We detect that case
|
||||
/// and return an Error.
|
||||
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
|
||||
let new_size = new_size + HEADER_SIZE;
|
||||
let shared = self.shared();
|
||||
|
||||
if new_size > self.max_size {
|
||||
panic!(
|
||||
"new size ({} is greater than max size ({})",
|
||||
new_size, self.max_size
|
||||
);
|
||||
}
|
||||
assert_eq!(self.max_size, shared.max_size);
|
||||
|
||||
// Lock the area by setting the bit in 'current_size'
|
||||
//
|
||||
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
|
||||
// and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
|
||||
// since this is not performance-critical, better safe than sorry .
|
||||
let mut old_size = shared.current_size.load(Ordering::Acquire);
|
||||
loop {
|
||||
if (old_size & RESIZE_IN_PROGRESS) != 0 {
|
||||
return Err(Error::new(
|
||||
"concurrent resize detected",
|
||||
Errno::UnknownErrno,
|
||||
));
|
||||
}
|
||||
match shared.current_size.compare_exchange(
|
||||
old_size,
|
||||
new_size,
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => break,
|
||||
Err(x) => old_size = x,
|
||||
}
|
||||
}
|
||||
|
||||
// Ok, we got the lock.
|
||||
//
|
||||
// NB: If anything goes wrong, we *must* clear the bit!
|
||||
let result = {
|
||||
use std::cmp::Ordering::{Equal, Greater, Less};
|
||||
match new_size.cmp(&old_size) {
|
||||
Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
|
||||
Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
|
||||
}),
|
||||
Equal => Ok(()),
|
||||
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
|
||||
}
|
||||
};
|
||||
|
||||
// Unlock
|
||||
shared.current_size.store(
|
||||
if result.is_ok() { new_size } else { old_size },
|
||||
Ordering::Release,
|
||||
);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Returns the current user-visible size of the shared memory segment.
|
||||
///
|
||||
/// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
|
||||
/// responsibility not to access the area beyond the current size.
|
||||
pub fn current_size(&self) -> usize {
|
||||
let total_current_size =
|
||||
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
|
||||
total_current_size - HEADER_SIZE
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ShmemHandle {
|
||||
fn drop(&mut self) {
|
||||
// SAFETY: The pointer was obtained from mmap() with the given size.
|
||||
// We unmap the entire region.
|
||||
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
|
||||
// The fd is dropped automatically by OwnedFd.
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
|
||||
/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
|
||||
/// development and testing, but in production we want the file to stay in memory.
|
||||
///
|
||||
/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
|
||||
#[allow(unused_variables)]
|
||||
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
|
||||
.map_err(|e| Error::new("memfd_create failed: {e}", e))
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
let file = tempfile::tempfile().map_err(|e| {
|
||||
Error::new(
|
||||
"could not create temporary file to back shmem area: {e}",
|
||||
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
|
||||
)
|
||||
})?;
|
||||
Ok(OwnedFd::from(file))
|
||||
}
|
||||
}
|
||||
|
||||
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
|
||||
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
|
||||
// we don't get a segfault later when trying to actually use it.
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
|
||||
Error::new(
|
||||
"could not grow shmem segment, posix_fallocate failed: {e}",
|
||||
e,
|
||||
)
|
||||
})
|
||||
}
|
||||
// As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
nix::unistd::ftruncate(fd, size as i64)
|
||||
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use nix::unistd::ForkResult;
|
||||
use std::ops::Range;
|
||||
|
||||
/// check that all bytes in given range have the expected value.
|
||||
fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
|
||||
for i in range {
|
||||
let b = unsafe { *(ptr.add(i)) };
|
||||
assert_eq!(expected, b, "unexpected byte at offset {}", i);
|
||||
}
|
||||
}
|
||||
|
||||
/// Write 'b' to all bytes in the given range
|
||||
fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
|
||||
unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
|
||||
}
|
||||
|
||||
// simple single-process test of growing and shrinking
|
||||
#[test]
|
||||
fn test_shmem_resize() -> Result<(), Error> {
|
||||
let max_size = 1024 * 1024;
|
||||
let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
|
||||
|
||||
assert_eq!(init_struct.current_size(), 0);
|
||||
|
||||
// Initial grow
|
||||
let size1 = 10000;
|
||||
init_struct.set_size(size1).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size1);
|
||||
|
||||
// Write some data
|
||||
let data_ptr = init_struct.data_ptr.as_ptr();
|
||||
write_range(data_ptr, 0xAA, 0..size1);
|
||||
assert_range(data_ptr, 0xAA, 0..size1);
|
||||
|
||||
// Shrink
|
||||
let size2 = 5000;
|
||||
init_struct.set_size(size2).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size2);
|
||||
|
||||
// Grow again
|
||||
let size3 = 20000;
|
||||
init_struct.set_size(size3).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size3);
|
||||
|
||||
// Try to read it. The area that was shrunk and grown again should read as all zeros now
|
||||
assert_range(data_ptr, 0xAA, 0..5000);
|
||||
assert_range(data_ptr, 0, 5000..size1);
|
||||
|
||||
// Try to grow beyond max_size
|
||||
//let size4 = max_size + 1;
|
||||
//assert!(init_struct.set_size(size4).is_err());
|
||||
|
||||
// Dropping init_struct should unmap the memory
|
||||
drop(init_struct);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
|
||||
/// but is stored in the shared memory area and works across processes. It's implemented by
|
||||
/// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
|
||||
struct SimpleBarrier {
|
||||
num_procs: usize,
|
||||
count: AtomicUsize,
|
||||
}
|
||||
|
||||
impl SimpleBarrier {
|
||||
unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
|
||||
unsafe {
|
||||
*ptr = SimpleBarrier {
|
||||
num_procs,
|
||||
count: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn wait(&self) {
|
||||
let old = self.count.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
let generation = old / self.num_procs;
|
||||
|
||||
let mut current = old + 1;
|
||||
while current < (generation + 1) * self.num_procs {
|
||||
std::thread::sleep(std::time::Duration::from_millis(10));
|
||||
current = self.count.load(Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multi_process() {
|
||||
// Initialize
|
||||
let max_size = 1_000_000_000_000;
|
||||
let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
|
||||
let ptr = init_struct.data_ptr.as_ptr();
|
||||
|
||||
// Store the SimpleBarrier in the first 1k of the area.
|
||||
init_struct.set_size(10000).unwrap();
|
||||
let barrier_ptr: *mut SimpleBarrier = unsafe {
|
||||
ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
|
||||
.cast()
|
||||
};
|
||||
unsafe { SimpleBarrier::init(barrier_ptr, 2) };
|
||||
let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
|
||||
|
||||
// Fork another test process. The code after this runs in both processes concurrently.
|
||||
let fork_result = unsafe { nix::unistd::fork().unwrap() };
|
||||
|
||||
// In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
|
||||
if fork_result.is_parent() {
|
||||
write_range(ptr, 0xAA, 1000..2000);
|
||||
} else {
|
||||
write_range(ptr, 0xBB, 2000..3000);
|
||||
}
|
||||
barrier.wait();
|
||||
// Verify the contents. (in both processes)
|
||||
assert_range(ptr, 0xAA, 1000..2000);
|
||||
assert_range(ptr, 0xBB, 2000..3000);
|
||||
|
||||
// Grow, from the child this time
|
||||
let size = 10_000_000;
|
||||
if !fork_result.is_parent() {
|
||||
init_struct.set_size(size).unwrap();
|
||||
}
|
||||
barrier.wait();
|
||||
|
||||
// make some writes at the end
|
||||
if fork_result.is_parent() {
|
||||
write_range(ptr, 0xAA, (size - 10)..size);
|
||||
} else {
|
||||
write_range(ptr, 0xBB, (size - 20)..(size - 10));
|
||||
}
|
||||
barrier.wait();
|
||||
|
||||
// Verify the contents. (This runs in both processes)
|
||||
assert_range(ptr, 0, (size - 1000)..(size - 20));
|
||||
assert_range(ptr, 0xBB, (size - 20)..(size - 10));
|
||||
assert_range(ptr, 0xAA, (size - 10)..size);
|
||||
|
||||
if let ForkResult::Parent { child } = fork_result {
|
||||
nix::sys::wait::waitpid(child, None).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
14
libs/neonart/Cargo.toml
Normal file
@@ -0,0 +1,14 @@
|
||||
[package]
|
||||
name = "neonart"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
crossbeam-utils.workspace = true
|
||||
spin.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.9.1"
|
||||
rand_distr = "0.5.1"
|
||||
594
libs/neonart/src/algorithm.rs
Normal file
@@ -0,0 +1,594 @@
|
||||
mod lock_and_version;
|
||||
pub(crate) mod node_ptr;
|
||||
mod node_ref;
|
||||
|
||||
use std::vec::Vec;
|
||||
|
||||
use crate::algorithm::lock_and_version::ConcurrentUpdateError;
|
||||
use crate::algorithm::node_ptr::MAX_PREFIX_LEN;
|
||||
use crate::algorithm::node_ref::{NewNodeRef, NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
|
||||
use crate::allocator::OutOfMemoryError;
|
||||
|
||||
use crate::TreeWriteGuard;
|
||||
use crate::UpdateAction;
|
||||
use crate::allocator::ArtAllocator;
|
||||
use crate::epoch::EpochPin;
|
||||
use crate::{Key, Value};
|
||||
|
||||
pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum ArtError {
|
||||
ConcurrentUpdate, // need to retry
|
||||
OutOfMemory,
|
||||
}
|
||||
|
||||
impl From<ConcurrentUpdateError> for ArtError {
|
||||
fn from(_: ConcurrentUpdateError) -> ArtError {
|
||||
ArtError::ConcurrentUpdate
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OutOfMemoryError> for ArtError {
|
||||
fn from(_: OutOfMemoryError) -> ArtError {
|
||||
ArtError::OutOfMemory
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_root<V: Value>(
|
||||
allocator: &impl ArtAllocator<V>,
|
||||
) -> Result<RootPtr<V>, OutOfMemoryError> {
|
||||
node_ptr::new_root(allocator)
|
||||
}
|
||||
|
||||
pub(crate) fn search<'e, K: Key, V: Value>(
|
||||
key: &K,
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Option<&'e V> {
|
||||
loop {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
if let Ok(result) = lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) {
|
||||
break result;
|
||||
}
|
||||
// retry
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn iter_next<'e, V: Value>(
|
||||
key: &[u8],
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Option<(Vec<u8>, &'e V)> {
|
||||
loop {
|
||||
let mut path = Vec::new();
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
|
||||
match next_recurse(key, &mut path, root_ref, epoch_pin) {
|
||||
Ok(Some(v)) => {
|
||||
assert_eq!(path.len(), key.len());
|
||||
break Some((path, v));
|
||||
}
|
||||
Ok(None) => break None,
|
||||
Err(ConcurrentUpdateError()) => {
|
||||
// retry
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn update_fn<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
|
||||
key: &K,
|
||||
value_fn: F,
|
||||
root: RootPtr<V>,
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let value_fn_cell = std::cell::Cell::new(Some(value_fn));
|
||||
loop {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
|
||||
let key_bytes = key.as_bytes();
|
||||
|
||||
match update_recurse(
|
||||
key_bytes,
|
||||
this_value_fn,
|
||||
root_ref,
|
||||
None,
|
||||
None,
|
||||
guard,
|
||||
0,
|
||||
key_bytes,
|
||||
) {
|
||||
Ok(()) => break Ok(()),
|
||||
Err(ArtError::ConcurrentUpdate) => {
|
||||
continue; // retry
|
||||
}
|
||||
Err(ArtError::OutOfMemory) => break Err(OutOfMemoryError()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Error means you must retry.
|
||||
//
|
||||
// This corresponds to the 'lookupOpt' function in the paper
|
||||
fn lookup_recurse<'e, V: Value>(
|
||||
key: &[u8],
|
||||
node: NodeRef<'e, V>,
|
||||
parent: Option<ReadLockedNodeRef<V>>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Result<Option<&'e V>, ConcurrentUpdateError> {
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
if let Some(parent) = parent {
|
||||
parent.read_unlock_or_restart()?;
|
||||
}
|
||||
|
||||
// check if the prefix matches, may increment level
|
||||
let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
|
||||
prefix_len
|
||||
} else {
|
||||
rnode.read_unlock_or_restart()?;
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
if rnode.is_leaf() {
|
||||
assert_eq!(key.len(), prefix_len);
|
||||
let vptr = rnode.get_leaf_value_ptr()?;
|
||||
// safety: It's OK to return a ref of the pointer because we checked the version
|
||||
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
|
||||
// as long as the epoch is pinned.
|
||||
let v = unsafe { vptr.as_ref().unwrap() };
|
||||
return Ok(Some(v));
|
||||
}
|
||||
|
||||
let key = &key[prefix_len..];
|
||||
|
||||
// find child (or leaf value)
|
||||
let next_node = rnode.find_child_or_restart(key[0])?;
|
||||
|
||||
match next_node {
|
||||
None => Ok(None), // key not found
|
||||
Some(child) => lookup_recurse(&key[1..], child, Some(rnode), epoch_pin),
|
||||
}
|
||||
}
|
||||
|
||||
fn next_recurse<'e, V: Value>(
|
||||
min_key: &[u8],
|
||||
path: &mut Vec<u8>,
|
||||
node: NodeRef<'e, V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Result<Option<&'e V>, ConcurrentUpdateError> {
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
let prefix = rnode.get_prefix();
|
||||
if prefix.len() != 0 {
|
||||
path.extend_from_slice(prefix);
|
||||
}
|
||||
|
||||
use std::cmp::Ordering;
|
||||
let comparison = path.as_slice().cmp(&min_key[0..path.len()]);
|
||||
if comparison == Ordering::Less {
|
||||
rnode.read_unlock_or_restart()?;
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if rnode.is_leaf() {
|
||||
assert_eq!(path.len(), min_key.len());
|
||||
let vptr = rnode.get_leaf_value_ptr()?;
|
||||
// safety: It's OK to return a ref of the pointer because we checked the version
|
||||
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
|
||||
// as long as the epoch is pinned.
|
||||
let v = unsafe { vptr.as_ref().unwrap() };
|
||||
return Ok(Some(v));
|
||||
}
|
||||
|
||||
let mut min_key_byte = match comparison {
|
||||
Ordering::Less => unreachable!(), // checked this above already
|
||||
Ordering::Equal => min_key[path.len()],
|
||||
Ordering::Greater => 0,
|
||||
};
|
||||
|
||||
loop {
|
||||
match rnode.find_next_child_or_restart(min_key_byte)? {
|
||||
None => {
|
||||
return Ok(None);
|
||||
}
|
||||
Some((key_byte, child_ref)) => {
|
||||
let path_len = path.len();
|
||||
path.push(key_byte);
|
||||
let result = next_recurse(min_key, path, child_ref, epoch_pin)?;
|
||||
if result.is_some() {
|
||||
return Ok(result);
|
||||
}
|
||||
if key_byte == u8::MAX {
|
||||
return Ok(None);
|
||||
}
|
||||
path.truncate(path_len);
|
||||
min_key_byte = key_byte + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This corresponds to the 'insertOpt' function in the paper
|
||||
pub(crate) fn update_recurse<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
|
||||
key: &[u8],
|
||||
value_fn: F,
|
||||
node: NodeRef<'e, V>,
|
||||
rparent: Option<(ReadLockedNodeRef<V>, u8)>,
|
||||
rgrandparent: Option<(ReadLockedNodeRef<V>, u8)>,
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
level: usize,
|
||||
orig_key: &[u8],
|
||||
) -> Result<(), ArtError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
|
||||
let prefix_match_len = rnode.prefix_matches(key);
|
||||
if prefix_match_len.is_none() {
|
||||
let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
match value_fn(None) {
|
||||
UpdateAction::Nothing => {}
|
||||
UpdateAction::Insert(new_value) => {
|
||||
insert_split_prefix(key, new_value, &mut wnode, &mut wparent, parent_key, guard)?;
|
||||
}
|
||||
UpdateAction::Remove => {
|
||||
panic!("unexpected Remove action on insertion");
|
||||
}
|
||||
}
|
||||
wnode.write_unlock();
|
||||
wparent.write_unlock();
|
||||
return Ok(());
|
||||
}
|
||||
let prefix_match_len = prefix_match_len.unwrap();
|
||||
let key = &key[prefix_match_len as usize..];
|
||||
let level = level + prefix_match_len as usize;
|
||||
|
||||
if rnode.is_leaf() {
|
||||
assert_eq!(key.len(), 0);
|
||||
let (rparent, parent_key) = rparent.expect("root cannot be leaf");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
// safety: Now that we have acquired the write lock, we have exclusive access to the
|
||||
// value. XXX: There might be concurrent reads though?
|
||||
let value_mut = wnode.get_leaf_value_mut();
|
||||
|
||||
match value_fn(Some(value_mut)) {
|
||||
UpdateAction::Nothing => {
|
||||
wparent.write_unlock();
|
||||
wnode.write_unlock();
|
||||
}
|
||||
UpdateAction::Insert(_) => panic!("cannot insert over existing value"),
|
||||
UpdateAction::Remove => {
|
||||
guard.remember_obsolete_node(wnode.as_ptr());
|
||||
wparent.delete_child(parent_key);
|
||||
wnode.write_unlock_obsolete();
|
||||
|
||||
if let Some(rgrandparent) = rgrandparent {
|
||||
// FIXME: Ignore concurrency error. It doesn't lead to
|
||||
// corruption, but it means we might leak something. Until
|
||||
// another update cleans it up.
|
||||
let _ = cleanup_parent(wparent, rgrandparent, guard);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let next_node = rnode.find_child_or_restart(key[0])?;
|
||||
|
||||
if next_node.is_none() {
|
||||
if rnode.is_full() {
|
||||
let (rparent, parent_key) = rparent.expect("root node cannot become full");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
match value_fn(None) {
|
||||
UpdateAction::Nothing => {
|
||||
wnode.write_unlock();
|
||||
wparent.write_unlock();
|
||||
}
|
||||
UpdateAction::Insert(new_value) => {
|
||||
insert_and_grow(key, new_value, wnode, &mut wparent, parent_key, guard)?;
|
||||
wparent.write_unlock();
|
||||
}
|
||||
UpdateAction::Remove => {
|
||||
panic!("unexpected Remove action on insertion");
|
||||
}
|
||||
};
|
||||
} else {
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
if let Some((rparent, _)) = rparent {
|
||||
rparent.read_unlock_or_restart()?;
|
||||
}
|
||||
match value_fn(None) {
|
||||
UpdateAction::Nothing => {}
|
||||
UpdateAction::Insert(new_value) => {
|
||||
insert_to_node(&mut wnode, key, new_value, guard)?;
|
||||
}
|
||||
UpdateAction::Remove => {
|
||||
panic!("unexpected Remove action on insertion");
|
||||
}
|
||||
};
|
||||
wnode.write_unlock();
|
||||
}
|
||||
return Ok(());
|
||||
} else {
|
||||
let next_child = next_node.unwrap(); // checked above it's not None
|
||||
if let Some((ref rparent, _)) = rparent {
|
||||
rparent.check_or_restart()?;
|
||||
}
|
||||
|
||||
// recurse to next level
|
||||
update_recurse(
|
||||
&key[1..],
|
||||
value_fn,
|
||||
next_child,
|
||||
Some((rnode, key[0])),
|
||||
rparent,
|
||||
guard,
|
||||
level + 1,
|
||||
orig_key,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
enum PathElement {
|
||||
Prefix(Vec<u8>),
|
||||
KeyByte(u8),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for PathElement {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
match self {
|
||||
PathElement::Prefix(prefix) => write!(fmt, "{:?}", prefix),
|
||||
PathElement::KeyByte(key_byte) => write!(fmt, "{}", key_byte),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn dump_tree<'e, V: Value + std::fmt::Debug>(
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
dst: &mut dyn std::io::Write,
|
||||
) {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
|
||||
let _ = dump_recurse(&[], root_ref, &epoch_pin, 0, dst);
|
||||
}
|
||||
|
||||
// TODO: return an Err if writeln!() returns error, instead of unwrapping
|
||||
fn dump_recurse<'e, V: Value + std::fmt::Debug>(
|
||||
path: &[PathElement],
|
||||
node: NodeRef<'e, V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
level: usize,
|
||||
dst: &mut dyn std::io::Write,
|
||||
) -> Result<(), ConcurrentUpdateError> {
|
||||
let indent = str::repeat(" ", level);
|
||||
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
let mut path = Vec::from(path);
|
||||
let prefix = rnode.get_prefix();
|
||||
if prefix.len() != 0 {
|
||||
path.push(PathElement::Prefix(Vec::from(prefix)));
|
||||
}
|
||||
|
||||
if rnode.is_leaf() {
|
||||
let vptr = rnode.get_leaf_value_ptr()?;
|
||||
// safety: It's OK to return a ref of the pointer because we checked the version
|
||||
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
|
||||
// as long as the epoch is pinned.
|
||||
let val = unsafe { vptr.as_ref().unwrap() };
|
||||
writeln!(dst, "{} {:?}: {:?}", indent, path, val).unwrap();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
for key_byte in 0..=u8::MAX {
|
||||
match rnode.find_child_or_restart(key_byte)? {
|
||||
None => continue,
|
||||
Some(child_ref) => {
|
||||
let rchild = child_ref.read_lock_or_restart()?;
|
||||
writeln!(
|
||||
dst,
|
||||
"{} {:?}, {}: prefix {:?}",
|
||||
indent,
|
||||
&path,
|
||||
key_byte,
|
||||
rchild.get_prefix()
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let mut child_path = path.clone();
|
||||
child_path.push(PathElement::KeyByte(key_byte));
|
||||
|
||||
dump_recurse(&child_path, child_ref, epoch_pin, level + 1, dst)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///```text
|
||||
/// [fooba]r -> value
|
||||
///
|
||||
/// [foo]b -> [a]r -> value
|
||||
/// e -> [ls]e -> value
|
||||
///```
|
||||
fn insert_split_prefix<'e, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
node: &mut WriteLockedNodeRef<V>,
|
||||
parent: &mut WriteLockedNodeRef<V>,
|
||||
parent_key: u8,
|
||||
guard: &'e TreeWriteGuard<K, V, A>,
|
||||
) -> Result<(), OutOfMemoryError> {
|
||||
let old_node = node;
|
||||
let old_prefix = old_node.get_prefix();
|
||||
let common_prefix_len = common_prefix(key, old_prefix);
|
||||
|
||||
// Allocate a node for the new value.
|
||||
let new_value_node = allocate_node_for_value(
|
||||
&key[common_prefix_len + 1..],
|
||||
value,
|
||||
guard.tree_writer.allocator,
|
||||
)?;
|
||||
|
||||
// Allocate a new internal node with the common prefix
|
||||
// FIXME: deallocate 'new_value_node' on OOM
|
||||
let mut prefix_node =
|
||||
node_ref::new_internal(&key[..common_prefix_len], guard.tree_writer.allocator)?;
|
||||
|
||||
// Add the old node and the new nodes to the new internal node
|
||||
prefix_node.insert_old_child(old_prefix[common_prefix_len], old_node);
|
||||
prefix_node.insert_new_child(key[common_prefix_len], new_value_node);
|
||||
|
||||
// Modify the prefix of the old child in place
|
||||
old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
|
||||
|
||||
// replace the pointer in the parent
|
||||
parent.replace_child(parent_key, prefix_node.into_ptr());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_to_node<'e, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
wnode: &mut WriteLockedNodeRef<V>,
|
||||
key: &[u8],
|
||||
value: V,
|
||||
guard: &'e TreeWriteGuard<K, V, A>,
|
||||
) -> Result<(), OutOfMemoryError> {
|
||||
let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
|
||||
wnode.insert_child(key[0], value_child.into_ptr());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// On entry: 'parent' and 'node' are locked
|
||||
fn insert_and_grow<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
wnode: WriteLockedNodeRef<V>,
|
||||
parent: &mut WriteLockedNodeRef<V>,
|
||||
parent_key_byte: u8,
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
) -> Result<(), ArtError> {
|
||||
let mut bigger_node = wnode.grow(guard.tree_writer.allocator)?;
|
||||
|
||||
// FIXME: deallocate 'bigger_node' on OOM
|
||||
let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
|
||||
bigger_node.insert_new_child(key[0], value_child);
|
||||
|
||||
// Replace the pointer in the parent
|
||||
parent.replace_child(parent_key_byte, bigger_node.into_ptr());
|
||||
|
||||
guard.remember_obsolete_node(wnode.as_ptr());
|
||||
wnode.write_unlock_obsolete();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cleanup_parent<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
wparent: WriteLockedNodeRef<V>,
|
||||
rgrandparent: (ReadLockedNodeRef<V>, u8),
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
) -> Result<(), ArtError> {
|
||||
let (rgrandparent, grandparent_key_byte) = rgrandparent;
|
||||
|
||||
// If the parent becomes completely empty after the deletion, remove the parent from the
|
||||
// grandparent. (This case is possible because we reserve only 8 bytes for the prefix.)
|
||||
// TODO: not implemented.
|
||||
|
||||
// If the parent has only one child, replace the parent with the remaining child. (This is not
|
||||
// possible if the child's prefix field cannot absorb the parent's)
|
||||
if wparent.num_children() == 1 {
|
||||
// Try to lock the remaining child. This can fail if the child is updated
|
||||
// concurrently.
|
||||
let (key_byte, remaining_child) = wparent.find_remaining_child();
|
||||
|
||||
let mut wremaining_child = remaining_child.write_lock_or_restart()?;
|
||||
|
||||
if 1 + wremaining_child.get_prefix().len() + wparent.get_prefix().len() <= MAX_PREFIX_LEN {
|
||||
let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
// Ok, we have locked the leaf, the parent, the grandparent, and the parent's only
|
||||
// remaining leaf. Proceed with the updates.
|
||||
|
||||
// Update the prefix on the remaining leaf
|
||||
wremaining_child.prepend_prefix(wparent.get_prefix(), key_byte);
|
||||
|
||||
// Replace the pointer in the grandparent to point directly to the remaining leaf
|
||||
wgrandparent.replace_child(grandparent_key_byte, wremaining_child.as_ptr());
|
||||
|
||||
// Mark the parent as deleted.
|
||||
guard.remember_obsolete_node(wparent.as_ptr());
|
||||
wparent.write_unlock_obsolete();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
// If the parent's children would fit on a smaller node type after the deletion, replace it with
|
||||
// a smaller node.
|
||||
if wparent.can_shrink() {
|
||||
let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
|
||||
let smaller_node = wparent.shrink(guard.tree_writer.allocator)?;
|
||||
|
||||
// Replace the pointer in the grandparent
|
||||
wgrandparent.replace_child(grandparent_key_byte, smaller_node.into_ptr());
|
||||
|
||||
guard.remember_obsolete_node(wparent.as_ptr());
|
||||
wparent.write_unlock_obsolete();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// nothing to do
|
||||
wparent.write_unlock();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Allocate a new leaf node to hold 'value'. If the key is long, we
|
||||
// may need to allocate new internal nodes to hold it too
|
||||
fn allocate_node_for_value<'a, V: Value, A: ArtAllocator<V>>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError> {
|
||||
let mut prefix_off = key.len().saturating_sub(MAX_PREFIX_LEN);
|
||||
|
||||
let leaf_node = node_ref::new_leaf(&key[prefix_off..key.len()], value, allocator)?;
|
||||
|
||||
let mut node = leaf_node;
|
||||
while prefix_off > 0 {
|
||||
// Need another internal node
|
||||
let remain_prefix = &key[0..prefix_off];
|
||||
|
||||
prefix_off = remain_prefix.len().saturating_sub(MAX_PREFIX_LEN + 1);
|
||||
let mut internal_node = node_ref::new_internal(
|
||||
&remain_prefix[prefix_off..remain_prefix.len() - 1],
|
||||
allocator,
|
||||
)?;
|
||||
internal_node.insert_new_child(*remain_prefix.last().unwrap(), node);
|
||||
node = internal_node;
|
||||
}
|
||||
|
||||
Ok(node)
|
||||
}
|
||||
|
||||
fn common_prefix(a: &[u8], b: &[u8]) -> usize {
|
||||
for i in 0..MAX_PREFIX_LEN {
|
||||
if a[i] != b[i] {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
panic!("prefixes are equal");
|
||||
}
|
||||
117
libs/neonart/src/algorithm/lock_and_version.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
//! Each node in the tree has contains one atomic word that stores three things:
|
||||
//!
|
||||
//! Bit 0: set if the node is "obsolete". An obsolete node has been removed from the tree,
|
||||
//! but might still be accessed by concurrent readers until the epoch expires.
|
||||
//! Bit 1: set if the node is currently write-locked. Used as a spinlock.
|
||||
//! Bits 2-63: Version number, incremented every time the node is modified.
|
||||
//!
|
||||
//! AtomicLockAndVersion represents that.
|
||||
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
pub(crate) struct ConcurrentUpdateError();
|
||||
|
||||
pub(crate) struct AtomicLockAndVersion {
|
||||
inner: AtomicU64,
|
||||
}
|
||||
|
||||
impl AtomicLockAndVersion {
|
||||
pub(crate) fn new() -> AtomicLockAndVersion {
|
||||
AtomicLockAndVersion {
|
||||
inner: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AtomicLockAndVersion {
|
||||
pub(crate) fn read_lock_or_restart(&self) -> Result<u64, ConcurrentUpdateError> {
|
||||
let version = self.await_node_unlocked();
|
||||
if is_obsolete(version) {
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(version)
|
||||
}
|
||||
|
||||
pub(crate) fn check_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
|
||||
self.read_unlock_or_restart(version)
|
||||
}
|
||||
|
||||
pub(crate) fn read_unlock_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
|
||||
if self.inner.load(Ordering::Acquire) != version {
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn upgrade_to_write_lock_or_restart(
|
||||
&self,
|
||||
version: u64,
|
||||
) -> Result<(), ConcurrentUpdateError> {
|
||||
if self
|
||||
.inner
|
||||
.compare_exchange(
|
||||
version,
|
||||
set_locked_bit(version),
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
)
|
||||
.is_err()
|
||||
{
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn write_lock_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
|
||||
let old = self.inner.load(Ordering::Relaxed);
|
||||
if is_obsolete(old) || is_locked(old) {
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
if self
|
||||
.inner
|
||||
.compare_exchange(
|
||||
old,
|
||||
set_locked_bit(old),
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
)
|
||||
.is_err()
|
||||
{
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock(&self) {
|
||||
// reset locked bit and overflow into version
|
||||
self.inner.fetch_add(2, Ordering::Release);
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock_obsolete(&self) {
|
||||
// set obsolete, reset locked, overflow into version
|
||||
self.inner.fetch_add(3, Ordering::Release);
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
fn await_node_unlocked(&self) -> u64 {
|
||||
let mut version = self.inner.load(Ordering::Acquire);
|
||||
while is_locked(version) {
|
||||
// spinlock
|
||||
std::thread::yield_now();
|
||||
version = self.inner.load(Ordering::Acquire)
|
||||
}
|
||||
version
|
||||
}
|
||||
}
|
||||
|
||||
fn set_locked_bit(version: u64) -> u64 {
|
||||
return version + 2;
|
||||
}
|
||||
|
||||
fn is_obsolete(version: u64) -> bool {
|
||||
return (version & 1) == 1;
|
||||
}
|
||||
|
||||
fn is_locked(version: u64) -> bool {
|
||||
return (version & 2) == 2;
|
||||
}
|
||||
1102
libs/neonart/src/algorithm/node_ptr.rs
Normal file
349
libs/neonart/src/algorithm/node_ref.rs
Normal file
@@ -0,0 +1,349 @@
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use super::node_ptr;
|
||||
use super::node_ptr::NodePtr;
|
||||
use crate::EpochPin;
|
||||
use crate::Value;
|
||||
use crate::algorithm::lock_and_version::AtomicLockAndVersion;
|
||||
use crate::algorithm::lock_and_version::ConcurrentUpdateError;
|
||||
use crate::allocator::ArtAllocator;
|
||||
use crate::allocator::OutOfMemoryError;
|
||||
|
||||
pub struct NodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
|
||||
phantom: PhantomData<&'e EpochPin<'e>>,
|
||||
}
|
||||
|
||||
impl<'e, V> Debug for NodeRef<'e, V> {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
write!(fmt, "{:?}", self.ptr)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'e, V: Value> NodeRef<'e, V> {
|
||||
pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
|
||||
NodeRef {
|
||||
ptr: root_ptr,
|
||||
phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn read_lock_or_restart(
|
||||
&self,
|
||||
) -> Result<ReadLockedNodeRef<'e, V>, ConcurrentUpdateError> {
|
||||
let version = self.lockword().read_lock_or_restart()?;
|
||||
Ok(ReadLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
version,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn write_lock_or_restart(
|
||||
&self,
|
||||
) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
|
||||
self.lockword().write_lock_or_restart()?;
|
||||
Ok(WriteLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
fn lockword(&self) -> &AtomicLockAndVersion {
|
||||
self.ptr.lockword()
|
||||
}
|
||||
}
|
||||
|
||||
/// A reference to a node that has been optimistically read-locked. The functions re-check
|
||||
/// the version after each read.
|
||||
pub struct ReadLockedNodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
version: u64,
|
||||
|
||||
phantom: PhantomData<&'e EpochPin<'e>>,
|
||||
}
|
||||
|
||||
impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
|
||||
pub(crate) fn is_leaf(&self) -> bool {
|
||||
self.ptr.is_leaf()
|
||||
}
|
||||
|
||||
pub(crate) fn is_full(&self) -> bool {
|
||||
self.ptr.is_full()
|
||||
}
|
||||
|
||||
pub(crate) fn get_prefix(&self) -> &[u8] {
|
||||
self.ptr.get_prefix()
|
||||
}
|
||||
|
||||
/// Note: because we're only holding a read lock, the prefix can change concurrently.
|
||||
/// You must be prepared to restart, if read_unlock() returns error later.
|
||||
///
|
||||
/// Returns the length of the prefix, or None if it's not a match
|
||||
pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
|
||||
self.ptr.prefix_matches(key)
|
||||
}
|
||||
|
||||
pub(crate) fn find_child_or_restart(
|
||||
&self,
|
||||
key_byte: u8,
|
||||
) -> Result<Option<NodeRef<'e, V>>, ConcurrentUpdateError> {
|
||||
let child_or_value = self.ptr.find_child(key_byte);
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
match child_or_value {
|
||||
None => Ok(None),
|
||||
Some(child_ptr) => Ok(Some(NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
})),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn find_next_child_or_restart(
|
||||
&self,
|
||||
min_key_byte: u8,
|
||||
) -> Result<Option<(u8, NodeRef<'e, V>)>, ConcurrentUpdateError> {
|
||||
let child_or_value = self.ptr.find_next_child(min_key_byte);
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
match child_or_value {
|
||||
None => Ok(None),
|
||||
Some((k, child_ptr)) => Ok(Some((
|
||||
k,
|
||||
NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
},
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_leaf_value_ptr(&self) -> Result<*const V, ConcurrentUpdateError> {
|
||||
let result = self.ptr.get_leaf_value();
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
// Extend the lifetime.
|
||||
let result = std::ptr::from_ref(result);
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub(crate) fn upgrade_to_write_lock_or_restart(
|
||||
self,
|
||||
) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
|
||||
self.ptr
|
||||
.lockword()
|
||||
.upgrade_to_write_lock_or_restart(self.version)?;
|
||||
|
||||
Ok(WriteLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn read_unlock_or_restart(self) -> Result<(), ConcurrentUpdateError> {
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn check_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A reference to a node that has been optimistically read-locked. The functions re-check
|
||||
/// the version after each read.
|
||||
pub struct WriteLockedNodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
phantom: PhantomData<&'e EpochPin<'e>>,
|
||||
}
|
||||
|
||||
impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
|
||||
pub(crate) fn can_shrink(&self) -> bool {
|
||||
self.ptr.can_shrink()
|
||||
}
|
||||
|
||||
pub(crate) fn num_children(&self) -> usize {
|
||||
self.ptr.num_children()
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock(mut self) {
|
||||
self.ptr.lockword().write_unlock();
|
||||
self.ptr = NodePtr::null();
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock_obsolete(mut self) {
|
||||
self.ptr.lockword().write_unlock_obsolete();
|
||||
self.ptr = NodePtr::null();
|
||||
}
|
||||
|
||||
pub(crate) fn get_prefix(&self) -> &[u8] {
|
||||
self.ptr.get_prefix()
|
||||
}
|
||||
|
||||
pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
self.ptr.truncate_prefix(new_prefix_len)
|
||||
}
|
||||
|
||||
pub(crate) fn prepend_prefix(&mut self, prefix: &[u8], prefix_byte: u8) {
|
||||
self.ptr.prepend_prefix(prefix, prefix_byte)
|
||||
}
|
||||
|
||||
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
self.ptr.insert_child(key_byte, child)
|
||||
}
|
||||
|
||||
pub(crate) fn get_leaf_value_mut(&mut self) -> &mut V {
|
||||
self.ptr.get_leaf_value_mut()
|
||||
}
|
||||
|
||||
pub(crate) fn grow<'a, A>(
|
||||
&self,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
let new_node = self.ptr.grow(allocator)?;
|
||||
Ok(NewNodeRef {
|
||||
ptr: new_node,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn shrink<'a, A>(
|
||||
&self,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
let new_node = self.ptr.shrink(allocator)?;
|
||||
Ok(NewNodeRef {
|
||||
ptr: new_node,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn as_ptr(&self) -> NodePtr<V> {
|
||||
self.ptr
|
||||
}
|
||||
|
||||
pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
self.ptr.replace_child(key_byte, replacement);
|
||||
}
|
||||
|
||||
pub(crate) fn delete_child(&mut self, key_byte: u8) {
|
||||
self.ptr.delete_child(key_byte);
|
||||
}
|
||||
|
||||
pub(crate) fn find_remaining_child(&self) -> (u8, NodeRef<'e, V>) {
|
||||
assert_eq!(self.num_children(), 1);
|
||||
let child_or_value = self.ptr.find_next_child(0);
|
||||
|
||||
match child_or_value {
|
||||
None => panic!("could not find only child in node"),
|
||||
Some((k, child_ptr)) => (
|
||||
k,
|
||||
NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
self.ptr.lockword().write_unlock();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct NewNodeRef<'a, V, A>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
ptr: NodePtr<V>,
|
||||
allocator: &'a A,
|
||||
|
||||
extra_nodes: Vec<NodePtr<V>>,
|
||||
}
|
||||
|
||||
impl<'a, V, A> NewNodeRef<'a, V, A>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
pub(crate) fn insert_old_child(&mut self, key_byte: u8, child: &WriteLockedNodeRef<V>) {
|
||||
self.ptr.insert_child(key_byte, child.as_ptr())
|
||||
}
|
||||
|
||||
pub(crate) fn into_ptr(mut self) -> NodePtr<V> {
|
||||
let ptr = self.ptr;
|
||||
self.ptr = NodePtr::null();
|
||||
ptr
|
||||
}
|
||||
|
||||
pub(crate) fn insert_new_child(&mut self, key_byte: u8, child: NewNodeRef<'a, V, A>) {
|
||||
let child_ptr = child.into_ptr();
|
||||
self.ptr.insert_child(key_byte, child_ptr);
|
||||
self.extra_nodes.push(child_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, V, A> Drop for NewNodeRef<'a, V, A>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
/// This drop implementation deallocates the newly allocated node, if into_ptr() was not called.
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
self.ptr.deallocate(self.allocator);
|
||||
for p in self.extra_nodes.iter() {
|
||||
p.deallocate(self.allocator);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new_internal<'a, V, A>(
|
||||
prefix: &[u8],
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
Ok(NewNodeRef {
|
||||
ptr: node_ptr::new_internal(prefix, allocator)?,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn new_leaf<'a, V, A>(
|
||||
prefix: &[u8],
|
||||
value: V,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
Ok(NewNodeRef {
|
||||
ptr: node_ptr::new_leaf(prefix, value, allocator)?,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
158
libs/neonart/src/allocator.rs
Normal file
@@ -0,0 +1,158 @@
|
||||
pub mod block;
|
||||
mod multislab;
|
||||
mod slab;
|
||||
pub mod r#static;
|
||||
|
||||
use std::alloc::Layout;
|
||||
use std::marker::PhantomData;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use crate::allocator::multislab::MultiSlabAllocator;
|
||||
use crate::allocator::r#static::alloc_from_slice;
|
||||
|
||||
use spin;
|
||||
|
||||
use crate::Tree;
|
||||
pub use crate::algorithm::node_ptr::{
|
||||
NodeInternal4, NodeInternal16, NodeInternal48, NodeInternal256, NodeLeaf,
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct OutOfMemoryError();
|
||||
|
||||
pub trait ArtAllocator<V: crate::Value> {
|
||||
fn alloc_tree(&self) -> *mut Tree<V>;
|
||||
|
||||
fn alloc_node_internal4(&self) -> *mut NodeInternal4<V>;
|
||||
fn alloc_node_internal16(&self) -> *mut NodeInternal16<V>;
|
||||
fn alloc_node_internal48(&self) -> *mut NodeInternal48<V>;
|
||||
fn alloc_node_internal256(&self) -> *mut NodeInternal256<V>;
|
||||
fn alloc_node_leaf(&self) -> *mut NodeLeaf<V>;
|
||||
|
||||
fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>);
|
||||
fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>);
|
||||
fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>);
|
||||
fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>);
|
||||
fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>);
|
||||
}
|
||||
|
||||
pub struct ArtMultiSlabAllocator<'t, V>
|
||||
where
|
||||
V: crate::Value,
|
||||
{
|
||||
tree_area: spin::Mutex<Option<&'t mut MaybeUninit<Tree<V>>>>,
|
||||
|
||||
pub(crate) inner: MultiSlabAllocator<'t, 5>,
|
||||
|
||||
phantom_val: PhantomData<V>,
|
||||
}
|
||||
|
||||
impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
|
||||
const LAYOUTS: [Layout; 5] = [
|
||||
Layout::new::<NodeInternal4<V>>(),
|
||||
Layout::new::<NodeInternal16<V>>(),
|
||||
Layout::new::<NodeInternal48<V>>(),
|
||||
Layout::new::<NodeInternal256<V>>(),
|
||||
Layout::new::<NodeLeaf<V>>(),
|
||||
];
|
||||
|
||||
pub fn new(area: &'t mut [MaybeUninit<u8>]) -> &'t mut ArtMultiSlabAllocator<'t, V> {
|
||||
let (allocator_area, remain) = alloc_from_slice::<ArtMultiSlabAllocator<V>>(area);
|
||||
let (tree_area, remain) = alloc_from_slice::<Tree<V>>(remain);
|
||||
|
||||
let allocator = allocator_area.write(ArtMultiSlabAllocator {
|
||||
tree_area: spin::Mutex::new(Some(tree_area)),
|
||||
inner: MultiSlabAllocator::new(remain, &Self::LAYOUTS),
|
||||
phantom_val: PhantomData,
|
||||
});
|
||||
|
||||
allocator
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, V: crate::Value> ArtAllocator<V> for ArtMultiSlabAllocator<'t, V> {
|
||||
fn alloc_tree(&self) -> *mut Tree<V> {
|
||||
let mut t = self.tree_area.lock();
|
||||
if let Some(tree_area) = t.take() {
|
||||
return tree_area.as_mut_ptr().cast();
|
||||
}
|
||||
panic!("cannot allocate more than one tree");
|
||||
}
|
||||
|
||||
fn alloc_node_internal4(&self) -> *mut NodeInternal4<V> {
|
||||
self.inner.alloc_slab(0).cast()
|
||||
}
|
||||
fn alloc_node_internal16(&self) -> *mut NodeInternal16<V> {
|
||||
self.inner.alloc_slab(1).cast()
|
||||
}
|
||||
fn alloc_node_internal48(&self) -> *mut NodeInternal48<V> {
|
||||
self.inner.alloc_slab(2).cast()
|
||||
}
|
||||
fn alloc_node_internal256(&self) -> *mut NodeInternal256<V> {
|
||||
self.inner.alloc_slab(3).cast()
|
||||
}
|
||||
fn alloc_node_leaf(&self) -> *mut NodeLeaf<V> {
|
||||
self.inner.alloc_slab(4).cast()
|
||||
}
|
||||
|
||||
fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>) {
|
||||
self.inner.dealloc_slab(0, ptr.cast())
|
||||
}
|
||||
|
||||
fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>) {
|
||||
self.inner.dealloc_slab(1, ptr.cast())
|
||||
}
|
||||
fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>) {
|
||||
self.inner.dealloc_slab(2, ptr.cast())
|
||||
}
|
||||
fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>) {
|
||||
self.inner.dealloc_slab(3, ptr.cast())
|
||||
}
|
||||
fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>) {
|
||||
self.inner.dealloc_slab(4, ptr.cast())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
|
||||
pub(crate) fn get_statistics(&self) -> ArtMultiSlabStats {
|
||||
ArtMultiSlabStats {
|
||||
num_internal4: self.inner.slab_descs[0]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_internal16: self.inner.slab_descs[1]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_internal48: self.inner.slab_descs[2]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_internal256: self.inner.slab_descs[3]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_leaf: self.inner.slab_descs[4]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
|
||||
num_blocks_internal4: self.inner.slab_descs[0].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_internal16: self.inner.slab_descs[1].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_internal48: self.inner.slab_descs[2].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_internal256: self.inner.slab_descs[3].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_leaf: self.inner.slab_descs[4].num_blocks.load(Ordering::Relaxed),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ArtMultiSlabStats {
|
||||
pub num_internal4: u64,
|
||||
pub num_internal16: u64,
|
||||
pub num_internal48: u64,
|
||||
pub num_internal256: u64,
|
||||
pub num_leaf: u64,
|
||||
|
||||
pub num_blocks_internal4: u64,
|
||||
pub num_blocks_internal16: u64,
|
||||
pub num_blocks_internal48: u64,
|
||||
pub num_blocks_internal256: u64,
|
||||
pub num_blocks_leaf: u64,
|
||||
}
|
||||
191
libs/neonart/src/allocator/block.rs
Normal file
@@ -0,0 +1,191 @@
|
||||
//! Simple allocator of fixed-size blocks
|
||||
|
||||
use std::mem::MaybeUninit;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use spin;
|
||||
|
||||
pub const BLOCK_SIZE: usize = 16 * 1024;
|
||||
|
||||
const INVALID_BLOCK: u64 = u64::MAX;
|
||||
|
||||
pub(crate) struct BlockAllocator<'t> {
|
||||
blocks_ptr: &'t [MaybeUninit<u8>],
|
||||
num_blocks: u64,
|
||||
num_initialized: AtomicU64,
|
||||
|
||||
freelist_head: spin::Mutex<u64>,
|
||||
}
|
||||
|
||||
struct FreeListBlock {
|
||||
inner: spin::Mutex<FreeListBlockInner>,
|
||||
}
|
||||
|
||||
struct FreeListBlockInner {
|
||||
next: u64,
|
||||
|
||||
num_free_blocks: u64,
|
||||
free_blocks: [u64; 100], // FIXME: fill the rest of the block
|
||||
}
|
||||
|
||||
impl<'t> BlockAllocator<'t> {
|
||||
pub(crate) fn new(area: &'t mut [MaybeUninit<u8>]) -> Self {
|
||||
// Use all the space for the blocks
|
||||
let padding = area.as_ptr().align_offset(BLOCK_SIZE);
|
||||
let remain = &mut area[padding..];
|
||||
|
||||
let num_blocks = (remain.len() / BLOCK_SIZE) as u64;
|
||||
|
||||
BlockAllocator {
|
||||
blocks_ptr: remain,
|
||||
num_blocks,
|
||||
num_initialized: AtomicU64::new(0),
|
||||
freelist_head: spin::Mutex::new(INVALID_BLOCK),
|
||||
}
|
||||
}
|
||||
|
||||
/// safety: you must hold a lock on the pointer to this block, otherwise it might get
|
||||
/// reused for another kind of block
|
||||
fn read_freelist_block(&self, blkno: u64) -> &FreeListBlock {
|
||||
let ptr: *const FreeListBlock = self.get_block_ptr(blkno).cast();
|
||||
unsafe { ptr.as_ref().unwrap() }
|
||||
}
|
||||
|
||||
fn get_block_ptr(&self, blkno: u64) -> *mut u8 {
|
||||
assert!(blkno < self.num_blocks);
|
||||
unsafe {
|
||||
self.blocks_ptr
|
||||
.as_ptr()
|
||||
.byte_offset(blkno as isize * BLOCK_SIZE as isize)
|
||||
}
|
||||
.cast_mut()
|
||||
.cast()
|
||||
}
|
||||
|
||||
#[allow(clippy::mut_from_ref)]
|
||||
pub(crate) fn alloc_block(&self) -> &mut [MaybeUninit<u8>] {
|
||||
// FIXME: handle OOM
|
||||
let blkno = self.alloc_block_internal();
|
||||
if blkno == INVALID_BLOCK {
|
||||
panic!("out of memory");
|
||||
}
|
||||
|
||||
let ptr: *mut MaybeUninit<u8> = self.get_block_ptr(blkno).cast();
|
||||
unsafe { std::slice::from_raw_parts_mut(ptr, BLOCK_SIZE) }
|
||||
}
|
||||
|
||||
fn alloc_block_internal(&self) -> u64 {
|
||||
// check the free list.
|
||||
{
|
||||
let mut freelist_head = self.freelist_head.lock();
|
||||
if *freelist_head != INVALID_BLOCK {
|
||||
let freelist_block = self.read_freelist_block(*freelist_head);
|
||||
|
||||
// acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
|
||||
let mut g = freelist_block.inner.lock();
|
||||
|
||||
if g.num_free_blocks > 0 {
|
||||
g.num_free_blocks -= 1;
|
||||
let result = g.free_blocks[g.num_free_blocks as usize];
|
||||
return result;
|
||||
} else {
|
||||
// consume the freelist block itself
|
||||
let result = *freelist_head;
|
||||
*freelist_head = g.next;
|
||||
// This freelist block is now unlinked and can be repurposed
|
||||
drop(g);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If there are some blocks left that we've never used, pick next such block
|
||||
let mut next_uninitialized = self.num_initialized.load(Ordering::Relaxed);
|
||||
while next_uninitialized < self.num_blocks {
|
||||
match self.num_initialized.compare_exchange(
|
||||
next_uninitialized,
|
||||
next_uninitialized + 1,
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => {
|
||||
return next_uninitialized;
|
||||
}
|
||||
Err(old) => {
|
||||
next_uninitialized = old;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// out of blocks
|
||||
return INVALID_BLOCK;
|
||||
}
|
||||
|
||||
// TODO: this is currently unused. The slab allocator never releases blocks
|
||||
#[allow(dead_code)]
|
||||
pub(crate) fn release_block(&self, block_ptr: *mut u8) {
|
||||
let blockno = unsafe { block_ptr.byte_offset_from(self.blocks_ptr) / BLOCK_SIZE as isize };
|
||||
self.release_block_internal(blockno as u64);
|
||||
}
|
||||
|
||||
fn release_block_internal(&self, blockno: u64) {
|
||||
let mut freelist_head = self.freelist_head.lock();
|
||||
if *freelist_head != INVALID_BLOCK {
|
||||
let freelist_block = self.read_freelist_block(*freelist_head);
|
||||
|
||||
// acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
|
||||
let mut g = freelist_block.inner.lock();
|
||||
|
||||
let num_free_blocks = g.num_free_blocks;
|
||||
if num_free_blocks < g.free_blocks.len() as u64 {
|
||||
g.free_blocks[num_free_blocks as usize] = blockno;
|
||||
g.num_free_blocks += 1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Convert the block into a new freelist block
|
||||
let block_ptr: *mut FreeListBlock = self.get_block_ptr(blockno).cast();
|
||||
let init = FreeListBlock {
|
||||
inner: spin::Mutex::new(FreeListBlockInner {
|
||||
next: *freelist_head,
|
||||
num_free_blocks: 0,
|
||||
free_blocks: [INVALID_BLOCK; 100],
|
||||
}),
|
||||
};
|
||||
unsafe { (*block_ptr) = init };
|
||||
*freelist_head = blockno;
|
||||
}
|
||||
|
||||
// for debugging
|
||||
pub(crate) fn get_statistics(&self) -> BlockAllocatorStats {
|
||||
let mut num_free_blocks = 0;
|
||||
|
||||
let mut _prev_lock = None;
|
||||
let head_lock = self.freelist_head.lock();
|
||||
let mut next_blk = *head_lock;
|
||||
let mut _head_lock = Some(head_lock);
|
||||
while next_blk != INVALID_BLOCK {
|
||||
let freelist_block = self.read_freelist_block(next_blk);
|
||||
let lock = freelist_block.inner.lock();
|
||||
num_free_blocks += lock.num_free_blocks;
|
||||
next_blk = lock.next;
|
||||
_prev_lock = Some(lock); // hold the lock until we've read the next block
|
||||
_head_lock = None;
|
||||
}
|
||||
|
||||
BlockAllocatorStats {
|
||||
num_blocks: self.num_blocks,
|
||||
num_initialized: self.num_initialized.load(Ordering::Relaxed),
|
||||
num_free_blocks,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct BlockAllocatorStats {
|
||||
pub num_blocks: u64,
|
||||
pub num_initialized: u64,
|
||||
pub num_free_blocks: u64,
|
||||
}
|
||||
33
libs/neonart/src/allocator/multislab.rs
Normal file
@@ -0,0 +1,33 @@
|
||||
use std::alloc::Layout;
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
use crate::allocator::block::BlockAllocator;
|
||||
use crate::allocator::slab::SlabDesc;
|
||||
|
||||
pub struct MultiSlabAllocator<'t, const N: usize> {
|
||||
pub(crate) block_allocator: BlockAllocator<'t>,
|
||||
|
||||
pub(crate) slab_descs: [SlabDesc; N],
|
||||
}
|
||||
|
||||
impl<'t, const N: usize> MultiSlabAllocator<'t, N> {
|
||||
pub(crate) fn new(
|
||||
area: &'t mut [MaybeUninit<u8>],
|
||||
layouts: &[Layout; N],
|
||||
) -> MultiSlabAllocator<'t, N> {
|
||||
let block_allocator = BlockAllocator::new(area);
|
||||
MultiSlabAllocator {
|
||||
block_allocator,
|
||||
|
||||
slab_descs: std::array::from_fn(|i| SlabDesc::new(&layouts[i])),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn alloc_slab(&self, slab_idx: usize) -> *mut u8 {
|
||||
self.slab_descs[slab_idx].alloc_chunk(&self.block_allocator)
|
||||
}
|
||||
|
||||
pub(crate) fn dealloc_slab(&self, slab_idx: usize, ptr: *mut u8) {
|
||||
self.slab_descs[slab_idx].dealloc_chunk(ptr, &self.block_allocator)
|
||||
}
|
||||
}
|
||||
432
libs/neonart/src/allocator/slab.rs
Normal file
@@ -0,0 +1,432 @@
|
||||
//! A slab allocator that carves out fixed-size chunks from larger blocks.
|
||||
//!
|
||||
//!
|
||||
|
||||
use std::alloc::Layout;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::ops::Deref;
|
||||
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
|
||||
|
||||
use spin;
|
||||
|
||||
use super::alloc_from_slice;
|
||||
use super::block::BlockAllocator;
|
||||
|
||||
use crate::allocator::block::BLOCK_SIZE;
|
||||
|
||||
pub(crate) struct SlabDesc {
|
||||
pub(crate) layout: Layout,
|
||||
|
||||
block_lists: spin::RwLock<BlockLists>,
|
||||
|
||||
pub(crate) num_blocks: AtomicU64,
|
||||
pub(crate) num_allocated: AtomicU64,
|
||||
}
|
||||
|
||||
// FIXME: Not sure if SlabDesc is really Sync or Send. It probably is when it's empty, but
|
||||
// 'block_lists' contains pointers when it's not empty. In the current use as part of the
|
||||
// the art tree, SlabDescs are only moved during initialization.
|
||||
unsafe impl Sync for SlabDesc {}
|
||||
unsafe impl Send for SlabDesc {}
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
struct BlockLists {
|
||||
full_blocks: BlockList,
|
||||
nonfull_blocks: BlockList,
|
||||
}
|
||||
|
||||
impl BlockLists {
|
||||
// Unlink a node. It must be in either one of the two lists.
|
||||
unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
|
||||
let list = unsafe {
|
||||
if (*elem).next.is_null() {
|
||||
if self.full_blocks.tail == elem {
|
||||
Some(&mut self.full_blocks)
|
||||
} else {
|
||||
Some(&mut self.nonfull_blocks)
|
||||
}
|
||||
} else if (*elem).prev.is_null() {
|
||||
if self.full_blocks.head == elem {
|
||||
Some(&mut self.full_blocks)
|
||||
} else {
|
||||
Some(&mut self.nonfull_blocks)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
unsafe { unlink_slab_block(list, elem) };
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn unlink_slab_block(mut list: Option<&mut BlockList>, elem: *mut SlabBlockHeader) {
|
||||
unsafe {
|
||||
if (*elem).next.is_null() {
|
||||
assert_eq!(list.as_ref().unwrap().tail, elem);
|
||||
list.as_mut().unwrap().tail = (*elem).prev;
|
||||
} else {
|
||||
assert_eq!((*(*elem).next).prev, elem);
|
||||
(*(*elem).next).prev = (*elem).prev;
|
||||
}
|
||||
if (*elem).prev.is_null() {
|
||||
assert_eq!(list.as_ref().unwrap().head, elem);
|
||||
list.as_mut().unwrap().head = (*elem).next;
|
||||
} else {
|
||||
assert_eq!((*(*elem).prev).next, elem);
|
||||
(*(*elem).prev).next = (*elem).next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct BlockList {
|
||||
head: *mut SlabBlockHeader,
|
||||
tail: *mut SlabBlockHeader,
|
||||
}
|
||||
|
||||
impl Default for BlockList {
|
||||
fn default() -> Self {
|
||||
BlockList {
|
||||
head: std::ptr::null_mut(),
|
||||
tail: std::ptr::null_mut(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockList {
|
||||
unsafe fn push_head(&mut self, elem: *mut SlabBlockHeader) {
|
||||
unsafe {
|
||||
if self.is_empty() {
|
||||
self.tail = elem;
|
||||
(*elem).next = std::ptr::null_mut();
|
||||
} else {
|
||||
(*elem).next = self.head;
|
||||
(*self.head).prev = elem;
|
||||
}
|
||||
(*elem).prev = std::ptr::null_mut();
|
||||
self.head = elem;
|
||||
}
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.head.is_null()
|
||||
}
|
||||
|
||||
unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
|
||||
unsafe { unlink_slab_block(Some(self), elem) }
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn dump(&self) {
|
||||
let mut next = self.head;
|
||||
|
||||
while !next.is_null() {
|
||||
let n = unsafe { next.as_ref() }.unwrap();
|
||||
eprintln!(
|
||||
" blk {:?} (free {}/{})",
|
||||
next,
|
||||
n.num_free_chunks.load(Ordering::Relaxed),
|
||||
n.num_chunks
|
||||
);
|
||||
next = n.next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SlabDesc {
|
||||
pub(crate) fn new(layout: &Layout) -> SlabDesc {
|
||||
SlabDesc {
|
||||
layout: *layout,
|
||||
block_lists: spin::RwLock::new(BlockLists::default()),
|
||||
num_allocated: AtomicU64::new(0),
|
||||
num_blocks: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct SlabBlockHeader {
|
||||
free_chunks_head: spin::Mutex<*mut FreeChunk>,
|
||||
num_free_chunks: AtomicU32,
|
||||
num_chunks: u32, // this is really a constant for a given Layout
|
||||
|
||||
// these fields are protected by the lock on the BlockLists
|
||||
prev: *mut SlabBlockHeader,
|
||||
next: *mut SlabBlockHeader,
|
||||
}
|
||||
|
||||
struct FreeChunk {
|
||||
next: *mut FreeChunk,
|
||||
}
|
||||
|
||||
enum ReadOrWriteGuard<'a, T> {
|
||||
Read(spin::RwLockReadGuard<'a, T>),
|
||||
Write(spin::RwLockWriteGuard<'a, T>),
|
||||
}
|
||||
|
||||
impl<'a, T> Deref for ReadOrWriteGuard<'a, T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &<Self as Deref>::Target {
|
||||
match self {
|
||||
ReadOrWriteGuard::Read(g) => g.deref(),
|
||||
ReadOrWriteGuard::Write(g) => g.deref(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SlabDesc {
|
||||
pub fn alloc_chunk(&self, block_allocator: &BlockAllocator) -> *mut u8 {
|
||||
// Are there any free chunks?
|
||||
let mut acquire_write = false;
|
||||
'outer: loop {
|
||||
let mut block_lists_guard = if acquire_write {
|
||||
ReadOrWriteGuard::Write(self.block_lists.write())
|
||||
} else {
|
||||
ReadOrWriteGuard::Read(self.block_lists.read())
|
||||
};
|
||||
'inner: loop {
|
||||
let block_ptr = block_lists_guard.nonfull_blocks.head;
|
||||
if block_ptr.is_null() {
|
||||
break 'outer;
|
||||
}
|
||||
unsafe {
|
||||
let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
|
||||
if !(*free_chunks_head).is_null() {
|
||||
let result = *free_chunks_head;
|
||||
(*free_chunks_head) = (*result).next;
|
||||
let _old = (*block_ptr).num_free_chunks.fetch_sub(1, Ordering::Relaxed);
|
||||
|
||||
self.num_allocated.fetch_add(1, Ordering::Relaxed);
|
||||
return result.cast();
|
||||
}
|
||||
}
|
||||
|
||||
// The block at the head of the list was full. Grab write lock and retry
|
||||
match block_lists_guard {
|
||||
ReadOrWriteGuard::Read(_) => {
|
||||
acquire_write = true;
|
||||
continue 'outer;
|
||||
}
|
||||
ReadOrWriteGuard::Write(ref mut g) => {
|
||||
// move the node to the list of full blocks
|
||||
unsafe {
|
||||
g.nonfull_blocks.unlink(block_ptr);
|
||||
g.full_blocks.push_head(block_ptr);
|
||||
};
|
||||
continue 'inner;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// no free chunks. Allocate a new block (and the chunk from that)
|
||||
let (new_block, new_chunk) = self.alloc_block_and_chunk(block_allocator);
|
||||
self.num_blocks.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// Add the block to the list in the SlabDesc
|
||||
unsafe {
|
||||
let mut block_lists_guard = self.block_lists.write();
|
||||
block_lists_guard.nonfull_blocks.push_head(new_block);
|
||||
}
|
||||
self.num_allocated.fetch_add(1, Ordering::Relaxed);
|
||||
new_chunk
|
||||
}
|
||||
|
||||
pub fn dealloc_chunk(&self, chunk_ptr: *mut u8, _block_allocator: &BlockAllocator) {
|
||||
// Find the block it belongs to. You can find the block from the address. (And knowing the
|
||||
// layout, you could calculate the chunk number too.)
|
||||
let block_ptr: *mut SlabBlockHeader = {
|
||||
let block_addr = (chunk_ptr.addr() / BLOCK_SIZE) * BLOCK_SIZE;
|
||||
chunk_ptr.with_addr(block_addr).cast()
|
||||
};
|
||||
let chunk_ptr: *mut FreeChunk = chunk_ptr.cast();
|
||||
|
||||
// Mark the chunk as free in 'freechunks' list
|
||||
let num_chunks;
|
||||
let num_free_chunks;
|
||||
unsafe {
|
||||
let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
|
||||
(*chunk_ptr).next = *free_chunks_head;
|
||||
*free_chunks_head = chunk_ptr;
|
||||
|
||||
num_free_chunks = (*block_ptr).num_free_chunks.fetch_add(1, Ordering::Relaxed) + 1;
|
||||
num_chunks = (*block_ptr).num_chunks;
|
||||
}
|
||||
|
||||
if num_free_chunks == 1 {
|
||||
// If the block was full previously, add it to the nonfull blocks list. Note that
|
||||
// we're not holding the lock anymore, so it can immediately become full again.
|
||||
// That's harmless, it will be moved back to the full list again when a call
|
||||
// to alloc_chunk() sees it.
|
||||
let mut block_lists = self.block_lists.write();
|
||||
unsafe {
|
||||
block_lists.unlink(block_ptr);
|
||||
block_lists.nonfull_blocks.push_head(block_ptr);
|
||||
};
|
||||
} else if num_free_chunks == num_chunks {
|
||||
// If the block became completely empty, move it to the free list
|
||||
// TODO
|
||||
// FIXME: we're still holding the spinlock. It's not exactly safe to return it to
|
||||
// the free blocks list, is it? Defer it as garbage to wait out concurrent updates?
|
||||
//block_allocator.release_block()
|
||||
}
|
||||
|
||||
// update stats
|
||||
self.num_allocated.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn alloc_block_and_chunk(
|
||||
&self,
|
||||
block_allocator: &BlockAllocator,
|
||||
) -> (*mut SlabBlockHeader, *mut u8) {
|
||||
// fixme: handle OOM
|
||||
let block_slice: &mut [MaybeUninit<u8>] = block_allocator.alloc_block();
|
||||
let (block_header, remain) = alloc_from_slice::<SlabBlockHeader>(block_slice);
|
||||
|
||||
let padding = remain.as_ptr().align_offset(self.layout.align());
|
||||
|
||||
let num_chunks = (remain.len() - padding) / self.layout.size();
|
||||
|
||||
let first_chunk_ptr: *mut FreeChunk = remain[padding..].as_mut_ptr().cast();
|
||||
|
||||
unsafe {
|
||||
let mut chunk_ptr = first_chunk_ptr;
|
||||
for _ in 0..num_chunks - 1 {
|
||||
let next_chunk_ptr = chunk_ptr.byte_add(self.layout.size());
|
||||
(*chunk_ptr).next = next_chunk_ptr;
|
||||
chunk_ptr = next_chunk_ptr;
|
||||
}
|
||||
(*chunk_ptr).next = std::ptr::null_mut();
|
||||
|
||||
let result_chunk = first_chunk_ptr;
|
||||
|
||||
let block_header = block_header.write(SlabBlockHeader {
|
||||
free_chunks_head: spin::Mutex::new((*first_chunk_ptr).next),
|
||||
prev: std::ptr::null_mut(),
|
||||
next: std::ptr::null_mut(),
|
||||
num_chunks: num_chunks as u32,
|
||||
num_free_chunks: AtomicU32::new(num_chunks as u32 - 1),
|
||||
});
|
||||
|
||||
(block_header, result_chunk.cast())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn dump(&self) {
|
||||
eprintln!(
|
||||
"slab dump ({} blocks, {} allocated chunks)",
|
||||
self.num_blocks.load(Ordering::Relaxed),
|
||||
self.num_allocated.load(Ordering::Relaxed)
|
||||
);
|
||||
let lists = self.block_lists.read();
|
||||
|
||||
eprintln!("nonfull blocks:");
|
||||
lists.nonfull_blocks.dump();
|
||||
eprintln!("full blocks:");
|
||||
lists.full_blocks.dump();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use rand::Rng;
|
||||
use rand_distr::Zipf;
|
||||
|
||||
struct TestObject {
|
||||
val: usize,
|
||||
_dummy: [u8; BLOCK_SIZE / 4],
|
||||
}
|
||||
|
||||
struct TestObjectSlab<'a>(SlabDesc, BlockAllocator<'a>);
|
||||
impl<'a> TestObjectSlab<'a> {
|
||||
fn new(block_allocator: BlockAllocator) -> TestObjectSlab {
|
||||
TestObjectSlab(SlabDesc::new(&Layout::new::<TestObject>()), block_allocator)
|
||||
}
|
||||
|
||||
fn alloc(&self, val: usize) -> *mut TestObject {
|
||||
let obj: *mut TestObject = self.0.alloc_chunk(&self.1).cast();
|
||||
unsafe { (*obj).val = val };
|
||||
obj
|
||||
}
|
||||
|
||||
fn dealloc(&self, obj: *mut TestObject) {
|
||||
self.0.dealloc_chunk(obj.cast(), &self.1)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_slab_alloc() {
|
||||
const MEM_SIZE: usize = 100000000;
|
||||
let mut area = Box::new_uninit_slice(MEM_SIZE);
|
||||
let block_allocator = BlockAllocator::new(&mut area);
|
||||
|
||||
let slab = TestObjectSlab::new(block_allocator);
|
||||
|
||||
let mut all: Vec<*mut TestObject> = Vec::new();
|
||||
for i in 0..11 {
|
||||
all.push(slab.alloc(i));
|
||||
}
|
||||
for i in 0..11 {
|
||||
assert!(unsafe { (*all[i]).val == i });
|
||||
}
|
||||
|
||||
let distribution = Zipf::new(10 as f64, 1.1).unwrap();
|
||||
let mut rng = rand::rng();
|
||||
for _ in 0..100000 {
|
||||
slab.0.dump();
|
||||
let idx = (rng.sample(distribution) as usize).into();
|
||||
let ptr: *mut TestObject = all[idx];
|
||||
if !ptr.is_null() {
|
||||
assert_eq!(unsafe { (*ptr).val }, idx);
|
||||
slab.dealloc(ptr);
|
||||
all[idx] = std::ptr::null_mut();
|
||||
} else {
|
||||
all[idx] = slab.alloc(idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn new_test_blk(i: u32) -> *mut SlabBlockHeader {
|
||||
Box::into_raw(Box::new(SlabBlockHeader {
|
||||
free_chunks_head: spin::Mutex::new(std::ptr::null_mut()),
|
||||
num_free_chunks: AtomicU32::new(0),
|
||||
num_chunks: i,
|
||||
prev: std::ptr::null_mut(),
|
||||
next: std::ptr::null_mut(),
|
||||
}))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_linked_list() {
|
||||
// note: these are leaked, but that's OK for tests
|
||||
let a = new_test_blk(0);
|
||||
let b = new_test_blk(1);
|
||||
|
||||
let mut list = BlockList::default();
|
||||
assert!(list.is_empty());
|
||||
|
||||
unsafe {
|
||||
list.push_head(a);
|
||||
assert!(!list.is_empty());
|
||||
list.unlink(a);
|
||||
}
|
||||
assert!(list.is_empty());
|
||||
|
||||
unsafe {
|
||||
list.push_head(b);
|
||||
list.push_head(a);
|
||||
assert_eq!(list.head, a);
|
||||
assert_eq!((*a).next, b);
|
||||
assert_eq!((*b).prev, a);
|
||||
assert_eq!(list.tail, b);
|
||||
|
||||
list.unlink(a);
|
||||
list.unlink(b);
|
||||
assert!(list.is_empty());
|
||||
}
|
||||
}
|
||||
}
|
||||
44
libs/neonart/src/allocator/static.rs
Normal file
@@ -0,0 +1,44 @@
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
pub fn alloc_from_slice<T>(
|
||||
area: &mut [MaybeUninit<u8>],
|
||||
) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
|
||||
let layout = std::alloc::Layout::new::<T>();
|
||||
|
||||
let area_start = area.as_mut_ptr();
|
||||
|
||||
// pad to satisfy alignment requirements
|
||||
let padding = area_start.align_offset(layout.align());
|
||||
if padding + layout.size() > area.len() {
|
||||
panic!("out of memory");
|
||||
}
|
||||
let area = &mut area[padding..];
|
||||
let (result_area, remain) = area.split_at_mut(layout.size());
|
||||
|
||||
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
|
||||
let result = unsafe { result_ptr.as_mut().unwrap() };
|
||||
|
||||
(result, remain)
|
||||
}
|
||||
|
||||
pub fn alloc_array_from_slice<T>(
|
||||
area: &mut [MaybeUninit<u8>],
|
||||
len: usize,
|
||||
) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
|
||||
let layout = std::alloc::Layout::new::<T>();
|
||||
|
||||
let area_start = area.as_mut_ptr();
|
||||
|
||||
// pad to satisfy alignment requirements
|
||||
let padding = area_start.align_offset(layout.align());
|
||||
if padding + layout.size() * len > area.len() {
|
||||
panic!("out of memory");
|
||||
}
|
||||
let area = &mut area[padding..];
|
||||
let (result_area, remain) = area.split_at_mut(layout.size() * len);
|
||||
|
||||
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
|
||||
let result = unsafe { std::slice::from_raw_parts_mut(result_ptr.as_mut().unwrap(), len) };
|
||||
|
||||
(result, remain)
|
||||
}
|
||||
147
libs/neonart/src/epoch.rs
Normal file
@@ -0,0 +1,147 @@
|
||||
//! This is similar to crossbeam_epoch crate, but works in shared memory
|
||||
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
|
||||
|
||||
use crossbeam_utils::CachePadded;
|
||||
use spin;
|
||||
|
||||
const NUM_SLOTS: usize = 1000;
|
||||
|
||||
/// This is the struct that is stored in shmem
|
||||
///
|
||||
/// bit 0: is it pinned or not?
|
||||
/// rest of the bits are the epoch counter.
|
||||
pub struct EpochShared {
|
||||
global_epoch: AtomicU64,
|
||||
participants: [CachePadded<AtomicU64>; NUM_SLOTS],
|
||||
|
||||
broadcast_lock: spin::Mutex<()>,
|
||||
}
|
||||
|
||||
impl EpochShared {
|
||||
pub fn new() -> EpochShared {
|
||||
EpochShared {
|
||||
global_epoch: AtomicU64::new(2),
|
||||
participants: [const { CachePadded::new(AtomicU64::new(2)) }; NUM_SLOTS],
|
||||
broadcast_lock: spin::Mutex::new(()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn register(&self) -> LocalHandle {
|
||||
LocalHandle {
|
||||
global: self,
|
||||
last_slot: AtomicUsize::new(0), // todo: choose more intelligently
|
||||
}
|
||||
}
|
||||
|
||||
fn release_pin(&self, slot: usize, _epoch: u64) {
|
||||
let global_epoch = self.global_epoch.load(Ordering::Relaxed);
|
||||
self.participants[slot].store(global_epoch, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn pin_internal(&self, slot_hint: usize) -> (usize, u64) {
|
||||
// pick a slot
|
||||
let mut slot = slot_hint;
|
||||
let epoch = loop {
|
||||
let old = self.participants[slot].fetch_or(1, Ordering::Relaxed);
|
||||
if old & 1 == 0 {
|
||||
// Got this slot
|
||||
break old;
|
||||
}
|
||||
|
||||
// the slot was busy by another thread / process. try a different slot
|
||||
slot += 1;
|
||||
if slot == NUM_SLOTS {
|
||||
slot = 0;
|
||||
}
|
||||
continue;
|
||||
};
|
||||
(slot, epoch)
|
||||
}
|
||||
|
||||
pub(crate) fn advance(&self) -> u64 {
|
||||
// Advance the global epoch
|
||||
let old_epoch = self.global_epoch.fetch_add(2, Ordering::Relaxed);
|
||||
let new_epoch = old_epoch + 2;
|
||||
|
||||
// Anyone that release their pin after this will update their slot.
|
||||
new_epoch
|
||||
}
|
||||
|
||||
pub(crate) fn broadcast(&self) {
|
||||
let Some(_guard) = self.broadcast_lock.try_lock() else {
|
||||
return;
|
||||
};
|
||||
|
||||
let epoch = self.global_epoch.load(Ordering::Relaxed);
|
||||
let old_epoch = epoch.wrapping_sub(2);
|
||||
|
||||
// Update all free slots.
|
||||
for i in 0..NUM_SLOTS {
|
||||
// TODO: check result, as a sanity check. It should either be the old epoch, or pinned
|
||||
let _ = self.participants[i].compare_exchange(
|
||||
old_epoch,
|
||||
epoch,
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
}
|
||||
|
||||
// FIXME: memory fence here, since we used Relaxed?
|
||||
}
|
||||
|
||||
pub(crate) fn get_oldest(&self) -> u64 {
|
||||
// Read all slots.
|
||||
let now = self.global_epoch.load(Ordering::Relaxed);
|
||||
let mut oldest = now;
|
||||
for i in 0..NUM_SLOTS {
|
||||
let this_epoch = self.participants[i].load(Ordering::Relaxed);
|
||||
let delta = now.wrapping_sub(this_epoch);
|
||||
if delta > u64::MAX / 2 {
|
||||
// this is very recent
|
||||
} else {
|
||||
if delta > now.wrapping_sub(oldest) {
|
||||
oldest = this_epoch;
|
||||
}
|
||||
}
|
||||
}
|
||||
oldest
|
||||
}
|
||||
|
||||
pub(crate) fn get_current(&self) -> u64 {
|
||||
self.global_epoch.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct EpochPin<'e> {
|
||||
slot: usize,
|
||||
pub(crate) epoch: u64,
|
||||
|
||||
handle: &'e LocalHandle<'e>,
|
||||
}
|
||||
|
||||
impl<'e> Drop for EpochPin<'e> {
|
||||
fn drop(&mut self) {
|
||||
self.handle.global.release_pin(self.slot, self.epoch);
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LocalHandle<'g> {
|
||||
global: &'g EpochShared,
|
||||
|
||||
last_slot: AtomicUsize,
|
||||
}
|
||||
|
||||
impl<'g> LocalHandle<'g> {
|
||||
pub fn pin(&self) -> EpochPin {
|
||||
let (slot, epoch) = self
|
||||
.global
|
||||
.pin_internal(self.last_slot.load(Ordering::Relaxed));
|
||||
self.last_slot.store(slot, Ordering::Relaxed);
|
||||
EpochPin {
|
||||
handle: self,
|
||||
epoch,
|
||||
slot,
|
||||
}
|
||||
}
|
||||
}
|
||||
587
libs/neonart/src/lib.rs
Normal file
@@ -0,0 +1,587 @@
|
||||
//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
|
||||
//!
|
||||
//! The data structure is described in these two papers:
|
||||
//!
|
||||
//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
|
||||
//! The adaptive radix tree: ARTful indexing for main-memory databases.
|
||||
//! Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
|
||||
//! https://db.in.tum.de/~leis/papers/ART.pdf
|
||||
//!
|
||||
//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
|
||||
//! The ART of practical synchronization.
|
||||
//! 1-8. 10.1145/2933349.2933352.
|
||||
//! https://db.in.tum.de/~leis/papers/artsync.pdf
|
||||
//!
|
||||
//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
|
||||
//! use.
|
||||
//!
|
||||
//! The papers mention a few different variants. We have made the following choices in this
|
||||
//! implementation:
|
||||
//!
|
||||
//! - All keys have the same length
|
||||
//!
|
||||
//! - Single-value leaves.
|
||||
//!
|
||||
//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
|
||||
//! variable length "prefix", which stores the keys of all the one-way nodes which have been
|
||||
//! removed. However, similar to the "hybrid" approach described in the paper, each node only has
|
||||
//! space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
|
||||
//! create create one-way nodes to store them. (There was no particular reason for this choice,
|
||||
//! the "hybrid" approach described in the paper might be better.)
|
||||
//!
|
||||
//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
|
||||
//! ROWEX, which generally performs better when there is contention, but that is not important
|
||||
//! for use and Optimisic Lock Coupling is simpler to implement.
|
||||
//!
|
||||
//! ## Requirements
|
||||
//!
|
||||
//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
|
||||
//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
|
||||
//! requirements, which is why we had to write our own. Namely:
|
||||
//!
|
||||
//! - The data structure has to live in fixed-sized shared memory segment. That rules out any
|
||||
//! built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust
|
||||
//! feature, which still nightly-only experimental as of this writing).
|
||||
//!
|
||||
//! - The data structure is accessed from multiple processes. Only one process updates the data
|
||||
//! structure, but other processes perform reads. That rules out using built-in Rust locking
|
||||
//! primitives like Mutex and RwLock, and most crates too.
|
||||
//!
|
||||
//! - Within the one process with write-access, multiple threads can perform updates concurrently.
|
||||
//! That rules out using PostgreSQL LWLocks for the locking.
|
||||
//!
|
||||
//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
|
||||
//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
|
||||
//!
|
||||
//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
|
||||
//! locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
|
||||
//! read / write the same page at the same time. (Prefetching can conflict with actual reads,
|
||||
//! however.)
|
||||
//!
|
||||
//! - The keys in the integrated cache are 17 bytes long.
|
||||
//!
|
||||
//! ## Usage
|
||||
//!
|
||||
//! Because this is designed to be used as a Postgres shared memory data structure, initialization
|
||||
//! happens in three stages:
|
||||
//!
|
||||
//! 0. A fixed area of shared memory is allocated at postmaster startup.
|
||||
//!
|
||||
//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
|
||||
//! other process or thread is running. It returns a TreeInitStruct, which is inherited by all
|
||||
//! the processes through fork().
|
||||
//!
|
||||
//! 2. One process may have write-access to the struct, by calling
|
||||
//! [TreeInitStruct::attach_writer]. (That process is the communicator process.)
|
||||
//!
|
||||
//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
|
||||
//!
|
||||
//! "Write access" means that you can insert / update / delete values in the tree.
|
||||
//!
|
||||
//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
|
||||
//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
|
||||
//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
|
||||
//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
|
||||
//! problem, the version check could be passed up to the caller, so that the caller could detect the
|
||||
//! lost updates and retry the operation.
|
||||
//!
|
||||
//! ## Implementation
|
||||
//!
|
||||
//! node_ptr: Provides low-level implementations of the four different node types (eight actually,
|
||||
//! since there is an Internal and Leaf variant of each)
|
||||
//!
|
||||
//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
|
||||
//! node.
|
||||
//!
|
||||
//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
|
||||
//! abstractions on top.
|
||||
//!
|
||||
//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
|
||||
//!
|
||||
//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
|
||||
//! own abstraction for that because we need the data structure to live in a pre-allocated shared
|
||||
//! memory segment).
|
||||
//!
|
||||
//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
|
||||
//! immediately deallocated, but stays around for as long as concurrent readers might still have
|
||||
//! pointers to them. This is enforced by an epoch system. This is similar to
|
||||
//! e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
|
||||
//! communicating over the shared memory segment.
|
||||
//!
|
||||
//! ## See also
|
||||
//!
|
||||
//! There are some existing Rust ART implementations out there, but none of them filled all
|
||||
//! the requirements:
|
||||
//!
|
||||
//! - https://github.com/XiangpengHao/congee
|
||||
//! - https://github.com/declanvk/blart
|
||||
//!
|
||||
//! ## TODO
|
||||
//!
|
||||
//! - Removing values has not been implemented
|
||||
|
||||
mod algorithm;
|
||||
pub mod allocator;
|
||||
mod epoch;
|
||||
|
||||
use algorithm::RootPtr;
|
||||
use algorithm::node_ptr::NodePtr;
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
|
||||
use crate::epoch::EpochPin;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use allocator::ArtAllocator;
|
||||
pub use allocator::ArtMultiSlabAllocator;
|
||||
pub use allocator::OutOfMemoryError;
|
||||
|
||||
/// Fixed-length key type.
|
||||
///
|
||||
pub trait Key: Debug {
|
||||
const KEY_LEN: usize;
|
||||
|
||||
fn as_bytes(&self) -> &[u8];
|
||||
}
|
||||
|
||||
/// Values stored in the tree
|
||||
///
|
||||
/// Values need to be Cloneable, because when a node "grows", the value is copied to a new node and
|
||||
/// the old sticks around until all readers that might see the old value are gone.
|
||||
// fixme obsolete, no longer needs Clone
|
||||
pub trait Value {}
|
||||
|
||||
const MAX_GARBAGE: usize = 1024;
|
||||
|
||||
/// The root of the tree, plus other tree-wide data. This is stored in the shared memory.
|
||||
pub struct Tree<V: Value> {
|
||||
/// For simplicity, so that we never need to grow or shrink the root, the root node is always an
|
||||
/// Internal256 node. Also, it never has a prefix (that's actually a bit wasteful, incurring one
|
||||
/// indirection to every lookup)
|
||||
root: RootPtr<V>,
|
||||
|
||||
writer_attached: AtomicBool,
|
||||
|
||||
epoch: epoch::EpochShared,
|
||||
}
|
||||
|
||||
unsafe impl<V: Value + Sync> Sync for Tree<V> {}
|
||||
unsafe impl<V: Value + Send> Send for Tree<V> {}
|
||||
|
||||
struct GarbageQueue<V>(VecDeque<(NodePtr<V>, u64)>);
|
||||
|
||||
unsafe impl<V: Value + Sync> Sync for GarbageQueue<V> {}
|
||||
unsafe impl<V: Value + Send> Send for GarbageQueue<V> {}
|
||||
|
||||
impl<V> GarbageQueue<V> {
|
||||
fn new() -> GarbageQueue<V> {
|
||||
GarbageQueue(VecDeque::with_capacity(MAX_GARBAGE))
|
||||
}
|
||||
|
||||
fn remember_obsolete_node(&mut self, ptr: NodePtr<V>, epoch: u64) {
|
||||
self.0.push_front((ptr, epoch));
|
||||
}
|
||||
|
||||
fn next_obsolete(&mut self, cutoff_epoch: u64) -> Option<NodePtr<V>> {
|
||||
if let Some(back) = self.0.back() {
|
||||
if back.1 < cutoff_epoch {
|
||||
return Some(self.0.pop_back().unwrap().0);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Struct created at postmaster startup
|
||||
pub struct TreeInitStruct<'t, K: Key, V: Value, A: ArtAllocator<V>> {
|
||||
tree: &'t Tree<V>,
|
||||
|
||||
allocator: &'t A,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
/// The worker process has a reference to this. The write operations are only safe
|
||||
/// from the worker process
|
||||
pub struct TreeWriteAccess<'t, K: Key, V: Value, A: ArtAllocator<V>>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'t Tree<V>,
|
||||
|
||||
pub allocator: &'t A,
|
||||
|
||||
epoch_handle: epoch::LocalHandle<'t>,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
|
||||
/// Obsolete nodes that cannot be recycled until their epoch expires.
|
||||
garbage: spin::Mutex<GarbageQueue<V>>,
|
||||
}
|
||||
|
||||
/// The backends have a reference to this. It cannot be used to modify the tree
|
||||
pub struct TreeReadAccess<'t, K: Key, V: Value>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'t Tree<V>,
|
||||
|
||||
epoch_handle: epoch::LocalHandle<'t>,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
impl<'a, 't: 'a, K: Key, V: Value, A: ArtAllocator<V>> TreeInitStruct<'t, K, V, A> {
|
||||
pub fn new(allocator: &'t A) -> TreeInitStruct<'t, K, V, A> {
|
||||
let tree_ptr = allocator.alloc_tree();
|
||||
let tree_ptr = NonNull::new(tree_ptr).expect("out of memory");
|
||||
let init = Tree {
|
||||
root: algorithm::new_root(allocator).expect("out of memory"),
|
||||
writer_attached: AtomicBool::new(false),
|
||||
epoch: epoch::EpochShared::new(),
|
||||
};
|
||||
unsafe { tree_ptr.write(init) };
|
||||
|
||||
TreeInitStruct {
|
||||
tree: unsafe { tree_ptr.as_ref() },
|
||||
allocator,
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V, A> {
|
||||
let previously_attached = self.tree.writer_attached.swap(true, Ordering::Relaxed);
|
||||
if previously_attached {
|
||||
panic!("writer already attached");
|
||||
}
|
||||
TreeWriteAccess {
|
||||
tree: self.tree,
|
||||
allocator: self.allocator,
|
||||
phantom_key: PhantomData,
|
||||
epoch_handle: self.tree.epoch.register(),
|
||||
garbage: spin::Mutex::new(GarbageQueue::new()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
|
||||
TreeReadAccess {
|
||||
tree: self.tree,
|
||||
phantom_key: PhantomData,
|
||||
epoch_handle: self.tree.epoch.register(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteAccess<'t, K, V, A> {
|
||||
pub fn start_write<'g>(&'t self) -> TreeWriteGuard<'g, K, V, A>
|
||||
where
|
||||
't: 'g,
|
||||
{
|
||||
TreeWriteGuard {
|
||||
tree_writer: self,
|
||||
epoch_pin: self.epoch_handle.pin(),
|
||||
phantom_key: PhantomData,
|
||||
created_garbage: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
|
||||
TreeReadGuard {
|
||||
tree: &self.tree,
|
||||
epoch_pin: self.epoch_handle.pin(),
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, K: Key, V: Value> TreeReadAccess<'t, K, V> {
|
||||
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
|
||||
TreeReadGuard {
|
||||
tree: &self.tree,
|
||||
epoch_pin: self.epoch_handle.pin(),
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeReadGuard<'e, K, V>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'e Tree<V>,
|
||||
|
||||
epoch_pin: EpochPin<'e>,
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
impl<'e, K: Key, V: Value> TreeReadGuard<'e, K, V> {
|
||||
pub fn get(&'e self, key: &K) -> Option<&'e V> {
|
||||
algorithm::search(key, self.tree.root, &self.epoch_pin)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeWriteGuard<'e, K, V, A>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
tree_writer: &'e TreeWriteAccess<'e, K, V, A>,
|
||||
|
||||
epoch_pin: EpochPin<'e>,
|
||||
phantom_key: PhantomData<K>,
|
||||
|
||||
created_garbage: bool,
|
||||
}
|
||||
|
||||
pub enum UpdateAction<V> {
|
||||
Nothing,
|
||||
Insert(V),
|
||||
Remove,
|
||||
}
|
||||
|
||||
impl<'e, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
|
||||
/// Get a value
|
||||
pub fn get(&'e mut self, key: &K) -> Option<&'e V> {
|
||||
algorithm::search(key, self.tree_writer.tree.root, &self.epoch_pin)
|
||||
}
|
||||
|
||||
/// Insert a value
|
||||
pub fn insert(self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
|
||||
let mut success = None;
|
||||
|
||||
self.update_with_fn(key, |existing| {
|
||||
if let Some(_) = existing {
|
||||
success = Some(false);
|
||||
UpdateAction::Nothing
|
||||
} else {
|
||||
success = Some(true);
|
||||
UpdateAction::Insert(value)
|
||||
}
|
||||
})?;
|
||||
Ok(success.expect("value_fn not called"))
|
||||
}
|
||||
|
||||
/// Remove value. Returns true if it existed
|
||||
pub fn remove(self, key: &K) -> bool {
|
||||
let mut result = false;
|
||||
// FIXME: It's not clear if OOM is expected while removing. It seems
|
||||
// not nice, but shrinking a node can OOM. Then again, we could opt
|
||||
// to not shrink a node if we cannot allocate, to live a little longer.
|
||||
self.update_with_fn(key, |existing| match existing {
|
||||
Some(_) => {
|
||||
result = true;
|
||||
UpdateAction::Remove
|
||||
}
|
||||
None => UpdateAction::Nothing,
|
||||
})
|
||||
.expect("out of memory while removing");
|
||||
result
|
||||
}
|
||||
|
||||
/// Try to remove value and return the old value.
|
||||
pub fn remove_and_return(self, key: &K) -> Option<V>
|
||||
where
|
||||
V: Clone,
|
||||
{
|
||||
let mut old = None;
|
||||
self.update_with_fn(key, |existing| {
|
||||
old = existing.cloned();
|
||||
UpdateAction::Remove
|
||||
})
|
||||
.expect("out of memory while removing");
|
||||
old
|
||||
}
|
||||
|
||||
/// Update key using the given function. All the other modifying operations are based on this.
|
||||
///
|
||||
/// The function is passed a reference to the existing value, if any. If the function
|
||||
/// returns None, the value is removed from the tree (or if there was no existing value,
|
||||
/// does nothing). If the function returns Some, the existing value is replaced, of if there
|
||||
/// was no existing value, it is inserted. FIXME: update comment
|
||||
pub fn update_with_fn<F>(mut self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
algorithm::update_fn(key, value_fn, self.tree_writer.tree.root, &mut self)?;
|
||||
|
||||
if self.created_garbage {
|
||||
let _ = self.collect_garbage();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn remember_obsolete_node(&mut self, ptr: NodePtr<V>) {
|
||||
self.tree_writer
|
||||
.garbage
|
||||
.lock()
|
||||
.remember_obsolete_node(ptr, self.epoch_pin.epoch);
|
||||
self.created_garbage = true;
|
||||
}
|
||||
|
||||
// returns number of nodes recycled
|
||||
fn collect_garbage(&self) -> usize {
|
||||
self.tree_writer.tree.epoch.advance();
|
||||
self.tree_writer.tree.epoch.broadcast();
|
||||
|
||||
let cutoff_epoch = self.tree_writer.tree.epoch.get_oldest();
|
||||
|
||||
let mut result = 0;
|
||||
let mut garbage_queue = self.tree_writer.garbage.lock();
|
||||
while let Some(ptr) = garbage_queue.next_obsolete(cutoff_epoch) {
|
||||
ptr.deallocate(self.tree_writer.allocator);
|
||||
result += 1;
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeIterator<K>
|
||||
where
|
||||
K: Key + for<'a> From<&'a [u8]>,
|
||||
{
|
||||
done: bool,
|
||||
pub next_key: Vec<u8>,
|
||||
max_key: Option<Vec<u8>>,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
impl<K> TreeIterator<K>
|
||||
where
|
||||
K: Key + for<'a> From<&'a [u8]>,
|
||||
{
|
||||
pub fn new_wrapping() -> TreeIterator<K> {
|
||||
let mut next_key = Vec::new();
|
||||
next_key.resize(K::KEY_LEN, 0);
|
||||
TreeIterator {
|
||||
done: false,
|
||||
next_key,
|
||||
max_key: None,
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(range: &std::ops::Range<K>) -> TreeIterator<K> {
|
||||
let result = TreeIterator {
|
||||
done: false,
|
||||
next_key: Vec::from(range.start.as_bytes()),
|
||||
max_key: Some(Vec::from(range.end.as_bytes())),
|
||||
phantom_key: PhantomData,
|
||||
};
|
||||
assert_eq!(result.next_key.len(), K::KEY_LEN);
|
||||
assert_eq!(result.max_key.as_ref().unwrap().len(), K::KEY_LEN);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
pub fn next<'g, V>(&mut self, read_guard: &'g TreeReadGuard<'g, K, V>) -> Option<(K, &'g V)>
|
||||
where
|
||||
V: Value,
|
||||
{
|
||||
if self.done {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut wrapped_around = false;
|
||||
loop {
|
||||
assert_eq!(self.next_key.len(), K::KEY_LEN);
|
||||
if let Some((k, v)) = algorithm::iter_next(
|
||||
&mut self.next_key,
|
||||
read_guard.tree.root,
|
||||
&read_guard.epoch_pin,
|
||||
) {
|
||||
assert_eq!(k.len(), K::KEY_LEN);
|
||||
assert_eq!(self.next_key.len(), K::KEY_LEN);
|
||||
|
||||
// Check if we reached the end of the range
|
||||
if let Some(max_key) = &self.max_key {
|
||||
if k.as_slice() >= max_key.as_slice() {
|
||||
self.done = true;
|
||||
break None;
|
||||
}
|
||||
}
|
||||
|
||||
// increment the key
|
||||
self.next_key = k.clone();
|
||||
increment_key(self.next_key.as_mut_slice());
|
||||
let k = k.as_slice().into();
|
||||
|
||||
break Some((k, v));
|
||||
} else {
|
||||
if self.max_key.is_some() {
|
||||
self.done = true;
|
||||
} else {
|
||||
// Start from beginning
|
||||
if !wrapped_around {
|
||||
for i in 0..K::KEY_LEN {
|
||||
self.next_key[i] = 0;
|
||||
}
|
||||
wrapped_around = true;
|
||||
continue;
|
||||
} else {
|
||||
// The tree is completely empty
|
||||
// FIXME: perhaps we should remember the starting point instead.
|
||||
// Currently this will scan some ranges twice.
|
||||
break None;
|
||||
}
|
||||
}
|
||||
break None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn increment_key(key: &mut [u8]) -> bool {
|
||||
for i in (0..key.len()).rev() {
|
||||
let (byte, overflow) = key[i].overflowing_add(1);
|
||||
key[i] = byte;
|
||||
if !overflow {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
// Debugging functions
|
||||
impl<'e, K: Key, V: Value + Debug, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
|
||||
pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
|
||||
algorithm::dump_tree(self.tree_writer.tree.root, &self.epoch_pin, dst)
|
||||
}
|
||||
}
|
||||
impl<'e, K: Key, V: Value + Debug> TreeReadGuard<'e, K, V> {
|
||||
pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
|
||||
algorithm::dump_tree(self.tree.root, &self.epoch_pin, dst)
|
||||
}
|
||||
}
|
||||
impl<'e, K: Key, V: Value> TreeWriteAccess<'e, K, V, ArtMultiSlabAllocator<'e, V>> {
|
||||
pub fn get_statistics(&self) -> ArtTreeStatistics {
|
||||
self.allocator.get_statistics();
|
||||
ArtTreeStatistics {
|
||||
blocks: self.allocator.inner.block_allocator.get_statistics(),
|
||||
slabs: self.allocator.get_statistics(),
|
||||
epoch: self.tree.epoch.get_current(),
|
||||
oldest_epoch: self.tree.epoch.get_oldest(),
|
||||
num_garbage: self.garbage.lock().0.len() as u64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ArtTreeStatistics {
|
||||
pub blocks: allocator::block::BlockAllocatorStats,
|
||||
pub slabs: allocator::ArtMultiSlabStats,
|
||||
|
||||
pub epoch: u64,
|
||||
pub oldest_epoch: u64,
|
||||
pub num_garbage: u64,
|
||||
}
|
||||