Compare commits

..

62 Commits

Author SHA1 Message Date
Konstantin Knizhnik
9ea17556b3 Fix calculation of logical delta layer space 2023-05-19 11:03:29 +03:00
Konstantin Knizhnik
a5db639a2f More precisely calculate logical image size 2023-05-18 17:58:15 +03:00
Konstantin Knizhnik
64aa8e5c6b Fi unit tests 2023-05-18 00:07:44 +03:00
Konstantin Knizhnik
6a1f2c8b71 Use last_record_lsn to collect keyspace in GC 2023-05-17 22:57:09 +03:00
Konstantin Knizhnik
a10ba532dd Reduce write amplification for wanted image layers 2023-05-17 14:19:56 +03:00
Konstantin Knizhnik
f783596825 Add missed assert to keyspace_add_range test 2023-05-16 21:22:21 +03:00
Konstantin Knizhnik
4b2b175db8 Replace env.poostgres with env.endpoints 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
3902868e68 Replace env.postgres with env.endpoint 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
1ffca3eadb Undo merge problem 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
b499ade206 Remove contains method from LayerMap API 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
0cd01e33c3 Add LayerMap::contains method 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
95dd5c71bf Prohibit insertion fo dulicated layers in layer map 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
c2731e17a9 Prevent duplicated layer in LayerMap 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
a7cf926aeb Reduce live time of wanted image layer lock 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
ffdf7df2ea Update pageserver/src/tenant/timeline.rs
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
74ab232afb Update pageserver/src/tenant/timeline.rs
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
2cf02b381c Update pageserver/src/keyspace.rs
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
6d687c198b Fix pythin style warnings 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
755166e275 Simplified version of test_gc_feedback 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
7d4ebf8485 Update pageserver/src/keyspace.rs
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
9b9b125d13 Make clippy happy 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
4d76c2916e Apply black to test_gc_feedback 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
199771371c Make ruff happy 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
43187715d6 Add KeySpaceRandomAccum 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
1e63fc99db Move test_gc_feedback test to performance 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
7b58f82f7b Move test_gc_feedback test to performance 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
2af45505b8 test_runner/performance/test_gc_feedback.py 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
3c5b99b4b9 Rename test_gc_feedback test 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
8b05a87f75 Add test that no redundant image are generatd if them are wanted by GC 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
3bc4a7c1e2 Add test that no redundant image are generatd if them are wanted by GC 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
3d0a51567f Fix KeySpace.add_range 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
7e6dbc32d1 Update pageserver/src/tenant/timeline.rs
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
f12f6b0275 Fix pythin style 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
0fbd85f64b Fix python style violations in test_gc_old_layers.py 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
af75d59b4c Update pageserver/src/tenant/timeline.rs
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
4618739cb3 Update test_runner/regress/test_gc_old_layers.py
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
f838a11514 Update pageserver/src/keyspace.rs
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
f0fe03ea80 Make clippy happy 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
be22be7b24 Make clippy happy 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
451479305e Use KeySpace for passing infirmation about wanted image layers from GC to copaction task 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
5e690307fb Avoid redundant generation of wanted image layers if such layer already exists beyond GC cutoff horizon 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
3e6288d7d8 Update pageserver/src/tenant/timeline.rs
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
2d015a1464 Update pageserver/src/tenant/timeline.rs
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
0785d92577 Add isort happy 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
fcb9bac847 Revert changes in key space partitioning 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
7a3d6531b8 Revert "fix KeySpace initialization in bench_layer_map.rs"
This reverts commit 63b1fcb813ca5f40a2b1328d4cb6e21646fba69f.
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
3275305a30 Revert "Split keyspace in partitions without holes"
This reverts commit 02c0e9082f804ccf201fe1cf07eb167b697ea9a3.
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
0deca452bf Add comments 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
98de2a6d93 Update test_runner/regress/test_gc_old_layers.py
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
88257b91d7 Update test_runner/regress/test_gc_old_layers.py
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
e8066631a6 Update pageserver/src/tenant/timeline.rs
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
e069c409ef Update pageserver/src/tenant/timeline.rs
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
7f81d57d52 Update pageserver/src/tenant/timeline.rs
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
787c4a8bbb Update pageserver/src/tenant/timeline.rs
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
6ec9922184 Make clippy happy 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
9b241f29cd Remove sleep at the end of test_gc_old_layers.py 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
9b418a71ac fix KeySpace initialization in bench_layer_map.rs 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
1bb8ca0806 Split keyspace in partitions without holes 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
a1c8e74fb9 Add test for GC of stairs layers 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
f9999c84d9 Rebase with main 2023-05-16 21:18:20 +03:00
Konstantin Knizhnik
c01c31d045 Add comment exlaining wanted_image_layers 2023-05-16 21:18:19 +03:00
Konstantin Knizhnik
4da24ba34f Pass set of wanted image layers from GC to compaction 2023-05-16 21:18:19 +03:00
172 changed files with 3976 additions and 9546 deletions

View File

@@ -14,4 +14,3 @@ opt-level = 1
[alias]
build_testing = ["build", "--features", "testing"]
neon = ["run", "--bin", "neon_local"]

View File

@@ -57,14 +57,14 @@ runs:
if ! which allure; then
ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
echo "${ALLURE_ZIP_SHA256} ${ALLURE_ZIP}" | sha256sum --check
echo "${ALLURE_ZIP_MD5} ${ALLURE_ZIP}" | md5sum -c
unzip -q ${ALLURE_ZIP}
echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
rm -f ${ALLURE_ZIP}
fi
env:
ALLURE_VERSION: 2.22.1
ALLURE_ZIP_SHA256: fdc7a62d94b14c5e0bf25198ae1feded6b005fdbed864b4d3cb4e5e901720b0b
ALLURE_VERSION: 2.22.0
ALLURE_ZIP_MD5: d5c9f0989b896482536956340a7d5ec9
# Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
- name: Acquire lock
@@ -147,8 +147,6 @@ runs:
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
- name: Release lock
if: always()
shell: bash -euxo pipefail {0}

View File

@@ -36,14 +36,18 @@ inputs:
description: 'Region name for real s3 tests'
required: false
default: ''
real_s3_access_key_id:
description: 'Access key id'
required: false
default: ''
real_s3_secret_access_key:
description: 'Secret access key'
required: false
default: ''
rerun_flaky:
description: 'Whether to rerun flaky tests'
required: false
default: 'false'
pg_version:
description: 'Postgres version to use for tests'
required: false
default: 'v14'
runs:
using: "composite"
@@ -63,12 +67,12 @@ runs:
path: /tmp/neon-previous
prefix: latest
- name: Download compatibility snapshot
- name: Download compatibility snapshot for Postgres 14
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
name: compatibility-snapshot-${{ inputs.build_type }}-pg14
path: /tmp/compatibility_snapshot_pg14
prefix: latest
- name: Checkout
@@ -96,18 +100,19 @@ runs:
COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: ${{ inputs.build_type }}
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
RERUN_FLAKY: ${{ inputs.rerun_flaky }}
PG_VERSION: ${{ inputs.pg_version }}
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report
# and it is needed to distinguish different environments
export PLATFORM=${PLATFORM:-github-actions-selfhosted}
export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
export DEFAULT_PG_VERSION=${PG_VERSION#v}
export DEFAULT_PG_VERSION=${DEFAULT_PG_VERSION:-14}
if [ "${BUILD_TYPE}" = "remote" ]; then
export REMOTE_ENV=1
@@ -187,13 +192,13 @@ runs:
scripts/generate_and_push_perf_report.sh
fi
- name: Upload compatibility snapshot
- name: Upload compatibility snapshot for Postgres 14
if: github.ref_name == 'release'
uses: ./.github/actions/upload
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }}
name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
# Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
path: /tmp/test_output/compatibility_snapshot_pg14/
prefix: latest
- name: Upload test results

View File

@@ -1,6 +1,6 @@
## Problem
## Describe your changes
## Summary of changes
## Issue ticket number and link
## Checklist before requesting a review

View File

@@ -16,12 +16,12 @@ on:
workflow_dispatch: # adds ability to run this manually
inputs:
region_id:
description: 'Project region id. If not set, the default region will be used'
description: 'Use a particular region. If not set the default region will be used'
required: false
default: 'aws-us-east-2'
save_perf_report:
type: boolean
description: 'Publish perf report. If not set, the report will be published only for the main branch'
description: 'Publish perf report or not. If not set, the report is published only for the main branch'
required: false
defaults:
@@ -125,14 +125,13 @@ jobs:
matrix='{
"platform": [
"neon-captest-new",
"neon-captest-reuse",
"neonvm-captest-new"
"neon-captest-reuse"
],
"db_size": [ "10gb" ],
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "platform": "neonvm-captest-new", "db_size": "50gb" }]
"include": [
{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" }
]
}'
if [ "$(date +%A)" = "Saturday" ]; then
@@ -198,7 +197,7 @@ jobs:
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Create Neon Project
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier"]'), matrix.platform)
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
@@ -206,7 +205,6 @@ jobs:
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
- name: Set up Connection String
id: set-up-connstr
@@ -215,7 +213,7 @@ jobs:
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;;
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
neon-captest-new | neon-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
rds-aurora)
@@ -225,7 +223,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}"
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-freetier', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac

View File

@@ -324,8 +324,7 @@ jobs:
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
# Default shared memory is 64mb
options: --init --shm-size=512mb
options: --init
needs: [ build-neon ]
strategy:
fail-fast: false
@@ -346,11 +345,13 @@ jobs:
test_selection: regress
needs_postgres_source: true
run_with_real_s3: true
real_s3_bucket: neon-github-ci-tests
real_s3_region: eu-central-1
real_s3_bucket: ci-tests-s3
real_s3_region: us-west-2
real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
rerun_flaky: true
pg_version: ${{ matrix.pg_version }}
env:
DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
@@ -362,8 +363,7 @@ jobs:
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
# Default shared memory is 64mb
options: --init --shm-size=512mb
options: --init
needs: [ build-neon ]
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
strategy:
@@ -407,7 +407,9 @@ jobs:
uses: ./.github/actions/allure-report-generate
- uses: actions/github-script@v6
if: ${{ !cancelled() }}
if: >
!cancelled() &&
github.event_name == 'pull_request'
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
@@ -417,7 +419,7 @@ jobs:
reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
}
const script = require("./scripts/comment-test-report.js")
const script = require("./scripts/pr-comment-test-report.js")
await script({
github,
context,
@@ -488,48 +490,37 @@ jobs:
- name: Merge coverage data
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
- name: Build coverage report
env:
COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }}
- name: Build and upload coverage report
run: |
scripts/coverage --dir=/tmp/coverage \
report \
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA
scripts/coverage \
--dir=/tmp/coverage report \
--input-objects=/tmp/coverage/binaries.list \
--commit-url=${COMMIT_URL} \
--commit-url=$COMMIT_URL \
--format=github
scripts/coverage --dir=/tmp/coverage \
report \
--input-objects=/tmp/coverage/binaries.list \
--format=lcov
REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA
- name: Upload coverage report
id: upload-coverage-report
env:
BUCKET: neon-github-public-dev
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA}
scripts/git-upload \
--repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \
--message="Add code coverage for $COMMIT_URL" \
copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
- uses: actions/github-script@v6
env:
REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
script: |
const { REPORT_URL, COMMIT_SHA } = process.env
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: `${COMMIT_SHA}`,
state: 'success',
target_url: `${REPORT_URL}`,
context: 'Code coverage report',
})
# Add link to the coverage report to the commit
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"success\",
\"context\": \"neon-coverage\",
\"description\": \"Coverage report is ready\",
\"target_url\": \"$REPORT_URL\"
}"
trigger-e2e-tests:
runs-on: [ self-hosted, gen3, small ]
@@ -664,9 +655,6 @@ jobs:
project: nrdv0s4kcs
push: true
tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
build-args: |
GIT_VERSION=${{ github.sha }}
REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
compute-tools-image:
runs-on: [ self-hosted, gen3, large ]
@@ -715,11 +703,7 @@ jobs:
compute-node-image:
runs-on: [ self-hosted, gen3, large ]
container:
image: gcr.io/kaniko-project/executor:v1.9.2-debug
# Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
# Should be prevented by https://github.com/neondatabase/neon/issues/4281
options: --add-host=download.osgeo.org:140.211.15.30
container: gcr.io/kaniko-project/executor:v1.9.2-debug
needs: [ tag ]
strategy:
fail-fast: false
@@ -781,7 +765,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.8.0
VM_BUILDER_VERSION: v0.4.6
steps:
- name: Checkout
@@ -791,18 +775,21 @@ jobs:
- name: Downloading vm-builder
run: |
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
chmod +x vm-builder
# Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
# it won't have the proper authentication (written at v0.6.0)
- name: Pulling compute-node image
run: |
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Building VM compute-node rootfs
run: |
docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node .
- name: Build vm image
run: |
./vm-builder -enable-file-cache -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
# note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Pushing vm-compute-node image
run: |
@@ -962,7 +949,7 @@ jobs:
promote-compatibility-data:
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ promote-images, tag, regress-tests ]
if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
@@ -973,13 +960,11 @@ jobs:
PREFIX: artifacts/latest
run: |
# Update compatibility snapshot for the release
for pg_version in v14 v15; do
for build_type in debug release; do
OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst
for build_type in debug release; do
OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst
time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
done
time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
done
# Update Neon artifact for the release (reuse already uploaded artifact)

View File

@@ -2,7 +2,7 @@
Howdy! Usual good software engineering practices apply. Write
tests. Write comments. Follow standard Rust coding practices where
possible. Use `cargo fmt` and `cargo clippy` to tidy up formatting.
possible. Use 'cargo fmt' and 'clippy' to tidy up formatting.
There are soft spots in the code, which could use cleanup,
refactoring, additional comments, and so forth. Let's try to raise the

889
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -3,26 +3,12 @@ members = [
"compute_tools",
"control_plane",
"pageserver",
"pageserver/ctl",
"proxy",
"safekeeper",
"storage_broker",
"workspace_hack",
"trace",
"libs/compute_api",
"libs/pageserver_api",
"libs/postgres_ffi",
"libs/safekeeper_api",
"libs/utils",
"libs/consumption_metrics",
"libs/postgres_backend",
"libs/pq_proto",
"libs/tenant_size_model",
"libs/metrics",
"libs/postgres_connection",
"libs/remote_storage",
"libs/tracing-utils",
"libs/postgres_ffi/wal_craft",
"libs/*",
]
[workspace.package]
@@ -35,10 +21,9 @@ anyhow = { version = "1.0", features = ["backtrace"] }
async-stream = "0.3"
async-trait = "0.1"
atty = "0.2.14"
aws-config = { version = "0.55", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.27"
aws-smithy-http = "0.55"
aws-credential-types = "0.55"
aws-config = { version = "0.51.0", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.21.0"
aws-smithy-http = "0.51.0"
aws-types = "0.55"
base64 = "0.13.0"
bincode = "1.3"
@@ -140,11 +125,11 @@ env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
## Other git libraries
@@ -180,7 +165,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
# Changes the MAX_THREADS limit from 4096 to 32768.
# This is a temporary workaround for using tracing from many threads in safekeepers code,

View File

@@ -47,7 +47,8 @@ RUN set -e \
&& mold -run cargo build \
--bin pg_sni_router \
--bin pageserver \
--bin pagectl \
--bin pageserver_binutils \
--bin draw_timeline_dir \
--bin safekeeper \
--bin storage_broker \
--bin proxy \
@@ -72,7 +73,8 @@ RUN set -e \
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin

View File

@@ -415,23 +415,6 @@ RUN apt-get update && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
#########################################################################################
#
# Layer "pg-cron-pg-build"
# compile pg_cron extension
#
#########################################################################################
FROM build-deps AS pg-cron-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O pg_cron.tar.gz && \
echo "6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 pg_cron.tar.gz" | sha256sum --check && \
mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
#########################################################################################
#
# Layer "rust extensions"
@@ -517,22 +500,6 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405
cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
#########################################################################################
#
# Layer "pg-pgx-ulid-build"
# Compile "pgx_ulid" extension
#
#########################################################################################
FROM rust-extensions-build AS pg-pgx-ulid-build
RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
@@ -562,8 +529,6 @@ COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -573,10 +538,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_utils \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/hnsw \
-s install
#########################################################################################
@@ -653,7 +614,6 @@ RUN apt update && \
libxml2 \
libxslt1.1 \
libzstd1 \
libcurl4-openssl-dev \
procps && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

View File

@@ -0,0 +1,70 @@
# Note: this file *mostly* just builds on Dockerfile.compute-node
ARG SRC_IMAGE
ARG VM_INFORMANT_VERSION=v0.1.14
# on libcgroup update, make sure to check bootstrap.sh for changes
ARG LIBCGROUP_VERSION=v2.0.3
# Pull VM informant, to copy from later
FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant
# Build cgroup-tools
#
# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-informant
# requires cgroup v2, so we'll build cgroup-tools ourselves.
FROM debian:bullseye-slim as libcgroup-builder
ARG LIBCGROUP_VERSION
RUN set -exu \
&& apt update \
&& apt install --no-install-recommends -y \
git \
ca-certificates \
automake \
cmake \
make \
gcc \
byacc \
flex \
libtool \
libpam0g-dev \
&& git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
&& INSTALL_DIR="/libcgroup-install" \
&& mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
&& cd libcgroup \
# extracted from bootstrap.sh, with modified flags:
&& (test -d m4 || mkdir m4) \
&& autoreconf -fi \
&& rm -rf autom4te.cache \
&& CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
# actually build the thing...
&& make install
# Combine, starting from non-VM compute node image.
FROM $SRC_IMAGE as base
# Temporarily set user back to root so we can run adduser, set inittab
USER root
RUN adduser vm-informant --disabled-password --no-create-home
RUN set -e \
&& rm -f /etc/inittab \
&& touch /etc/inittab
RUN set -e \
&& echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
&& CONNSTR="dbname=postgres user=cloud_admin sslmode=disable" \
&& ARGS="--auto-restart --cgroup=neon-postgres --pgconnstr=\"$CONNSTR\"" \
&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant $ARGS'" >> /etc/inittab
USER postgres
ADD vm-cgconfig.conf /etc/cgconfig.conf
COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"]

View File

@@ -138,11 +138,6 @@ neon-pg-ext-%: postgres-%
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
+@echo "Compiling hnsw $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install
.PHONY: neon-pg-ext-clean-%
neon-pg-ext-clean-%:
@@ -158,9 +153,6 @@ neon-pg-ext-clean-%:
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean
.PHONY: neon-pg-ext
neon-pg-ext: \

View File

@@ -17,7 +17,7 @@ The Neon storage engine consists of two major components:
- Pageserver. Scalable storage backend for the compute nodes.
- Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.
See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information.
See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more information.
## Running local installation
@@ -28,19 +28,18 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
* On Ubuntu or Debian, this set of packages should be sufficient to build the code:
```bash
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
libcurl4-openssl-dev
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
```
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel libcurl-devel
protobuf-devel
```
* On Arch based systems, these packages are needed:
```bash
pacman -S base-devel readline zlib libseccomp openssl clang \
postgresql-libs cmake postgresql protobuf curl
postgresql-libs cmake postgresql protobuf
```
Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
@@ -131,11 +130,11 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
```sh
# Create repository in .neon with proper paths to binaries and data
# Later that would be responsibility of a package install script
> cargo neon init
> ./target/debug/neon_local init
Starting pageserver at '127.0.0.1:64000' in '.neon'.
# start pageserver, safekeeper, and broker for their intercommunication
> cargo neon start
> ./target/debug/neon_local start
Starting neon broker at 127.0.0.1:50051
storage_broker started, pid: 2918372
Starting pageserver at '127.0.0.1:64000' in '.neon'.
@@ -144,19 +143,19 @@ Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
safekeeper 1 started, pid: 2918437
# create initial tenant and use it as a default for every future neon_local invocation
> cargo neon tenant create --set-default
> ./target/debug/neon_local tenant create --set-default
tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
# start postgres compute node
> cargo neon endpoint start main
> ./target/debug/neon_local endpoint start main
Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'
# check list of running postgres instances
> cargo neon endpoint list
> ./target/debug/neon_local endpoint list
ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running
```
@@ -178,22 +177,22 @@ postgres=# select * from t;
3. And create branches and run postgres on them:
```sh
# create branch named migration_check
> cargo neon timeline branch --branch-name migration_check
> ./target/debug/neon_local timeline branch --branch-name migration_check
Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main'
# check branches tree
> cargo neon timeline list
> ./target/debug/neon_local timeline list
(L) main [de200bd42b49cc1814412c7e592dd6e9]
(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]
# start postgres on that branch
> cargo neon endpoint start migration_check --branch-name migration_check
> ./target/debug/neon_local endpoint start migration_check --branch-name migration_check
Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'
# check the new list of running postgres instances
> cargo neon endpoint list
> ./target/debug/neon_local endpoint list
ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running
migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running
@@ -222,7 +221,7 @@ postgres=# select * from t;
4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances
you have just started. You can terminate them all with one command:
```sh
> cargo neon stop
> ./target/debug/neon_local stop
```
## Running tests
@@ -239,9 +238,9 @@ CARGO_BUILD_FLAGS="--features=testing" make
## Documentation
[docs](/docs) Contains a top-level overview of all available markdown documentation.
[/docs/](/docs/) Contains a top-level overview of all available markdown documentation.
- [sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout.
- [/docs/sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout.
To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open`
@@ -266,6 +265,6 @@ To get more familiar with this aspect, refer to:
## Join the development
- Read [CONTRIBUTING.md](/CONTRIBUTING.md) to learn about project code style and practices.
- To get familiar with a source tree layout, use [sourcetree.md](/docs/sourcetree.md).
- Read `CONTRIBUTING.md` to learn about project code style and practices.
- To get familiar with a source tree layout, use [/docs/sourcetree.md](/docs/sourcetree.md).
- To learn more about PostgreSQL internals, check http://www.interdb.jp/pg/index.html

View File

@@ -59,9 +59,6 @@ fn main() -> Result<()> {
let matches = cli().get_matches();
let http_port = *matches
.get_one::<u16>("http-port")
.expect("http-port is required");
let pgdata = matches
.get_one::<String>("pgdata")
.expect("PGDATA path is required");
@@ -181,8 +178,7 @@ fn main() -> Result<()> {
// Launch http service first, so we were able to serve control-plane
// requests, while configuration is still in progress.
let _http_handle =
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
if !spec_set {
// No spec provided, hang waiting for it.
@@ -290,14 +286,6 @@ fn cli() -> clap::Command {
let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
clap::Command::new("compute_ctl")
.version(version)
.arg(
Arg::new("http-port")
.long("http-port")
.value_name("HTTP_PORT")
.default_value("3080")
.value_parser(clap::value_parser!(u16))
.required(false),
)
.arg(
Arg::new("connstr")
.short('C')

View File

@@ -1,3 +1,19 @@
//
// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`,
// but there are several things that makes `PostgresNode` usage inconvenient in the
// cloud:
// - it inherits from `LocalEnv`, which contains **all-all** the information about
// a complete service running
// - it uses `PageServerNode` with information about http endpoint, which we do not
// need in the cloud again
// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud
//
// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required
// attributes (not required for the cloud). Yet, it is still tempting to unify these
// `PostgresNode` and `ComputeNode` and use one in both places.
//
// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`.
//
use std::fs;
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
@@ -90,38 +106,26 @@ pub struct ParsedSpec {
impl TryFrom<ComputeSpec> for ParsedSpec {
type Error = String;
fn try_from(spec: ComputeSpec) -> Result<Self, String> {
// Extract the options from the spec file that are needed to connect to
// the storage system.
//
// For backwards-compatibility, the top-level fields in the spec file
// may be empty. In that case, we need to dig them from the GUCs in the
// cluster.settings field.
let pageserver_connstr = spec
.pageserver_connstring
.clone()
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
.cluster
.settings
.find("neon.pageserver_connstring")
.ok_or("pageserver connstr should be provided")?;
let storage_auth_token = spec.storage_auth_token.clone();
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
tenant_id
} else {
spec.cluster
.settings
.find("neon.tenant_id")
.ok_or("tenant id should be provided")
.map(|s| TenantId::from_str(&s))?
.or(Err("invalid tenant id"))?
};
let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id {
timeline_id
} else {
spec.cluster
.settings
.find("neon.timeline_id")
.ok_or("timeline id should be provided")
.map(|s| TimelineId::from_str(&s))?
.or(Err("invalid timeline id"))?
};
let tenant_id: TenantId = spec
.cluster
.settings
.find("neon.tenant_id")
.ok_or("tenant id should be provided")
.map(|s| TenantId::from_str(&s))?
.or(Err("invalid tenant id"))?;
let timeline_id: TimelineId = spec
.cluster
.settings
.find("neon.timeline_id")
.ok_or("timeline id should be provided")
.map(|s| TimelineId::from_str(&s))?
.or(Err("invalid timeline id"))?;
Ok(ParsedSpec {
spec,
@@ -291,8 +295,8 @@ impl ComputeNode {
update_pg_hba(pgdata_path)?;
match spec.mode {
ComputeMode::Primary => {}
ComputeMode::Replica | ComputeMode::Static(..) => {
ComputeMode::Primary | ComputeMode::Static(..) => {}
ComputeMode::Replica => {
add_standby_signal(pgdata_path)?;
}
}
@@ -358,8 +362,6 @@ impl ComputeNode {
};
// Proceed with post-startup configuration. Note, that order of operations is important.
// Disable DDL forwarding because control plane already knows about these roles/databases.
client.simple_query("SET neon.forward_ddl = false")?;
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
handle_roles(spec, &mut client)?;
handle_databases(spec, &mut client)?;
@@ -372,7 +374,7 @@ impl ComputeNode {
info!(
"finished configuration of compute for project {}",
spec.cluster.cluster_id.as_deref().unwrap_or("None")
spec.cluster.cluster_id
);
Ok(())
@@ -401,9 +403,7 @@ impl ComputeNode {
self.pg_reload_conf(&mut client)?;
// Proceed with post-startup configuration. Note, that order of operations is important.
// Disable DDL forwarding because control plane already knows about these roles/databases.
if spec.mode == ComputeMode::Primary {
client.simple_query("SET neon.forward_ddl = false")?;
handle_roles(&spec, &mut client)?;
handle_databases(&spec, &mut client)?;
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
@@ -430,7 +430,7 @@ impl ComputeNode {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
info!(
"starting compute for project {}, operation {}, tenant {}, timeline {}",
spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
spec.spec.cluster.cluster_id,
spec.spec.operation_uuid.as_deref().unwrap_or("None"),
spec.tenant_id,
spec.timeline_id,

View File

@@ -5,7 +5,6 @@ use std::path::Path;
use anyhow::Result;
use crate::pg_helpers::escape_conf_value;
use crate::pg_helpers::PgOptionsSerialize;
use compute_api::spec::{ComputeMode, ComputeSpec};
@@ -37,44 +36,10 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
// File::create() destroys the file content if it exists.
let mut file = File::create(path)?;
// Write the postgresql.conf content from the spec file as is.
if let Some(conf) = &spec.cluster.postgresql_conf {
writeln!(file, "{}", conf)?;
}
writeln!(file, "# Managed by compute_ctl: begin")?;
write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;
// Add options for connecting to storage
writeln!(file, "# Neon storage settings")?;
if let Some(s) = &spec.pageserver_connstring {
writeln!(
file,
"neon.pageserver_connstring='{}'",
escape_conf_value(s)
)?;
}
if !spec.safekeeper_connstrings.is_empty() {
writeln!(
file,
"neon.safekeepers='{}'",
escape_conf_value(&spec.safekeeper_connstrings.join(","))
)?;
}
if let Some(s) = &spec.tenant_id {
writeln!(
file,
"neon.tenant_id='{}'",
escape_conf_value(&s.to_string())
)?;
}
if let Some(s) = &spec.timeline_id {
writeln!(
file,
"neon.timeline_id='{}'",
escape_conf_value(&s.to_string())
)?;
}
match spec.mode {
ComputeMode::Primary => {}
ComputeMode::Static(lsn) => {
@@ -88,12 +53,7 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
}
}
// If there are any extra options in the 'settings' field, append those
if spec.cluster.settings.is_some() {
writeln!(file, "# Managed by compute_ctl: begin")?;
write!(file, "{}", spec.cluster.settings.as_pg_settings())?;
writeln!(file, "# Managed by compute_ctl: end")?;
}
writeln!(file, "# Managed by compute_ctl: end")?;
Ok(())
}

View File

@@ -220,8 +220,8 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
#[tokio::main]
async fn serve(port: u16, state: Arc<ComputeNode>) {
let addr = SocketAddr::from(([0, 0, 0, 0], port));
async fn serve(state: Arc<ComputeNode>) {
let addr = SocketAddr::from(([0, 0, 0, 0], 3080));
let make_service = make_service_fn(move |_conn| {
let state = state.clone();
@@ -256,10 +256,10 @@ async fn serve(port: u16, state: Arc<ComputeNode>) {
}
/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
let state = Arc::clone(state);
Ok(thread::Builder::new()
.name("http-endpoint".into())
.spawn(move || serve(port, state))?)
.spawn(move || serve(state))?)
}

View File

@@ -33,7 +33,5 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
.init();
tracing::info!("logging and tracing started");
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
Ok(())
}

View File

@@ -23,7 +23,7 @@ fn escape_literal(s: &str) -> String {
/// Escape a string so that it can be used in postgresql.conf.
/// Same as escape_literal, currently.
pub fn escape_conf_value(s: &str) -> String {
fn escape_conf_value(s: &str) -> String {
s.replace('\'', "''").replace('\\', "\\\\")
}
@@ -121,8 +121,9 @@ impl RoleExt for Role {
/// string of arguments.
fn to_pg_options(&self) -> String {
// XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane.
let mut params: String = self.options.as_pg_options();
params.push_str(" LOGIN");
// For now, we do not use generic `options` for roles. Once used, add
// `self.options.as_pg_options()` somewhere here.
let mut params: String = "LOGIN".to_string();
if let Some(pass) = &self.encrypted_password {
// Some time ago we supported only md5 and treated all encrypted_password as md5.

View File

@@ -62,7 +62,7 @@ fn do_control_plane_request(
}
}
/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
/// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT`
/// env variable is set, it will be used for authorization.
pub fn get_spec_from_control_plane(
base_uri: &str,

View File

@@ -16,7 +16,7 @@ mod pg_helpers_tests {
);
assert_eq!(
spec.cluster.roles.first().unwrap().to_pg_options(),
" LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'"
"LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'"
);
}

View File

@@ -41,7 +41,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
const DEFAULT_BRANCH_NAME: &str = "main";
project_git_version!(GIT_VERSION);
const DEFAULT_PG_VERSION: &str = "15";
const DEFAULT_PG_VERSION: &str = "14";
fn default_conf() -> String {
format!(
@@ -476,11 +476,10 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
println!("Creating endpoint for imported timeline ...");
cplane.new_endpoint(
name,
tenant_id,
name,
timeline_id,
None,
None,
pg_version,
ComputeMode::Primary,
)?;
@@ -592,7 +591,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
table.add_row([
endpoint_id.as_str(),
&endpoint.pg_address.to_string(),
&endpoint.address.to_string(),
&endpoint.timeline_id.to_string(),
branch_name,
lsn_str.as_str(),
@@ -621,8 +620,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
.get_branch_timeline_id(branch_name, tenant_id)
.ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?;
let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
let pg_version = sub_args
.get_one::<u32>("pg-version")
.copied()
@@ -640,38 +639,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
(Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
};
cplane.new_endpoint(
&endpoint_id,
tenant_id,
timeline_id,
pg_port,
http_port,
pg_version,
mode,
)?;
cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?;
}
"start" => {
let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
let endpoint_id = sub_args
.get_one::<String>("endpoint_id")
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
// If --safekeepers argument is given, use only the listed safekeeper nodes.
let safekeepers =
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
let mut safekeepers: Vec<NodeId> = Vec::new();
for sk_id in safekeepers_str.split(',').map(str::trim) {
let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| {
anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list")
})?);
safekeepers.push(sk_id);
}
safekeepers
} else {
env.safekeepers.iter().map(|sk| sk.id).collect()
};
let endpoint = cplane.endpoints.get(endpoint_id.as_str());
let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
@@ -698,7 +673,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
_ => {}
}
println!("Starting existing endpoint {endpoint_id}...");
endpoint.start(&auth_token, safekeepers)?;
endpoint.start(&auth_token)?;
} else {
let branch_name = sub_args
.get_one::<String>("branch-name")
@@ -734,15 +709,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
let ep = cplane.new_endpoint(
endpoint_id,
tenant_id,
endpoint_id,
timeline_id,
pg_port,
http_port,
port,
pg_version,
mode,
)?;
ep.start(&auth_token, safekeepers)?;
ep.start(&auth_token)?;
}
}
"stop" => {
@@ -970,22 +944,11 @@ fn cli() -> Command {
.value_parser(value_parser!(u32))
.default_value(DEFAULT_PG_VERSION);
let pg_port_arg = Arg::new("pg-port")
.long("pg-port")
let port_arg = Arg::new("port")
.long("port")
.required(false)
.value_parser(value_parser!(u16))
.value_name("pg-port");
let http_port_arg = Arg::new("http-port")
.long("http-port")
.required(false)
.value_parser(value_parser!(u16))
.value_name("http-port");
let safekeepers_arg = Arg::new("safekeepers")
.long("safekeepers")
.required(false)
.value_name("safekeepers");
.value_name("port");
let stop_mode_arg = Arg::new("stop-mode")
.short('m')
@@ -1130,8 +1093,7 @@ fn cli() -> Command {
.arg(branch_name_arg.clone())
.arg(tenant_id_arg.clone())
.arg(lsn_arg.clone())
.arg(pg_port_arg.clone())
.arg(http_port_arg.clone())
.arg(port_arg.clone())
.arg(
Arg::new("config-only")
.help("Don't do basebackup, create endpoint directory with only config files")
@@ -1147,11 +1109,9 @@ fn cli() -> Command {
.arg(branch_name_arg)
.arg(timeline_id_arg)
.arg(lsn_arg)
.arg(pg_port_arg)
.arg(http_port_arg)
.arg(port_arg)
.arg(pg_version_arg)
.arg(hot_standby_arg)
.arg(safekeepers_arg)
)
.subcommand(
Command::new("stop")

View File

@@ -1,9 +1,3 @@
//! Code to manage the storage broker
//!
//! In the local test environment, the data for each safekeeper is stored in
//!
//! .neon/safekeepers/<safekeeper id>
//!
use anyhow::Context;
use std::path::PathBuf;

View File

@@ -1,71 +1,40 @@
//! Code to manage compute endpoints
//!
//! In the local test environment, the data for each endpoint is stored in
//!
//! .neon/endpoints/<endpoint id>
//!
//! Some basic information about the endpoint, like the tenant and timeline IDs,
//! are stored in the `endpoint.json` file. The `endpoint.json` file is created
//! when the endpoint is created, and doesn't change afterwards.
//!
//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
//! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
//! the basebackup from the pageserver to initialize the the data directory, and
//! finally launches the PostgreSQL process. It watches the PostgreSQL process
//! until it exits.
//!
//! When an endpoint is created, a `postgresql.conf` file is also created in
//! the endpoint's directory. The file can be modified before starting PostgreSQL.
//! However, the `postgresql.conf` file in the endpoint directory is not used directly
//! by PostgreSQL. It is passed to `compute_ctl`, and `compute_ctl` writes another
//! copy of it in the data directory.
//!
//! Directory contents:
//!
//! ```ignore
//! .neon/endpoints/main/
//! compute.log - log output of `compute_ctl` and `postgres`
//! endpoint.json - serialized `EndpointConf` struct
//! postgresql.conf - postgresql settings
//! spec.json - passed to `compute_ctl`
//! pgdata/
//! postgresql.conf - copy of postgresql.conf created by `compute_ctl`
//! zenith.signal
//! <other PostgreSQL files>
//! ```
//!
use std::collections::BTreeMap;
use std::fs::{self, File};
use std::io::Write;
use std::net::SocketAddr;
use std::net::TcpStream;
use std::os::unix::fs::PermissionsExt;
use std::path::PathBuf;
use std::process::Command;
use std::process::{Command, Stdio};
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result};
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::local_env::LocalEnv;
use crate::pageserver::PageServerNode;
use crate::postgresql_conf::PostgresConf;
use compute_api::responses::{ComputeState, ComputeStatus};
use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
use compute_api::spec::ComputeMode;
// contents of a endpoint.json file
#[serde_as]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct EndpointConf {
endpoint_id: String,
name: String,
#[serde_as(as = "DisplayFromStr")]
tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
timeline_id: TimelineId,
mode: ComputeMode,
pg_port: u16,
http_port: u16,
port: u16,
pg_version: u32,
}
@@ -88,11 +57,11 @@ impl ComputeControlPlane {
let pageserver = Arc::new(PageServerNode::from_env(&env));
let mut endpoints = BTreeMap::default();
for endpoint_dir in std::fs::read_dir(env.endpoints_path())
for endpoint_dir in fs::read_dir(env.endpoints_path())
.with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
{
let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
endpoints.insert(ep.name.clone(), Arc::new(ep));
}
Ok(ComputeControlPlane {
@@ -107,28 +76,25 @@ impl ComputeControlPlane {
1 + self
.endpoints
.values()
.map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port()))
.map(|ep| ep.address.port())
.max()
.unwrap_or(self.base_port)
}
#[allow(clippy::too_many_arguments)]
pub fn new_endpoint(
&mut self,
endpoint_id: &str,
tenant_id: TenantId,
name: &str,
timeline_id: TimelineId,
pg_port: Option<u16>,
http_port: Option<u16>,
port: Option<u16>,
pg_version: u32,
mode: ComputeMode,
) -> Result<Arc<Endpoint>> {
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
let port = port.unwrap_or_else(|| self.get_port());
let ep = Arc::new(Endpoint {
endpoint_id: endpoint_id.to_owned(),
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
name: name.to_owned(),
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
env: self.env.clone(),
pageserver: Arc::clone(&self.pageserver),
timeline_id,
@@ -136,27 +102,21 @@ impl ComputeControlPlane {
tenant_id,
pg_version,
});
ep.create_endpoint_dir()?;
ep.create_pgdata()?;
std::fs::write(
ep.endpoint_path().join("endpoint.json"),
serde_json::to_string_pretty(&EndpointConf {
endpoint_id: endpoint_id.to_string(),
name: name.to_string(),
tenant_id,
timeline_id,
mode,
http_port,
pg_port,
port,
pg_version,
})?,
)?;
std::fs::write(
ep.endpoint_path().join("postgresql.conf"),
ep.setup_pg_conf()?.to_string(),
)?;
ep.setup_pg_conf()?;
self.endpoints
.insert(ep.endpoint_id.clone(), Arc::clone(&ep));
self.endpoints.insert(ep.name.clone(), Arc::clone(&ep));
Ok(ep)
}
@@ -167,16 +127,13 @@ impl ComputeControlPlane {
#[derive(Debug)]
pub struct Endpoint {
/// used as the directory name
endpoint_id: String,
name: String,
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub mode: ComputeMode,
// port and address of the Postgres server and `compute_ctl`'s HTTP API
pub pg_address: SocketAddr,
pub http_address: SocketAddr,
// postgres major version in the format: 14, 15, etc.
// port and address of the Postgres server
pub address: SocketAddr,
pg_version: u32,
// These are not part of the endpoint as such, but the environment
@@ -200,16 +157,16 @@ impl Endpoint {
// parse data directory name
let fname = entry.file_name();
let endpoint_id = fname.to_str().unwrap().to_string();
let name = fname.to_str().unwrap().to_string();
// Read the endpoint.json file
let conf: EndpointConf =
serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
// ok now
Ok(Endpoint {
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
endpoint_id,
address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port),
name,
env: env.clone(),
pageserver: Arc::clone(pageserver),
timeline_id: conf.timeline_id,
@@ -219,17 +176,104 @@ impl Endpoint {
})
}
fn create_endpoint_dir(&self) -> Result<()> {
std::fs::create_dir_all(self.endpoint_path()).with_context(|| {
format!(
"could not create endpoint directory {}",
self.endpoint_path().display()
fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
let mut cmd = Command::new(pg_path);
cmd.arg("--sync-safekeepers")
.env_clear()
.env(
"LD_LIBRARY_PATH",
self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
)
})
.env(
"DYLD_LIBRARY_PATH",
self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
)
.env("PGDATA", self.pgdata().to_str().unwrap())
.stdout(Stdio::piped())
// Comment this to avoid capturing stderr (useful if command hangs)
.stderr(Stdio::piped());
if let Some(token) = auth_token {
cmd.env("NEON_AUTH_TOKEN", token);
}
let sync_handle = cmd
.spawn()
.expect("postgres --sync-safekeepers failed to start");
let sync_output = sync_handle
.wait_with_output()
.expect("postgres --sync-safekeepers failed");
if !sync_output.status.success() {
anyhow::bail!(
"sync-safekeepers failed: '{}'",
String::from_utf8_lossy(&sync_output.stderr)
);
}
let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
println!("Safekeepers synced on {}", lsn);
Ok(lsn)
}
// Generate postgresql.conf with default configuration
fn setup_pg_conf(&self) -> Result<PostgresConf> {
/// Get basebackup from the pageserver as a tar archive and extract it
/// to the `self.pgdata()` directory.
fn do_basebackup(&self, lsn: Option<Lsn>) -> Result<()> {
println!(
"Extracting base backup to create postgres instance: path={} port={}",
self.pgdata().display(),
self.address.port()
);
let sql = if let Some(lsn) = lsn {
format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn)
} else {
format!("basebackup {} {}", self.tenant_id, self.timeline_id)
};
let mut client = self
.pageserver
.page_server_psql_client()
.context("connecting to page server failed")?;
let copyreader = client
.copy_out(sql.as_str())
.context("page server 'basebackup' command failed")?;
// Read the archive directly from the `CopyOutReader`
//
// Set `ignore_zeros` so that unpack() reads all the Copy data and
// doesn't stop at the end-of-archive marker. Otherwise, if the server
// sends an Error after finishing the tarball, we will not notice it.
let mut ar = tar::Archive::new(copyreader);
ar.set_ignore_zeros(true);
ar.unpack(&self.pgdata())
.context("extracting base backup failed")?;
Ok(())
}
fn create_pgdata(&self) -> Result<()> {
fs::create_dir_all(self.pgdata()).with_context(|| {
format!(
"could not create data directory {}",
self.pgdata().display()
)
})?;
fs::set_permissions(self.pgdata().as_path(), fs::Permissions::from_mode(0o700))
.with_context(|| {
format!(
"could not set permissions in data directory {}",
self.pgdata().display()
)
})
}
// Write postgresql.conf with default configuration
// and PG_VERSION file to the data directory of a new endpoint.
fn setup_pg_conf(&self) -> Result<()> {
let mut conf = PostgresConf::new();
conf.append("max_wal_senders", "10");
conf.append("wal_log_hints", "off");
@@ -242,14 +286,25 @@ impl Endpoint {
// wal_sender_timeout is the maximum time to wait for WAL replication.
// It also defines how often the walreciever will send a feedback message to the wal sender.
conf.append("wal_sender_timeout", "5s");
conf.append("listen_addresses", &self.pg_address.ip().to_string());
conf.append("port", &self.pg_address.port().to_string());
conf.append("listen_addresses", &self.address.ip().to_string());
conf.append("port", &self.address.port().to_string());
conf.append("wal_keep_size", "0");
// walproposer panics when basebackup is invalid, it is pointless to restart in this case.
conf.append("restart_after_crash", "off");
// Load the 'neon' extension
// Configure the Neon Postgres extension to fetch pages from pageserver
let pageserver_connstr = {
let config = &self.pageserver.pg_connection_config;
let (host, port) = (config.host(), config.port());
// NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
format!("postgresql://no_user@{host}:{port}")
};
conf.append("shared_preload_libraries", "neon");
conf.append_line("");
conf.append("neon.pageserver_connstring", &pageserver_connstr);
conf.append("neon.tenant_id", &self.tenant_id.to_string());
conf.append("neon.timeline_id", &self.timeline_id.to_string());
conf.append_line("");
// Replication-related configurations, such as WAL sending
@@ -326,19 +381,49 @@ impl Endpoint {
conf.append("primary_conninfo", connstr.as_str());
conf.append("primary_slot_name", slot_name.as_str());
conf.append("hot_standby", "on");
// prefetching of blocks referenced in WAL doesn't make sense for us
// Neon hot standby ignores pages that are not in the shared_buffers
if self.pg_version >= 15 {
conf.append("recovery_prefetch", "off");
}
}
}
Ok(conf)
let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
file.write_all(conf.to_string().as_bytes())?;
let mut file = File::create(self.pgdata().join("PG_VERSION"))?;
file.write_all(self.pg_version.to_string().as_bytes())?;
Ok(())
}
fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
let backup_lsn = match &self.mode {
ComputeMode::Primary => {
if !self.env.safekeepers.is_empty() {
// LSN 0 means that it is bootstrap and we need to download just
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
// procedure evolves quite actively right now, so let's think about it again
// when things would be more stable (TODO).
let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
if lsn == Lsn(0) {
None
} else {
Some(lsn)
}
} else {
None
}
}
ComputeMode::Static(lsn) => Some(*lsn),
ComputeMode::Replica => {
None // Take the latest snapshot available to start with
}
};
self.do_basebackup(backup_lsn)?;
Ok(())
}
pub fn endpoint_path(&self) -> PathBuf {
self.env.endpoints_path().join(&self.endpoint_id)
self.env.endpoints_path().join(&self.name)
}
pub fn pgdata(&self) -> PathBuf {
@@ -348,7 +433,7 @@ impl Endpoint {
pub fn status(&self) -> &str {
let timeout = Duration::from_millis(300);
let has_pidfile = self.pgdata().join("postmaster.pid").exists();
let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok();
let can_connect = TcpStream::connect_timeout(&self.address, timeout).is_ok();
match (has_pidfile, can_connect) {
(true, true) => "running",
@@ -366,6 +451,8 @@ impl Endpoint {
&[
"-D",
self.pgdata().to_str().unwrap(),
"-l",
self.pgdata().join("pg.log").to_str().unwrap(),
"-w", //wait till pg_ctl actually does what was asked
],
args,
@@ -401,183 +488,36 @@ impl Endpoint {
Ok(())
}
pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
pub fn start(&self, auth_token: &Option<String>) -> Result<()> {
if self.status() == "running" {
anyhow::bail!("The endpoint is already running");
}
// Slurp the endpoints/<endpoint id>/postgresql.conf file into
// memory. We will include it in the spec file that we pass to
// `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
// in the data directory.
let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
Ok(content) => String::from_utf8(content)?,
Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
Err(e) => {
return Err(anyhow::Error::new(e).context(format!(
"failed to read config file in {}",
postgresql_conf_path.to_str().unwrap()
)))
}
};
// We always start the compute node from scratch, so if the Postgres
// data dir exists from a previous launch, remove it first.
if self.pgdata().exists() {
std::fs::remove_dir_all(self.pgdata())?;
}
let pageserver_connstring = {
let config = &self.pageserver.pg_connection_config;
let (host, port) = (config.host(), config.port());
// NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
format!("postgresql://no_user@{host}:{port}")
};
let mut safekeeper_connstrings = Vec::new();
if self.mode == ComputeMode::Primary {
for sk_id in safekeepers {
let sk = self
.env
.safekeepers
.iter()
.find(|node| node.id == sk_id)
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
}
}
// Create spec file
let spec = ComputeSpec {
format_version: 1.0,
operation_uuid: None,
cluster: Cluster {
cluster_id: None, // project ID: not used
name: None, // project name: not used
state: None,
roles: vec![],
databases: vec![],
settings: None,
postgresql_conf: Some(postgresql_conf),
},
delta_operations: None,
tenant_id: Some(self.tenant_id),
timeline_id: Some(self.timeline_id),
mode: self.mode,
pageserver_connstring: Some(pageserver_connstring),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
// Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
let logfile = std::fs::OpenOptions::new()
.create(true)
.append(true)
.open(self.endpoint_path().join("compute.log"))?;
// Launch compute_ctl
println!("Starting postgres node at '{}'", self.connstr());
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
cmd.args(["--http-port", &self.http_address.port().to_string()])
.args(["--pgdata", self.pgdata().to_str().unwrap()])
.args(["--connstr", &self.connstr()])
.args([
"--spec-path",
self.endpoint_path().join("spec.json").to_str().unwrap(),
])
.args([
"--pgbin",
self.env
.pg_bin_dir(self.pg_version)?
.join("postgres")
.to_str()
.unwrap(),
])
.stdin(std::process::Stdio::null())
.stderr(logfile.try_clone()?)
.stdout(logfile);
let _child = cmd.spawn()?;
// Wait for it to start
let mut attempt = 0;
const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
loop {
attempt += 1;
match self.get_status() {
Ok(state) => {
match state.status {
ComputeStatus::Init => {
if attempt == MAX_ATTEMPTS {
bail!("compute startup timed out; still in Init state");
}
// keep retrying
}
ComputeStatus::Running => {
// All good!
break;
}
ComputeStatus::Failed => {
bail!(
"compute startup failed: {}",
state
.error
.as_deref()
.unwrap_or("<no error from compute_ctl>")
);
}
ComputeStatus::Empty
| ComputeStatus::ConfigurationPending
| ComputeStatus::Configuration => {
bail!("unexpected compute status: {:?}", state.status)
}
}
}
Err(e) => {
if attempt == MAX_ATTEMPTS {
return Err(e).context(
"timed out waiting to connect to compute_ctl HTTP; last error: {e}",
);
}
}
}
std::thread::sleep(ATTEMPT_INTERVAL);
}
Ok(())
}
// Call the /status HTTP API
pub fn get_status(&self) -> Result<ComputeState> {
let client = reqwest::blocking::Client::new();
let response = client
.request(
reqwest::Method::GET,
format!(
"http://{}:{}/status",
self.http_address.ip(),
self.http_address.port()
),
// 1. We always start Postgres from scratch, so
// if old dir exists, preserve 'postgresql.conf' and drop the directory
let postgresql_conf_path = self.pgdata().join("postgresql.conf");
let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| {
format!(
"failed to read config file in {}",
postgresql_conf_path.to_str().unwrap()
)
.send()?;
})?;
fs::remove_dir_all(self.pgdata())?;
self.create_pgdata()?;
// Interpret the response
let status = response.status();
if !(status.is_client_error() || status.is_server_error()) {
Ok(response.json()?)
} else {
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
let url = response.url().to_owned();
let msg = match response.text() {
Ok(err_body) => format!("Error: {}", err_body),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
};
Err(anyhow::anyhow!(msg))
// 2. Bring back config files
fs::write(&postgresql_conf_path, postgresql_conf)?;
// 3. Load basebackup
self.load_basebackup(auth_token)?;
if self.mode != ComputeMode::Primary {
File::create(self.pgdata().join("standby.signal"))?;
}
// 4. Finally start postgres
println!("Starting postgres at '{}'", self.connstr());
self.pg_ctl(&["start"], auth_token)
}
pub fn stop(&self, destroy: bool) -> Result<()> {
@@ -594,7 +534,7 @@ impl Endpoint {
"Destroying postgres data directory '{}'",
self.pgdata().to_str().unwrap()
);
std::fs::remove_dir_all(self.endpoint_path())?;
fs::remove_dir_all(self.endpoint_path())?;
} else {
self.pg_ctl(&["stop"], &None)?;
}
@@ -603,10 +543,10 @@ impl Endpoint {
pub fn connstr(&self) -> String {
format!(
"postgresql://{}@{}:{}/{}",
"host={} port={} user={} dbname={}",
self.address.ip(),
self.address.port(),
"cloud_admin",
self.pg_address.ip(),
self.pg_address.port(),
"postgres"
)
}

View File

@@ -24,7 +24,7 @@ use utils::{
use crate::safekeeper::SafekeeperNode;
pub const DEFAULT_PG_VERSION: u32 = 15;
pub const DEFAULT_PG_VERSION: u32 = 14;
//
// This data structures represents neon_local CLI config
@@ -37,7 +37,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct LocalEnv {
// Base directory for all the nodes (the pageserver, safekeepers and
// compute endpoints).
// compute nodes).
//
// This is not stored in the config file. Rather, this is the path where the
// config file itself is. It is read from the NEON_REPO_DIR env variable or

View File

@@ -1,9 +1,3 @@
//! Code to manage pageservers
//!
//! In the local test environment, the pageserver stores its data directly in
//!
//! .neon/
//!
use std::borrow::Cow;
use std::collections::HashMap;
use std::fs::File;
@@ -14,7 +8,9 @@ use std::process::{Child, Command};
use std::{io, result};
use anyhow::{bail, Context};
use pageserver_api::models::{self, TenantInfo, TimelineInfo};
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
};
use postgres_backend::AuthType;
use postgres_connection::{parse_host_port, PgConnectionConfig};
use reqwest::blocking::{Client, RequestBuilder, Response};
@@ -320,8 +316,8 @@ impl PageServerNode {
settings: HashMap<&str, &str>,
) -> anyhow::Result<TenantId> {
let mut settings = settings.clone();
let config = models::TenantConfig {
let request = TenantCreateRequest {
new_tenant_id,
checkpoint_distance: settings
.remove("checkpoint_distance")
.map(|x| x.parse::<u64>())
@@ -375,19 +371,6 @@ impl PageServerNode {
evictions_low_residence_duration_metric_threshold: settings
.remove("evictions_low_residence_duration_metric_threshold")
.map(|x| x.to_string()),
gc_feedback: settings
.remove("gc_feedback")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
};
// If tenant ID was not specified, generate one
let new_tenant_id = new_tenant_id.unwrap_or(TenantId::generate());
let request = models::TenantCreateRequest {
new_tenant_id,
config,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -408,86 +391,67 @@ impl PageServerNode {
})
}
pub fn tenant_config(
&self,
tenant_id: TenantId,
mut settings: HashMap<&str, &str>,
) -> anyhow::Result<()> {
let config = {
// Braces to make the diff easier to read
models::TenantConfig {
pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> {
self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
.json(&TenantConfigRequest {
tenant_id,
checkpoint_distance: settings
.remove("checkpoint_distance")
.get("checkpoint_distance")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'checkpoint_distance' as an integer")?,
checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()),
checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()),
compaction_target_size: settings
.remove("compaction_target_size")
.get("compaction_target_size")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'compaction_target_size' as an integer")?,
compaction_period: settings.remove("compaction_period").map(|x| x.to_string()),
compaction_period: settings.get("compaction_period").map(|x| x.to_string()),
compaction_threshold: settings
.remove("compaction_threshold")
.get("compaction_threshold")
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'compaction_threshold' as an integer")?,
gc_horizon: settings
.remove("gc_horizon")
.get("gc_horizon")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'gc_horizon' as an integer")?,
gc_period: settings.remove("gc_period").map(|x| x.to_string()),
gc_period: settings.get("gc_period").map(|x| x.to_string()),
image_creation_threshold: settings
.remove("image_creation_threshold")
.get("image_creation_threshold")
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout")
.map(|x| x.to_string()),
lagging_wal_timeout: settings
.remove("lagging_wal_timeout")
.get("walreceiver_connect_timeout")
.map(|x| x.to_string()),
lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()),
max_lsn_wal_lag: settings
.remove("max_lsn_wal_lag")
.get("max_lsn_wal_lag")
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
trace_read_requests: settings
.remove("trace_read_requests")
.get("trace_read_requests")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'trace_read_requests' as bool")?,
eviction_policy: settings
.remove("eviction_policy")
.map(serde_json::from_str)
.get("eviction_policy")
.map(|x| serde_json::from_str(x))
.transpose()
.context("Failed to parse 'eviction_policy' json")?,
min_resident_size_override: settings
.remove("min_resident_size_override")
.get("min_resident_size_override")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'min_resident_size_override' as an integer")?,
evictions_low_residence_duration_metric_threshold: settings
.remove("evictions_low_residence_duration_metric_threshold")
.get("evictions_low_residence_duration_metric_threshold")
.map(|x| x.to_string()),
gc_feedback: settings
.remove("gc_feedback")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
}
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
}
self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
.json(&models::TenantConfigRequest { tenant_id, config })
})
.send()?
.error_from_body()?;
@@ -515,14 +479,11 @@ impl PageServerNode {
ancestor_timeline_id: Option<TimelineId>,
pg_version: Option<u32>,
) -> anyhow::Result<TimelineInfo> {
// If timeline ID was not specified, generate one
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
self.http_request(
Method::POST,
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
)?
.json(&models::TimelineCreateRequest {
.json(&TimelineCreateRequest {
new_timeline_id,
ancestor_start_lsn,
ancestor_timeline_id,

View File

@@ -1,9 +1,3 @@
//! Code to manage safekeepers
//!
//! In the local test environment, the data for each safekeeper is stored in
//!
//! .neon/safekeepers/<safekeeper id>
//!
use std::io::Write;
use std::path::PathBuf;
use std::process::Child;

View File

@@ -1,14 +1,6 @@
#!/bin/bash
set -eux
# Generate a random tenant or timeline ID
#
# Takes a variable name as argument. The result is stored in that variable.
generate_id() {
local -n resvar=$1
printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
}
PG_VERSION=${PG_VERSION:-14}
SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
@@ -21,29 +13,29 @@ done
echo "Page server is ready."
echo "Create a tenant and timeline"
generate_id tenant_id
PARAMS=(
-sb
-X POST
-H "Content-Type: application/json"
-d "{\"new_tenant_id\": \"${tenant_id}\"}"
-d "{}"
http://pageserver:9898/v1/tenant/
)
result=$(curl "${PARAMS[@]}")
echo $result | jq .
tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g')
generate_id timeline_id
PARAMS=(
-sb
-X POST
-H "Content-Type: application/json"
-d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"
-d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}"
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
)
result=$(curl "${PARAMS[@]}")
echo $result | jq .
echo "Overwrite tenant id and timeline id in spec file"
tenant_id=$(echo ${result} | jq -r .tenant_id)
timeline_id=$(echo ${result} | jq -r .timeline_id)
sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE}
sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}

View File

@@ -4,11 +4,6 @@ The pageserver uses Tokio for handling concurrency. Everything runs in
Tokio tasks, although some parts are written in blocking style and use
spawn_blocking().
We currently use std blocking functions for disk I/O, however. The
current model is that we consider disk I/Os to be short enough that we
perform them while running in a Tokio task. Changing all the disk I/O
calls to async is a TODO.
Each Tokio task is tracked by the `task_mgr` module. It maintains a
registry of tasks, and which tenant or timeline they are operating
on.
@@ -26,84 +21,19 @@ also a `shudown_watcher()` Future that can be used with `tokio::select!`
or similar, to wake up on shutdown.
### Async cancellation safety
### Sync vs async
In async Rust, futures can be "cancelled" at any await point, by
dropping the Future. For example, `tokio::select!` returns as soon as
one of the Futures returns, and drops the others. `tokio::timeout!` is
another example. In the Rust ecosystem, some functions are
cancellation-safe, meaning they can be safely dropped without
side-effects, while others are not. See documentation of
`tokio::select!` for examples.
We use async to wait for incoming data on network connections, and to
perform other long-running operations. For example, each WAL receiver
connection is handled by a tokio Task. Once a piece of WAL has been
received from the network, the task calls the blocking functions in
the Repository to process the WAL.
In the pageserver and safekeeper, async code is *not*
cancellation-safe by default. Unless otherwise marked, any async
function that you call cannot be assumed to be async
cancellation-safe, and must be polled to completion.
The core storage code in `layered_repository/` is synchronous, with
blocking locks and I/O calls. The current model is that we consider
disk I/Os to be short enough that we perform them while running in a
Tokio task. If that becomes a problem, we should use `spawn_blocking`
before entering the synchronous parts of the code, or switch to using
tokio I/O functions.
The downside of non-cancellation safe code is that you have to be very
careful when using `tokio::select!`, `tokio::timeout!`, and other such
functions that can cause a Future to be dropped. They can only be used
with functions that are explicitly documented to be cancellation-safe,
or you need to spawn a separate task to shield from the cancellation.
At the entry points to the code, we also take care to poll futures to
completion, or shield the rest of the code from surprise cancellations
by spawning a separate task. The code that handles incoming HTTP
requests, for example, spawns a separate task for each request,
because Hyper will drop the request-handling Future if the HTTP
connection is lost.
#### How to cancel, then?
If our code is not cancellation-safe, how do you cancel long-running
tasks? Use CancellationTokens.
TODO: More details on that. And we have an ongoing discussion on what
to do if cancellations might come from multiple sources.
#### Exceptions
Some library functions are cancellation-safe, and are explicitly marked
as such. For example, `utils::seqwait`.
#### Rationale
The alternative would be to make all async code cancellation-safe,
unless otherwise marked. That way, you could use `tokio::select!` more
liberally. The reasons we didn't choose that are explained in this
section.
Writing code in a cancellation-safe manner is tedious, as you need to
scrutinize every `.await` and ensure that if the `.await` call never
returns, the system is in a safe, consistent state. In some ways, you
need to do that with `?` and early `returns`, too, but `.await`s are
easier to miss. It is also easier to perform cleanup tasks when a
function returns an `Err` than when an `.await` simply never
returns. You can use `scopeguard` and Drop guards to perform cleanup
tasks, but it is more tedious. An `.await` that never returns is more
similar to a panic.
Note that even if you only use building blocks that themselves are
cancellation-safe, it doesn't mean that the code as whole is
cancellation-safe. For example, consider the following code:
```
while let Some(i) = work_inbox.recv().await {
if let Err(_) = results_outbox.send(i).await {
println!("receiver dropped");
return;
}
}
}
```
It reads messages from one channel, sends them to another channel. If
this code is cancelled at the `results_outbox.send(i).await`, the
message read from the receiver is lost. That may or may not be OK,
depending on the context.
Another reason to not require cancellation-safety is historical: we
already had a lot of async code that was not scrutinized for
cancellation-safety when this issue was raised. Scrutinizing all
existing code is no fun.
Be very careful when mixing sync and async code!

View File

@@ -1,232 +0,0 @@
# The state of pageserver tenant relocation
Created on 17.03.23
## Motivation
There were previous write ups on the subject. The design of tenant relocation was planned at the time when we had quite different landscape. I e there was no on-demand download/eviction. They were on the horizon but we still planned for cases when they were not available. Some other things have changed. Now safekeepers offload wal to s3 so we're not risking overflowing their disks. Having all of the above, it makes sense to recap and take a look at the options we have now, which adjustments we'd like to make to original process, etc.
Related (in chronological order):
- Tracking issue with initial discussion: [#886](https://github.com/neondatabase/neon/issues/886)
- [015. Storage Messaging](015-storage-messaging.md)
- [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md)
## Summary
The RFC consists of a walkthrough of prior art on tenant relocation and corresponding problems. It describes 3 approaches.
1. Simplistic approach that uses ignore and is the fastest to implement. The main downside is a requirement of short downtime.
2. More complicated approach that avoids even short downtime.
3. Even more complicated approach that will allow multiple pageservers to operate concurrently on the same tenant possibly allowing for HA cluster topologies and horizontal scaling of reads (i e compute talks to multiple pageservers).
The order in which solutions are described is a bit different. We start from 2, then move to possible compromises (aka simplistic approach) and then move to discussing directions for solving HA/Pageserver replica case with 3.
## Components
pageserver, control-plane, safekeepers (a bit)
## Requirements
Relocation procedure should move tenant from one pageserver to another without downtime introduced by storage side. For now restarting compute for applying new configuration is fine.
- component restarts
- component outage
- pageserver loss
## The original proposed implementation
The starting point is this sequence:
```mermaid
sequenceDiagram
autonumber
participant CP as Control Plane
participant PS1 as Pageserver 1
participant PS2 as Pageserver 2
participant S3
CP->>PS2: Attach tenant X
PS2->>S3: Fetch timelines, indexes for them
PS2->>CP: Accepted
CP->>CP: Change pageserver id in project
CP->>PS1: Detach
```
Which problems do we have with naive approach?
### Concurrent GC and Compaction
The problem is that they can run on both, PS1 and PS2. Consider this example from [Pageserver S3 Coordination RFC](020-pageserver-s3-coordination.md)
```mermaid
sequenceDiagram
autonumber
participant PS1
participant S3
participant PS2
PS1->>S3: Uploads L1, L2 <br/> Index contains L1 L2
PS2->>S3: Attach called, sees L1, L2
PS1->>S3: Compaction comes <br/> Removes L1, adds L3
note over S3: Index now L2, L3
PS2->>S3: Uploads new layer L4 <br/> (added to previous view of the index)
note over S3: Index now L1, L2, L4
```
At this point it is not possible to restore the state from index, it contains L2 which
is no longer available in s3 and doesnt contain L3 added by compaction by the
first pageserver. So if any of the pageservers restart, initial sync will fail
(or in on-demand world it will fail a bit later during page request from
missing layer)
The problem lies in shared index_part.json. Having intersecting layers from append only edits is expected to work, though this is an uncharted territory without tests.
#### Options
There are several options on how to restrict concurrent access to index file.
First and the simplest one is external orchestration. Control plane which runs migration can use special api call on pageserver to stop background processes (gc, compaction), and even possibly all uploads.
So the sequence becomes:
```mermaid
sequenceDiagram
autonumber
participant CP as Control Plane
participant PS1 as Pageserver 1
participant PS2 as Pageserver 2
participant S3
CP->>PS1: Pause background jobs, pause uploading new layers.
CP->>PS2: Attach tenant X.
PS2->>S3: Fetch timelines, index, start background operations
PS2->>CP: Accepted
CP->>CP: Monitor PS2 last record lsn, ensure OK lag
CP->>CP: Change pageserver id in project
CP->>PS1: Detach
```
The downside of this sequence is the potential rollback process. What if something goes wrong on new pageserver? Can we safely roll back to source pageserver?
There are two questions:
#### How can we detect that something went wrong?
We can run usual availability check (consists of compute startup and an update of one row).
Note that we cant run separate compute for that before touching compute that client runs actual workload on, because we cant have two simultaneous computes running in read-write mode on the same timeline (enforced by safekeepers consensus algorithm). So we can either run some readonly check first (basebackup) and then change pageserver id and run availability check. If it failed we can roll it back to the old one.
#### What can go wrong? And how we can safely roll-back?
In the sequence above during attach we start background processes/uploads. They change state in remote storage so it is possible that after rollback remote state will be different from one that was observed by source pageserver. So if target pageserver goes wild then source pageserver may fail to start with changed remote state.
Proposed option would be to implement a barrier (read-only) mode when pageserver does not update remote state.
So the sequence for happy path becomes this one:
```mermaid
sequenceDiagram
autonumber
participant CP as Control Plane
participant PS1 as Pageserver 1
participant PS2 as Pageserver 2
participant S3
CP->>PS1: Pause background jobs, pause uploading new layers.
CP->>PS2: Attach tenant X in remote readonly mode.
PS2->>S3: Fetch timelines, index
PS2->>CP: Accepted
CP->>CP: Monitor PS2 last record lsn, ensure OK lag
CP->>CP: Change pageserver id in project
CP->>CP: Run successful availability check
CP->>PS2: Start uploads, background tasks
CP->>PS1: Detach
```
With this sequence we restrict any changes to remote storage to one pageserver. So there is no concurrent access at all, not only for index_part.json, but for everything else too. This approach makes it possible to roll back after failure on new pageserver.
The sequence with roll back process:
```mermaid
sequenceDiagram
autonumber
participant CP as Control Plane
participant PS1 as Pageserver 1
participant PS2 as Pageserver 2
participant S3
CP->>PS1: Pause background jobs, pause uploading new layers.
CP->>PS2: Attach tenant X in remote readonly mode.
PS2->>S3: Fetch timelines, index
PS2->>CP: Accepted
CP->>CP: Monitor PS2 last record lsn, ensure OK lag
CP->>CP: Change pageserver id in project
CP->>CP: Availability check Failed
CP->>CP: Change pageserver id back
CP->>PS1: Resume remote operations
CP->>PS2: Ignore (instead of detach for investigation purposes)
```
## Concurrent branch creation
Another problem is a possibility of concurrent branch creation calls.
I e during migration create_branch can be called on old pageserver and newly created branch wont be seen on new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we dont need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state.
## Simplistic approach
The difference of simplistic approach from one described above is that it calls ignore on source tenant first and then calls attach on target pageserver. Approach above does it in opposite order thus opening a possibility for race conditions we strive to avoid.
The approach largely follows this guide: <https://github.com/neondatabase/cloud/wiki/Cloud:-Ad-hoc-tenant-relocation>
The happy path sequence:
```mermaid
sequenceDiagram
autonumber
participant CP as Control Plane
participant PS1 as Pageserver 1
participant PS2 as Pageserver 2
participant SK as Safekeeper
participant S3
CP->>CP: Enable maintenance mode
CP->>PS1: Ignore
CP->>PS2: Attach
PS2->>CP: Accepted
loop Delete layers for each timeline
CP->>PS2: Get last record lsn
CP->>SK: Get commit lsn
CP->>CP: OK? Timed out?
end
CP->>CP: Change pageserver id in project
CP->>CP: Run successful availability check
CP->>CP: Disable maintenance mode
CP->>PS1: Detach ignored
```
The sequence contains exactly the same rollback problems as in previous approach described above. They can be resolved the same way.
Most probably we'd like to move forward without this safety measure and implement it on top of this approach to make progress towards the downtime-less one.
## Lease based approach
In order to allow for concurrent operation on the same data on remote storage for multiple pageservers we need to go further than external orchestration.
NOTE: [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) discusses one more approach that relies on duplication of index_part.json for each pageserver operating on the timeline. This approach still requires external coordination which makes certain things easier but requires additional bookkeeping to account for multiple index_part.json files. Discussion/comparison with proposed lease based approach
The problems are outlined in [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) and suggested solution includes [Coordination based approach](020-pageserver-s3-coordination.md#coordination-based-approach). This way it will allow to do basic leader election for pageservers so they can decide which node will be responsible for running GC and compaction. The process is based on extensive communication via storage broker and consists of a lease that is taken by one of the pageservers that extends it to continue serving a leader role.
There are two options for ingesting new data into pageserver in follower role. One option is to avoid WAL ingestion at all and rely on notifications from leader to discover new layers on s3. Main downside of this approach is that follower will always lag behind the primary node because it wont have the last layer until it is uploaded to remote storage. In case of a primary failure follower will be required to reingest last segment (up to 256Mb of WAL currently) which slows down recovery. Additionally if compute is connected to follower pageserver it will observe latest data with a delay. Queries from compute will likely experience bigger delays when recent lsn is required.
The second option is to consume WAL stream on both pageservers. In this case the only problem is non deterministic layer generation. Additional bookkeeping will be required to deduplicate layers from primary with local ones. Some process needs to somehow merge them to remove duplicated data. Additionally we need to have good testing coverage to ensure that our implementation of `get_page@lsn` properly handles intersecting layers.
There is another tradeoff. Approaches may be different in amount of traffic between system components. With first approach there can be increased traffic between follower and remote storage. But only in case follower has some activity that actually requests pages (!). With other approach traffic increase will be permanent and will be caused by two WAL streams instead of one.
## Summary
Proposed implementation strategy:
Go with the simplest approach for now. Then work on tech debt, increase test coverage. Then gradually move forward to second approach by implementing safety measures first, finishing with switch of order between ignore and attach operation.
And only then go to lease based approach to solve HA/Pageserver replica use cases.

View File

@@ -5,13 +5,13 @@ use serde::{Deserialize, Serialize, Serializer};
use crate::spec::ComputeSpec;
#[derive(Serialize, Debug, Deserialize)]
#[derive(Serialize, Debug)]
pub struct GenericAPIError {
pub error: String,
}
/// Response of the /status API
#[derive(Serialize, Debug, Deserialize)]
#[derive(Serialize, Debug)]
#[serde(rename_all = "snake_case")]
pub struct ComputeStatusResponse {
pub start_time: DateTime<Utc>,
@@ -23,7 +23,7 @@ pub struct ComputeStatusResponse {
pub error: Option<String>,
}
#[derive(Deserialize, Serialize)]
#[derive(Serialize)]
#[serde(rename_all = "snake_case")]
pub struct ComputeState {
pub status: ComputeStatus,
@@ -33,7 +33,7 @@ pub struct ComputeState {
pub error: Option<String>,
}
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ComputeStatus {
// Spec wasn't provided at start, waiting for it to be

View File

@@ -5,7 +5,6 @@
//! and connect it to the storage nodes.
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
/// String type alias representing Postgres identifier and
@@ -15,7 +14,7 @@ pub type PgIdent = String;
/// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description.
#[serde_as]
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
#[derive(Clone, Debug, Default, Deserialize)]
pub struct ComputeSpec {
pub format_version: f32,
@@ -27,32 +26,9 @@ pub struct ComputeSpec {
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
// Information needed to connect to the storage layer.
//
// `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed.
//
// Depending on `mode`, this can be a primary read-write node, a read-only
// replica, or a read-only node pinned at an older LSN.
// `safekeeper_connstrings` must be set for a primary.
//
// For backwards compatibility, the control plane may leave out all of
// these, and instead set the "neon.tenant_id", "neon.timeline_id",
// etc. GUCs in cluster.settings. TODO: Once the control plane has been
// updated to fill these fields, we can make these non optional.
#[serde_as(as = "Option<DisplayFromStr>")]
pub tenant_id: Option<TenantId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub timeline_id: Option<TimelineId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub pageserver_connstring: Option<String>,
#[serde(default)]
pub safekeeper_connstrings: Vec<String>,
#[serde(default)]
pub mode: ComputeMode,
/// If set, 'storage_auth_token' is used as the password to authenticate to
/// the pageserver and safekeepers.
pub storage_auth_token: Option<String>,
}
@@ -71,19 +47,13 @@ pub enum ComputeMode {
Replica,
}
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
#[derive(Clone, Debug, Default, Deserialize)]
pub struct Cluster {
pub cluster_id: Option<String>,
pub name: Option<String>,
pub cluster_id: String,
pub name: String,
pub state: Option<String>,
pub roles: Vec<Role>,
pub databases: Vec<Database>,
/// Desired contents of 'postgresql.conf' file. (The 'compute_ctl'
/// tool may add additional settings to the final file.)
pub postgresql_conf: Option<String>,
/// Additional settings that will be appended to the 'postgresql.conf' file.
pub settings: GenericOptions,
}
@@ -93,7 +63,7 @@ pub struct Cluster {
/// - DROP ROLE
/// - ALTER ROLE name RENAME TO new_name
/// - ALTER DATABASE name RENAME TO new_name
#[derive(Clone, Debug, Deserialize, Serialize)]
#[derive(Clone, Debug, Deserialize)]
pub struct DeltaOp {
pub action: String,
pub name: PgIdent,
@@ -102,7 +72,7 @@ pub struct DeltaOp {
/// Rust representation of Postgres role info with only those fields
/// that matter for us.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[derive(Clone, Debug, Deserialize)]
pub struct Role {
pub name: PgIdent,
pub encrypted_password: Option<String>,
@@ -111,7 +81,7 @@ pub struct Role {
/// Rust representation of Postgres database info with only those fields
/// that matter for us.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[derive(Clone, Debug, Deserialize)]
pub struct Database {
pub name: PgIdent,
pub owner: PgIdent,
@@ -121,7 +91,7 @@ pub struct Database {
/// Common type representing both SQL statement params with or without value,
/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
/// options like `wal_level = logical`.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[derive(Clone, Debug, Deserialize)]
pub struct GenericOption {
pub name: String,
pub value: Option<String>,

View File

@@ -18,29 +18,7 @@ use crate::reltag::RelTag;
use anyhow::bail;
use bytes::{BufMut, Bytes, BytesMut};
/// The state of a tenant in this pageserver.
///
/// ```mermaid
/// stateDiagram-v2
///
/// [*] --> Loading: spawn_load()
/// [*] --> Attaching: spawn_attach()
///
/// Loading --> Activating: activate()
/// Attaching --> Activating: activate()
/// Activating --> Active: infallible
///
/// Loading --> Broken: load() failure
/// Attaching --> Broken: attach() failure
///
/// Active --> Stopping: set_stopping(), part of shutdown & detach
/// Stopping --> Broken: late error in remove_tenant_from_memory
///
/// Broken --> [*]: ignore / detach / shutdown
/// Stopping --> [*]: remove_from_memory complete
///
/// Active --> Broken: cfg(testing)-only tenant break point
/// ```
/// A state of a tenant in pageserver's memory.
#[derive(
Clone,
PartialEq,
@@ -48,73 +26,51 @@ use bytes::{BufMut, Bytes, BytesMut};
serde::Serialize,
serde::Deserialize,
strum_macros::Display,
strum_macros::EnumString,
strum_macros::EnumVariantNames,
strum_macros::AsRefStr,
strum_macros::IntoStaticStr,
)]
#[serde(tag = "slug", content = "data")]
pub enum TenantState {
/// This tenant is being loaded from local disk.
///
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
/// This tenant is being loaded from local disk
Loading,
/// This tenant is being attached to the pageserver.
///
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
/// This tenant is being downloaded from cloud storage.
Attaching,
/// The tenant is transitioning from Loading/Attaching to Active.
///
/// While in this state, the individual timelines are being activated.
///
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
Activating(ActivatingFrom),
/// The tenant has finished activating and is open for business.
///
/// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
/// Tenant is fully operational
Active,
/// The tenant is recognized by pageserver, but it is being detached or the
/// A tenant is recognized by pageserver, but it is being detached or the
/// system is being shut down.
///
/// Transitions out of this state are possible through `set_broken()`.
Stopping,
/// The tenant is recognized by the pageserver, but can no longer be used for
/// any operations.
///
/// If the tenant fails to load or attach, it will transition to this state
/// and it is guaranteed that no background tasks are running in its name.
///
/// The other way to transition into this state is from `Stopping` state
/// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
/// if the cleanup future executed by `remove_tenant_from_memory()` fails.
/// A tenant is recognized by the pageserver, but can no longer be used for
/// any operations, because it failed to be activated.
Broken { reason: String, backtrace: String },
}
impl TenantState {
pub fn attachment_status(&self) -> TenantAttachmentStatus {
use TenantAttachmentStatus::*;
// Below TenantState::Activating is used as "transient" or "transparent" state for
// attachment_status determining.
match self {
// The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
// So, technically, we can return Attached here.
// However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
// But, our attach task might still be fetching the remote timelines, etc.
// So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
Self::Attaching => Maybe,
// tenant mgr startup distinguishes attaching from loading via marker file.
// If it's loading, there is no attach marker file, i.e., attach had finished in the past.
Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
Self::Loading => Attached,
// We only reach Active after successful load / attach.
// So, call atttachment status Attached.
Self::Active => Attached,
// If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
// However, it also becomes Broken if the regular load fails.
// From Console's perspective there's no practical difference
// because attachment_status is polled by console only during attach operation execution.
Self::Broken { reason, .. } => Failed {
reason: reason.to_owned(),
},
// We would need a separate TenantState variant to distinguish these cases.
// However, there's no practical difference from Console's perspective.
// It will run a Postgres-level health check as soon as it observes Attached.
// That will fail on Broken tenants.
// Console can then rollback the attach, or, wait for operator to fix the Broken tenant.
Self::Broken { .. } => Attached,
// Why is Stopping a Maybe case? Because, during pageserver shutdown,
// we set the Stopping state irrespective of whether the tenant
// has finished attaching or not.
@@ -142,15 +98,6 @@ impl std::fmt::Debug for TenantState {
}
}
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum ActivatingFrom {
/// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`]
Loading,
/// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
Attaching,
}
/// A state of a timeline in pageserver's memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TimelineState {
@@ -171,8 +118,9 @@ pub enum TimelineState {
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
#[serde_as(as = "DisplayFromStr")]
pub new_timeline_id: TimelineId,
#[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub new_timeline_id: Option<TimelineId>,
#[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_timeline_id: Option<TimelineId>,
@@ -183,25 +131,11 @@ pub struct TimelineCreateRequest {
}
#[serde_as]
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
#[derive(Serialize, Deserialize, Default)]
pub struct TenantCreateRequest {
#[serde_as(as = "DisplayFromStr")]
pub new_tenant_id: TenantId,
#[serde(flatten)]
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
}
impl std::ops::Deref for TenantCreateRequest {
type Target = TenantConfig;
fn deref(&self) -> &Self::Target {
&self.config
}
}
#[derive(Serialize, Deserialize, Debug, Default)]
pub struct TenantConfig {
#[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub new_tenant_id: Option<TenantId>,
pub checkpoint_distance: Option<u64>,
pub checkpoint_timeout: Option<String>,
pub compaction_target_size: Option<u64>,
@@ -222,7 +156,6 @@ pub struct TenantConfig {
pub eviction_policy: Option<serde_json::Value>,
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
}
#[serde_as]
@@ -236,35 +169,46 @@ pub struct StatusResponse {
}
impl TenantCreateRequest {
pub fn new(new_tenant_id: TenantId) -> TenantCreateRequest {
pub fn new(new_tenant_id: Option<TenantId>) -> TenantCreateRequest {
TenantCreateRequest {
new_tenant_id,
config: TenantConfig::default(),
..Default::default()
}
}
}
#[serde_as]
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
#[derive(Serialize, Deserialize)]
pub struct TenantConfigRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde(flatten)]
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
}
impl std::ops::Deref for TenantConfigRequest {
type Target = TenantConfig;
fn deref(&self) -> &Self::Target {
&self.config
}
#[serde(default)]
pub checkpoint_distance: Option<u64>,
pub checkpoint_timeout: Option<String>,
pub compaction_target_size: Option<u64>,
pub compaction_period: Option<String>,
pub compaction_threshold: Option<usize>,
pub gc_horizon: Option<u64>,
pub gc_period: Option<String>,
pub image_creation_threshold: Option<usize>,
pub pitr_interval: Option<String>,
pub walreceiver_connect_timeout: Option<String>,
pub lagging_wal_timeout: Option<String>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
// We defer the parsing of the eviction_policy field to the request handler.
// Otherwise we'd have to move the types for eviction policy into this package.
// We might do that once the eviction feature has stabilizied.
// For now, this field is not even documented in the openapi_spec.yml.
pub eviction_policy: Option<serde_json::Value>,
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
}
impl TenantConfigRequest {
pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
let config = TenantConfig {
TenantConfigRequest {
tenant_id,
checkpoint_distance: None,
checkpoint_timeout: None,
compaction_target_size: None,
@@ -281,41 +225,16 @@ impl TenantConfigRequest {
eviction_policy: None,
min_resident_size_override: None,
evictions_low_residence_duration_metric_threshold: None,
gc_feedback: None,
};
TenantConfigRequest { tenant_id, config }
}
}
#[derive(Debug, Serialize, Deserialize)]
pub struct TenantAttachRequest {
pub config: TenantAttachConfig,
}
/// Newtype to enforce deny_unknown_fields on TenantConfig for
/// its usage inside `TenantAttachRequest`.
#[derive(Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct TenantAttachConfig {
#[serde(flatten)]
allowing_unknown_fields: TenantConfig,
}
impl std::ops::Deref for TenantAttachConfig {
type Target = TenantConfig;
fn deref(&self) -> &Self::Target {
&self.allowing_unknown_fields
}
}
}
/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
#[derive(Serialize, Deserialize, Clone)]
#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
#[serde(rename_all = "snake_case")]
pub enum TenantAttachmentStatus {
Maybe,
Attached,
Failed { reason: String },
}
#[serde_as]
@@ -809,9 +728,7 @@ mod tests {
"slug": "Active",
},
"current_physical_size": 42,
"attachment_status": {
"slug":"attached",
}
"attachment_status": "attached",
});
let original_broken = TenantInfo {
@@ -833,9 +750,7 @@ mod tests {
}
},
"current_physical_size": 42,
"attachment_status": {
"slug":"attached",
}
"attachment_status": "attached",
});
assert_eq!(
@@ -850,94 +765,4 @@ mod tests {
assert!(format!("{:?}", &original_broken.state).contains("reason"));
assert!(format!("{:?}", &original_broken.state).contains("backtrace info"));
}
#[test]
fn test_reject_unknown_field() {
let id = TenantId::generate();
let create_request = json!({
"new_tenant_id": id.to_string(),
"unknown_field": "unknown_value".to_string(),
});
let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
assert!(
err.to_string().contains("unknown field `unknown_field`"),
"expect unknown field `unknown_field` error, got: {}",
err
);
let id = TenantId::generate();
let config_request = json!({
"tenant_id": id.to_string(),
"unknown_field": "unknown_value".to_string(),
});
let err = serde_json::from_value::<TenantConfigRequest>(config_request).unwrap_err();
assert!(
err.to_string().contains("unknown field `unknown_field`"),
"expect unknown field `unknown_field` error, got: {}",
err
);
let attach_request = json!({
"config": {
"unknown_field": "unknown_value".to_string(),
},
});
let err = serde_json::from_value::<TenantAttachRequest>(attach_request).unwrap_err();
assert!(
err.to_string().contains("unknown field `unknown_field`"),
"expect unknown field `unknown_field` error, got: {}",
err
);
}
#[test]
fn tenantstatus_activating_serde() {
let states = [
TenantState::Activating(ActivatingFrom::Loading),
TenantState::Activating(ActivatingFrom::Attaching),
];
let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
let actual = serde_json::to_string(&states).unwrap();
assert_eq!(actual, expected);
let parsed = serde_json::from_str::<Vec<TenantState>>(&actual).unwrap();
assert_eq!(states.as_slice(), &parsed);
}
#[test]
fn tenantstatus_activating_strum() {
// tests added, because we use these for metrics
let examples = [
(line!(), TenantState::Loading, "Loading"),
(line!(), TenantState::Attaching, "Attaching"),
(
line!(),
TenantState::Activating(ActivatingFrom::Loading),
"Activating",
),
(
line!(),
TenantState::Activating(ActivatingFrom::Attaching),
"Activating",
),
(line!(), TenantState::Active, "Active"),
(line!(), TenantState::Stopping, "Stopping"),
(
line!(),
TenantState::Broken {
reason: "Example".into(),
backtrace: "Looooong backtrace".into(),
},
"Broken",
),
];
for (line, rendered, expected) in examples {
let actual: &'static str = rendered.into();
assert_eq!(actual, expected, "example on {line}");
}
}
}

View File

@@ -24,6 +24,7 @@ workspace_hack.workspace = true
[dev-dependencies]
env_logger.workspace = true
postgres.workspace = true
wal_craft = { path = "wal_craft" }
[build-dependencies]
anyhow.workspace = true

View File

@@ -33,7 +33,6 @@ macro_rules! postgres_ffi {
}
pub mod controlfile_utils;
pub mod nonrelfile_utils;
pub mod wal_craft_test_export;
pub mod waldecoder_handler;
pub mod xlog_utils;
@@ -46,15 +45,8 @@ macro_rules! postgres_ffi {
};
}
#[macro_export]
macro_rules! for_all_postgres_versions {
($macro:tt) => {
$macro!(v14);
$macro!(v15);
};
}
for_all_postgres_versions! { postgres_ffi }
postgres_ffi!(v14);
postgres_ffi!(v15);
pub mod pg_constants;
pub mod relfile_utils;

View File

@@ -1,6 +0,0 @@
//! This module is for WAL craft to test with postgres_ffi. Should not import any thing in normal usage.
pub use super::PG_MAJORVERSION;
pub use super::xlog_utils::*;
pub use super::bindings::*;
pub use crate::WAL_SEGMENT_SIZE;

View File

@@ -481,4 +481,220 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
wal
}
// If you need to craft WAL and write tests for this module, put it at wal_craft crate.
#[cfg(test)]
mod tests {
use super::super::PG_MAJORVERSION;
use super::*;
use regex::Regex;
use std::cmp::min;
use std::fs;
use std::{env, str::FromStr};
use utils::const_assert;
fn init_logging() {
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
format!("wal_craft=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
))
.is_test(true)
.try_init();
}
fn test_end_of_wal<C: wal_craft::Crafter>(test_name: &str) {
use wal_craft::*;
let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
// Craft some WAL
let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("..")
.join("..");
let cfg = Conf {
pg_version,
pg_distrib_dir: top_path.join("pg_install"),
datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
};
if cfg.datadir.exists() {
fs::remove_dir_all(&cfg.datadir).unwrap();
}
cfg.initdb().unwrap();
let srv = cfg.start_server().unwrap();
let (intermediate_lsns, expected_end_of_wal_partial) =
C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
let intermediate_lsns: Vec<Lsn> = intermediate_lsns
.iter()
.map(|&lsn| u64::from(lsn).into())
.collect();
let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
srv.kill();
// Check find_end_of_wal on the initial WAL
let last_segment = cfg
.wal_dir()
.read_dir()
.unwrap()
.map(|f| f.unwrap().file_name().into_string().unwrap())
.filter(|fname| IsXLogFileName(fname))
.max()
.unwrap();
check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
for start_lsn in intermediate_lsns
.iter()
.chain(std::iter::once(&expected_end_of_wal))
{
// Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
// We assume that `start_lsn` is non-decreasing.
info!(
"Checking with start_lsn={}, erasing WAL before it",
start_lsn
);
for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
let fname = file.file_name().into_string().unwrap();
if !IsXLogFileName(&fname) {
continue;
}
let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
if seg_start_lsn > u64::from(*start_lsn) {
continue;
}
let mut f = File::options().write(true).open(file.path()).unwrap();
const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
f.write_all(
&ZEROS[0..min(
WAL_SEGMENT_SIZE,
(u64::from(*start_lsn) - seg_start_lsn) as usize,
)],
)
.unwrap();
}
check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
}
}
fn check_pg_waldump_end_of_wal(
cfg: &wal_craft::Conf,
last_segment: &str,
expected_end_of_wal: Lsn,
) {
// Get the actual end of WAL by pg_waldump
let waldump_output = cfg
.pg_waldump("000000010000000000000001", last_segment)
.unwrap()
.stderr;
let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
let caps = match Regex::new(r"invalid record length at (.+):")
.unwrap()
.captures(waldump_output)
{
Some(caps) => caps,
None => {
error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
panic!();
}
};
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
info!(
"waldump erred on {}, expected wal end at {}",
waldump_wal_end, expected_end_of_wal
);
assert_eq!(waldump_wal_end, expected_end_of_wal);
}
fn check_end_of_wal(
cfg: &wal_craft::Conf,
last_segment: &str,
start_lsn: Lsn,
expected_end_of_wal: Lsn,
) {
// Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
// let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
// info!(
// "find_end_of_wal returned wal_end={} with non-partial WAL segment",
// wal_end
// );
// assert_eq!(wal_end, expected_end_of_wal_non_partial);
// Rename file to partial to actually find last valid lsn, then rename it back.
fs::rename(
cfg.wal_dir().join(last_segment),
cfg.wal_dir().join(format!("{}.partial", last_segment)),
)
.unwrap();
let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
info!(
"find_end_of_wal returned wal_end={} with partial WAL segment",
wal_end
);
assert_eq!(wal_end, expected_end_of_wal);
fs::rename(
cfg.wal_dir().join(format!("{}.partial", last_segment)),
cfg.wal_dir().join(last_segment),
)
.unwrap();
}
const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
#[test]
pub fn test_find_end_of_wal_simple() {
init_logging();
test_end_of_wal::<wal_craft::Simple>("test_find_end_of_wal_simple");
}
#[test]
pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
init_logging();
test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(
"test_find_end_of_wal_crossing_segment_followed_by_small_one",
);
}
#[test]
pub fn test_find_end_of_wal_last_crossing_segment() {
init_logging();
test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(
"test_find_end_of_wal_last_crossing_segment",
);
}
/// Check the math in update_next_xid
///
/// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
/// currently 1024.
#[test]
pub fn test_update_next_xid() {
let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
checkpoint.nextXid = FullTransactionId { value: 10 };
assert_eq!(checkpoint.nextXid.value, 10);
// The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
// boundary
checkpoint.update_next_xid(100);
assert_eq!(checkpoint.nextXid.value, 1024);
// No change
checkpoint.update_next_xid(500);
assert_eq!(checkpoint.nextXid.value, 1024);
checkpoint.update_next_xid(1023);
assert_eq!(checkpoint.nextXid.value, 1024);
// The function returns the *next* XID, given the highest XID seen so
// far. So when we pass 1024, the nextXid gets bumped up to the next
// XID_CHECKPOINT_INTERVAL boundary.
checkpoint.update_next_xid(1024);
assert_eq!(checkpoint.nextXid.value, 2048);
}
#[test]
pub fn test_encode_logical_message() {
let expected = [
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
];
let actual = encode_logical_message("prefix", "message");
assert_eq!(expected, actual[..]);
}
}

View File

@@ -15,7 +15,3 @@ postgres_ffi.workspace = true
tempfile.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
regex.workspace = true
utils.workspace = true

View File

@@ -10,20 +10,6 @@ use std::process::Command;
use std::time::{Duration, Instant};
use tempfile::{tempdir, TempDir};
macro_rules! xlog_utils_test {
($version:ident) => {
#[path = "."]
mod $version {
pub use postgres_ffi::$version::wal_craft_test_export::*;
#[allow(clippy::duplicate_mod)]
#[cfg(test)]
mod xlog_utils_test;
}
};
}
postgres_ffi::for_all_postgres_versions! { xlog_utils_test }
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Conf {
pub pg_version: u32,

View File

@@ -1,219 +0,0 @@
//! Tests for postgres_ffi xlog_utils module. Put it here to break cyclic dependency.
use super::*;
use crate::{error, info};
use regex::Regex;
use std::cmp::min;
use std::fs::{self, File};
use std::io::Write;
use std::{env, str::FromStr};
use utils::const_assert;
use utils::lsn::Lsn;
fn init_logging() {
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
))
.is_test(true)
.try_init();
}
fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
use crate::*;
let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
// Craft some WAL
let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("..")
.join("..")
.join("..");
let cfg = Conf {
pg_version,
pg_distrib_dir: top_path.join("pg_install"),
datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
};
if cfg.datadir.exists() {
fs::remove_dir_all(&cfg.datadir).unwrap();
}
cfg.initdb().unwrap();
let srv = cfg.start_server().unwrap();
let (intermediate_lsns, expected_end_of_wal_partial) =
C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
let intermediate_lsns: Vec<Lsn> = intermediate_lsns
.iter()
.map(|&lsn| u64::from(lsn).into())
.collect();
let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
srv.kill();
// Check find_end_of_wal on the initial WAL
let last_segment = cfg
.wal_dir()
.read_dir()
.unwrap()
.map(|f| f.unwrap().file_name().into_string().unwrap())
.filter(|fname| IsXLogFileName(fname))
.max()
.unwrap();
check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
for start_lsn in intermediate_lsns
.iter()
.chain(std::iter::once(&expected_end_of_wal))
{
// Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
// We assume that `start_lsn` is non-decreasing.
info!(
"Checking with start_lsn={}, erasing WAL before it",
start_lsn
);
for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
let fname = file.file_name().into_string().unwrap();
if !IsXLogFileName(&fname) {
continue;
}
let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
if seg_start_lsn > u64::from(*start_lsn) {
continue;
}
let mut f = File::options().write(true).open(file.path()).unwrap();
const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
f.write_all(
&ZEROS[0..min(
WAL_SEGMENT_SIZE,
(u64::from(*start_lsn) - seg_start_lsn) as usize,
)],
)
.unwrap();
}
check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
}
}
fn check_pg_waldump_end_of_wal(
cfg: &crate::Conf,
last_segment: &str,
expected_end_of_wal: Lsn,
) {
// Get the actual end of WAL by pg_waldump
let waldump_output = cfg
.pg_waldump("000000010000000000000001", last_segment)
.unwrap()
.stderr;
let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
let caps = match Regex::new(r"invalid record length at (.+):")
.unwrap()
.captures(waldump_output)
{
Some(caps) => caps,
None => {
error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
panic!();
}
};
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
info!(
"waldump erred on {}, expected wal end at {}",
waldump_wal_end, expected_end_of_wal
);
assert_eq!(waldump_wal_end, expected_end_of_wal);
}
fn check_end_of_wal(
cfg: &crate::Conf,
last_segment: &str,
start_lsn: Lsn,
expected_end_of_wal: Lsn,
) {
// Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
// let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
// info!(
// "find_end_of_wal returned wal_end={} with non-partial WAL segment",
// wal_end
// );
// assert_eq!(wal_end, expected_end_of_wal_non_partial);
// Rename file to partial to actually find last valid lsn, then rename it back.
fs::rename(
cfg.wal_dir().join(last_segment),
cfg.wal_dir().join(format!("{}.partial", last_segment)),
)
.unwrap();
let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
info!(
"find_end_of_wal returned wal_end={} with partial WAL segment",
wal_end
);
assert_eq!(wal_end, expected_end_of_wal);
fs::rename(
cfg.wal_dir().join(format!("{}.partial", last_segment)),
cfg.wal_dir().join(last_segment),
)
.unwrap();
}
const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
#[test]
pub fn test_find_end_of_wal_simple() {
init_logging();
test_end_of_wal::<crate::Simple>("test_find_end_of_wal_simple");
}
#[test]
pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
init_logging();
test_end_of_wal::<crate::WalRecordCrossingSegmentFollowedBySmallOne>(
"test_find_end_of_wal_crossing_segment_followed_by_small_one",
);
}
#[test]
pub fn test_find_end_of_wal_last_crossing_segment() {
init_logging();
test_end_of_wal::<crate::LastWalRecordCrossingSegment>(
"test_find_end_of_wal_last_crossing_segment",
);
}
/// Check the math in update_next_xid
///
/// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
/// currently 1024.
#[test]
pub fn test_update_next_xid() {
let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
checkpoint.nextXid = FullTransactionId { value: 10 };
assert_eq!(checkpoint.nextXid.value, 10);
// The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
// boundary
checkpoint.update_next_xid(100);
assert_eq!(checkpoint.nextXid.value, 1024);
// No change
checkpoint.update_next_xid(500);
assert_eq!(checkpoint.nextXid.value, 1024);
checkpoint.update_next_xid(1023);
assert_eq!(checkpoint.nextXid.value, 1024);
// The function returns the *next* XID, given the highest XID seen so
// far. So when we pass 1024, the nextXid gets bumped up to the next
// XID_CHECKPOINT_INTERVAL boundary.
checkpoint.update_next_xid(1024);
assert_eq!(checkpoint.nextXid.value, 2048);
}
#[test]
pub fn test_encode_logical_message() {
let expected = [
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
];
let actual = encode_logical_message("prefix", "message");
assert_eq!(expected, actual[..]);
}

View File

@@ -12,7 +12,6 @@ aws-smithy-http.workspace = true
aws-types.workspace = true
aws-config.workspace = true
aws-sdk-s3.workspace = true
aws-credential-types.workspace = true
hyper = { workspace = true, features = ["stream"] }
serde.workspace = true
serde_json.workspace = true

View File

@@ -9,15 +9,14 @@ use std::sync::Arc;
use anyhow::Context;
use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
imds::credentials::ImdsCredentialsProvider,
meta::credentials::{CredentialsProviderChain, LazyCachingCredentialsProvider},
};
use aws_credential_types::cache::CredentialsCache;
use aws_sdk_s3::{
config::{Config, Region},
error::SdkError,
operation::get_object::GetObjectError,
primitives::ByteStream,
Client,
config::Config,
error::{GetObjectError, GetObjectErrorKind},
types::{ByteStream, SdkError},
Client, Endpoint, Region,
};
use aws_smithy_http::body::SdkBody;
use hyper::Body;
@@ -126,23 +125,28 @@ impl S3Bucket {
let credentials_provider = {
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
CredentialsProviderChain::first_try(
"env",
EnvironmentVariableCredentialsProvider::new(),
)
let env_creds = EnvironmentVariableCredentialsProvider::new();
// uses imds v2
.or_else("imds", ImdsCredentialsProvider::builder().build())
let imds = ImdsCredentialsProvider::builder().build();
// finally add caching.
// this might change in future, see https://github.com/awslabs/aws-sdk-rust/issues/629
LazyCachingCredentialsProvider::builder()
.load(CredentialsProviderChain::first_try("env", env_creds).or_else("imds", imds))
.build()
};
let mut config_builder = Config::builder()
.region(Region::new(aws_config.bucket_region.clone()))
.credentials_cache(CredentialsCache::lazy())
.credentials_provider(credentials_provider);
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
config_builder = config_builder
.endpoint_url(custom_endpoint)
.force_path_style(true);
let endpoint = Endpoint::immutable(
custom_endpoint
.parse()
.expect("Failed to parse S3 custom endpoint"),
);
config_builder.set_endpoint_resolver(Some(Arc::new(endpoint)));
}
let client = Client::from_conf(config_builder.build());
@@ -225,9 +229,14 @@ impl S3Bucket {
))),
})
}
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
Err(DownloadError::NotFound)
}
Err(SdkError::ServiceError {
err:
GetObjectError {
kind: GetObjectErrorKind::NoSuchKey(..),
..
},
..
}) => Err(DownloadError::NotFound),
Err(e) => {
metrics::inc_get_object_fail();
Err(DownloadError::Other(anyhow::anyhow!(

View File

@@ -1,33 +0,0 @@
use std::sync::Arc;
use tokio::sync::{mpsc, Mutex};
/// While a reference is kept around, the associated [`Barrier::wait`] will wait.
///
/// Can be cloned, moved and kept around in futures as "guard objects".
#[derive(Clone)]
pub struct Completion(mpsc::Sender<()>);
/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
impl Barrier {
pub async fn wait(self) {
self.0.lock().await.recv().await;
}
pub async fn maybe_wait(barrier: Option<Barrier>) {
if let Some(b) = barrier {
b.wait().await
}
}
}
/// Create new Guard and Barrier pair.
pub fn channel() -> (Completion, Barrier) {
let (tx, rx) = mpsc::channel::<()>(1);
let rx = Mutex::new(rx);
let rx = Arc::new(rx);
(Completion(tx), Barrier(rx))
}

View File

@@ -1,5 +1,5 @@
use crate::auth::{Claims, JwtAuth};
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
use crate::http::error;
use anyhow::{anyhow, Context};
use hyper::header::{HeaderName, AUTHORIZATION};
use hyper::http::HeaderValue;
@@ -16,6 +16,8 @@ use std::future::Future;
use std::net::TcpListener;
use std::str::FromStr;
use super::error::ApiError;
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"libmetrics_metric_handler_requests_total",
@@ -33,18 +35,8 @@ struct RequestId(String);
/// Adds a tracing info_span! instrumentation around the handler events,
/// logs the request start and end events for non-GET requests and non-200 responses.
///
/// Usage: Replace `my_handler` with `|r| request_span(r, my_handler)`
///
/// Use this to distinguish between logs of different HTTP requests: every request handler wrapped
/// with this will get request info logged in the wrapping span, including the unique request ID.
///
/// This also handles errors, logging them and converting them to an HTTP error response.
///
/// NB: If the client disconnects, Hyper will drop the Future, without polling it to
/// completion. In other words, the handler must be async cancellation safe! request_span
/// prints a warning to the log when that happens, so that you have some trace of it in
/// the log.
///
/// in this type will get request info logged in the wrapping span, including the unique request ID.
///
/// There could be other ways to implement similar functionality:
///
@@ -62,56 +54,60 @@ struct RequestId(String);
/// tries to achive with its `.instrument` used in the current approach.
///
/// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output
pub struct RequestSpan<E, R, H>(pub H)
where
R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
H: Fn(Request<Body>) -> R + Send + Sync + 'static;
impl<E, R, H> RequestSpan<E, R, H>
where
E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
H: Fn(Request<Body>) -> R + Send + Sync + 'static,
{
let request_id = request.context::<RequestId>().unwrap_or_default().0;
let method = request.method();
let path = request.uri().path();
let request_span = info_span!("request", %method, %path, %request_id);
/// Creates a tracing span around inner request handler and executes the request handler in the contex of that span.
/// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled.
pub async fn handle(self, request: Request<Body>) -> Result<Response<Body>, E> {
let request_id = request.context::<RequestId>().unwrap_or_default().0;
let method = request.method();
let path = request.uri().path();
let request_span = info_span!("request", %method, %path, %request_id);
let log_quietly = method == Method::GET;
async move {
let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
if log_quietly {
debug!("Handling request");
} else {
info!("Handling request");
}
// No special handling for panics here. There's a `tracing_panic_hook` from another
// module to do that globally.
let res = handler(request).await;
cancellation_guard.disarm();
// Log the result if needed.
//
// We also convert any errors into an Ok response with HTTP error code here.
// `make_router` sets a last-resort error handler that would do the same, but
// we prefer to do it here, before we exit the request span, so that the error
// is still logged with the span.
//
// (Because we convert errors to Ok response, we never actually return an error,
// and we could declare the function to return the never type (`!`). However,
// using `routerify::RouterBuilder` requires a proper error type.)
match res {
Ok(response) => {
let response_status = response.status();
if log_quietly && response_status.is_success() {
debug!("Request handled, status: {response_status}");
} else {
info!("Request handled, status: {response_status}");
}
Ok(response)
let log_quietly = method == Method::GET;
async move {
let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
if log_quietly {
debug!("Handling request");
} else {
info!("Handling request");
}
// Note that we reuse `error::handler` here and not returning and error at all,
// yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation.
// Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call.
//
// Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
let res = (self.0)(request).await;
cancellation_guard.disarm();
match res {
Ok(response) => {
let response_status = response.status();
if log_quietly && response_status.is_success() {
debug!("Request handled, status: {response_status}");
} else {
info!("Request handled, status: {response_status}");
}
Ok(response)
}
Err(e) => Ok(error::handler(e.into()).await),
}
Err(err) => Ok(api_error_handler(err)),
}
.instrument(request_span)
.await
}
.instrument(request_span)
.await
}
/// Drop guard to WARN in case the request was dropped before completion.
@@ -211,8 +207,10 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
.middleware(Middleware::post_with_info(
add_request_id_header_to_response,
))
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
.err_handler(route_error_handler)
.get("/metrics", |r| {
RequestSpan(prometheus_metrics_handler).handle(r)
})
.err_handler(error::handler)
}
pub fn attach_openapi_ui(
@@ -222,14 +220,12 @@ pub fn attach_openapi_ui(
ui_mount_path: &'static str,
) -> RouterBuilder<hyper::Body, ApiError> {
router_builder
.get(spec_mount_path,
move |r| request_span(r, move |_| async move {
Ok(Response::builder().body(Body::from(spec)).unwrap())
})
)
.get(ui_mount_path,
move |r| request_span(r, move |_| async move {
Ok(Response::builder().body(Body::from(format!(r#"
.get(spec_mount_path, move |r| {
RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) })
.handle(r)
})
.get(ui_mount_path, move |r| RequestSpan( move |_| async move {
Ok(Response::builder().body(Body::from(format!(r#"
<!DOCTYPE html>
<html lang="en">
<head>
@@ -259,8 +255,7 @@ pub fn attach_openapi_ui(
</body>
</html>
"#, spec_mount_path))).unwrap())
})
)
}).handle(r))
}
fn parse_token(header_value: &str) -> Result<&str, ApiError> {

View File

@@ -83,24 +83,13 @@ impl HttpErrorBody {
}
}
pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> {
match err.downcast::<ApiError>() {
Ok(api_error) => api_error_handler(*api_error),
Err(other_error) => {
// We expect all the request handlers to return an ApiError, so this should
// not be reached. But just in case.
error!("Error processing HTTP request: {other_error:?}");
HttpErrorBody::response_from_msg_and_status(
other_error.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
)
}
}
}
pub async fn handler(err: routerify::RouteError) -> Response<Body> {
let api_error = err
.downcast::<ApiError>()
.expect("handler should always return api error");
pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
// Print a stack trace for Internal Server errors
if let ApiError::InternalServerError(_) = api_error {
if let ApiError::InternalServerError(_) = api_error.as_ref() {
error!("Error processing HTTP request: {api_error:?}");
} else {
error!("Error processing HTTP request: {api_error:#}");

View File

@@ -8,26 +8,12 @@ use super::error::ApiError;
pub async fn json_request<T: for<'de> Deserialize<'de>>(
request: &mut Request<Body>,
) -> Result<T, ApiError> {
json_request_or_empty_body(request)
.await?
.context("missing request body")
.map_err(ApiError::BadRequest)
}
/// Will be removed as part of https://github.com/neondatabase/neon/issues/4282
pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
request: &mut Request<Body>,
) -> Result<Option<T>, ApiError> {
let body = hyper::body::aggregate(request.body_mut())
let whole_body = hyper::body::aggregate(request.body_mut())
.await
.context("Failed to read request body")
.map_err(ApiError::BadRequest)?;
if body.remaining() == 0 {
return Ok(None);
}
serde_json::from_reader(body.reader())
serde_json::from_reader(whole_body.reader())
.context("Failed to parse json request")
.map(Some)
.map_err(ApiError::BadRequest)
}

View File

@@ -60,9 +60,6 @@ pub mod tracing_span_assert;
pub mod rate_limit;
/// Simple once-barrier and a guard which keeps barrier awaiting.
pub mod completion;
mod failpoint_macro_helpers {
/// use with fail::cfg("$name", "return(2000)")

View File

@@ -144,8 +144,6 @@ where
///
/// This call won't complete until someone has called `advance`
/// with a number greater than or equal to the one we're waiting for.
///
/// This function is async cancellation-safe.
pub async fn wait_for(&self, num: V) -> Result<(), SeqWaitError> {
match self.queue_for_wait(num) {
Ok(None) => Ok(()),
@@ -161,8 +159,6 @@ where
///
/// If that hasn't happened after the specified timeout duration,
/// [`SeqWaitError::Timeout`] will be returned.
///
/// This function is async cancellation-safe.
pub async fn wait_for_timeout(
&self,
num: V,

View File

@@ -33,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
min_lsn = min(min_lsn, lsn_range.start);
max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
updates.insert_historic(Arc::new(layer));
}
println!("min: {min_lsn}, max: {max_lsn}");
@@ -215,7 +215,7 @@ fn bench_sequential(c: &mut Criterion) {
is_incremental: false,
short_id: format!("Layer {}", i),
};
updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
updates.insert_historic(Arc::new(layer));
}
updates.flush();
println!("Finished layer map init in {:?}", now.elapsed());

View File

@@ -1,18 +0,0 @@
[package]
name = "pagectl"
version = "0.1.0"
edition.workspace = true
license.workspace = true
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow.workspace = true
bytes.workspace = true
clap = { workspace = true, features = ["string"] }
git-version.workspace = true
pageserver = { path = ".." }
postgres_ffi.workspace = true
utils.workspace = true
svg_fmt.workspace = true
workspace_hack.workspace = true

View File

@@ -1,169 +0,0 @@
use std::path::{Path, PathBuf};
use anyhow::Result;
use clap::Subcommand;
use pageserver::tenant::block_io::BlockCursor;
use pageserver::tenant::disk_btree::DiskBtreeReader;
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
use pageserver::{page_cache, virtual_file};
use pageserver::{
repository::{Key, KEY_SIZE},
tenant::{
block_io::FileBlockReader, disk_btree::VisitDirection,
storage_layer::delta_layer::DELTA_KEY_SIZE,
},
virtual_file::VirtualFile,
};
use std::fs;
use utils::bin_ser::BeSer;
use crate::layer_map_analyzer::parse_filename;
#[derive(Subcommand)]
pub(crate) enum LayerCmd {
/// List all tenants and timelines under the pageserver path
///
/// Example: `cargo run --bin pagectl layer list .neon/`
List { path: PathBuf },
/// List all layers of a given tenant and timeline
///
/// Example: `cargo run --bin pagectl layer list .neon/`
ListLayer {
path: PathBuf,
tenant: String,
timeline: String,
},
/// Dump all information of a layer file
DumpLayer {
path: PathBuf,
tenant: String,
timeline: String,
/// The id from list-layer command
id: usize,
},
}
fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
use pageserver::tenant::blob_io::BlobCursor;
use pageserver::tenant::block_io::BlockReader;
let path = path.as_ref();
virtual_file::init(10);
page_cache::init(100);
let file = FileBlockReader::new(VirtualFile::open(path)?);
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
actual_summary.index_start_blk,
actual_summary.index_root_blk,
&file,
);
// TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
let mut all = vec![];
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, value_offset| {
let curr = Key::from_slice(&key[..KEY_SIZE]);
all.push((curr, BlobRef(value_offset)));
true
},
)?;
let mut cursor = BlockCursor::new(&file);
for (k, v) in all {
let value = cursor.read_blob(v.pos())?;
println!("key:{} value_len:{}", k, value.len());
}
// TODO(chi): special handling for last key?
Ok(())
}
pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
match cmd {
LayerCmd::List { path } => {
for tenant in fs::read_dir(path.join("tenants"))? {
let tenant = tenant?;
if !tenant.file_type()?.is_dir() {
continue;
}
println!("tenant {}", tenant.file_name().to_string_lossy());
for timeline in fs::read_dir(tenant.path().join("timelines"))? {
let timeline = timeline?;
if !timeline.file_type()?.is_dir() {
continue;
}
println!("- timeline {}", timeline.file_name().to_string_lossy());
}
}
}
LayerCmd::ListLayer {
path,
tenant,
timeline,
} => {
let timeline_path = path
.join("tenants")
.join(tenant)
.join("timelines")
.join(timeline);
let mut idx = 0;
for layer in fs::read_dir(timeline_path)? {
let layer = layer?;
if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
{
println!(
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
idx,
layer_file.key_range.start,
layer_file.key_range.end,
layer_file.lsn_range.start,
layer_file.lsn_range.end,
layer_file.is_delta,
);
idx += 1;
}
}
}
LayerCmd::DumpLayer {
path,
tenant,
timeline,
id,
} => {
let timeline_path = path
.join("tenants")
.join(tenant)
.join("timelines")
.join(timeline);
let mut idx = 0;
for layer in fs::read_dir(timeline_path)? {
let layer = layer?;
if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
{
if *id == idx {
// TODO(chi): dedup code
println!(
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
idx,
layer_file.key_range.start,
layer_file.key_range.end,
layer_file.lsn_range.start,
layer_file.lsn_range.end,
layer_file.is_delta,
);
if layer_file.is_delta {
read_delta_file(layer.path())?;
} else {
anyhow::bail!("not supported yet :(");
}
break;
}
idx += 1;
}
}
}
}
Ok(())
}

View File

@@ -1,179 +0,0 @@
//! A helper tool to manage pageserver binary files.
//! Accepts a file as an argument, attempts to parse it with all ways possible
//! and prints its interpreted context.
//!
//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
mod draw_timeline_dir;
mod layer_map_analyzer;
mod layers;
use clap::{Parser, Subcommand};
use layers::LayerCmd;
use pageserver::{
context::{DownloadBehavior, RequestContext},
page_cache,
task_mgr::TaskKind,
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
virtual_file,
};
use postgres_ffi::ControlFileData;
use std::path::{Path, PathBuf};
use utils::{lsn::Lsn, project_git_version};
project_git_version!(GIT_VERSION);
#[derive(Parser)]
#[command(
version = GIT_VERSION,
about = "Neon Pageserver binutils",
long_about = "Reads pageserver (and related) binary files management utility"
)]
#[command(propagate_version = true)]
struct CliOpts {
#[command(subcommand)]
command: Commands,
}
#[derive(Subcommand)]
enum Commands {
Metadata(MetadataCmd),
PrintLayerFile(PrintLayerFileCmd),
DrawTimeline {},
AnalyzeLayerMap(AnalyzeLayerMapCmd),
#[command(subcommand)]
Layer(LayerCmd),
}
/// Read and update pageserver metadata file
#[derive(Parser)]
struct MetadataCmd {
/// Input metadata file path
metadata_path: PathBuf,
/// Replace disk consistent Lsn
disk_consistent_lsn: Option<Lsn>,
/// Replace previous record Lsn
prev_record_lsn: Option<Lsn>,
/// Replace latest gc cuttoff
latest_gc_cuttoff: Option<Lsn>,
}
#[derive(Parser)]
struct PrintLayerFileCmd {
/// Pageserver data path
path: PathBuf,
}
#[derive(Parser)]
struct AnalyzeLayerMapCmd {
/// Pageserver data path
path: PathBuf,
/// Max holes
max_holes: Option<usize>,
}
fn main() -> anyhow::Result<()> {
let cli = CliOpts::parse();
match cli.command {
Commands::Layer(cmd) => {
layers::main(&cmd)?;
}
Commands::Metadata(cmd) => {
handle_metadata(&cmd)?;
}
Commands::DrawTimeline {} => {
draw_timeline_dir::main()?;
}
Commands::AnalyzeLayerMap(cmd) => {
layer_map_analyzer::main(&cmd)?;
}
Commands::PrintLayerFile(cmd) => {
if let Err(e) = read_pg_control_file(&cmd.path) {
println!(
"Failed to read input file as a pg control one: {e:#}\n\
Attempting to read it as layer file"
);
print_layerfile(&cmd.path)?;
}
}
};
Ok(())
}
fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
println!("{control_file:?}");
let control_file_initdb = Lsn(control_file.checkPoint);
println!(
"pg_initdb_lsn: {}, aligned: {}",
control_file_initdb,
control_file_initdb.align()
);
Ok(())
}
fn print_layerfile(path: &Path) -> anyhow::Result<()> {
// Basic initialization of things that don't change after startup
virtual_file::init(10);
page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
dump_layerfile_from_path(path, true, &ctx)
}
fn handle_metadata(
MetadataCmd {
metadata_path: path,
disk_consistent_lsn,
prev_record_lsn,
latest_gc_cuttoff,
}: &MetadataCmd,
) -> Result<(), anyhow::Error> {
let metadata_bytes = std::fs::read(path)?;
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
println!("Current metadata:\n{meta:?}");
let mut update_meta = false;
if let Some(disk_consistent_lsn) = disk_consistent_lsn {
meta = TimelineMetadata::new(
*disk_consistent_lsn,
meta.prev_record_lsn(),
meta.ancestor_timeline(),
meta.ancestor_lsn(),
meta.latest_gc_cutoff_lsn(),
meta.initdb_lsn(),
meta.pg_version(),
);
update_meta = true;
}
if let Some(prev_record_lsn) = prev_record_lsn {
meta = TimelineMetadata::new(
meta.disk_consistent_lsn(),
Some(*prev_record_lsn),
meta.ancestor_timeline(),
meta.ancestor_lsn(),
meta.latest_gc_cutoff_lsn(),
meta.initdb_lsn(),
meta.pg_version(),
);
update_meta = true;
}
if let Some(latest_gc_cuttoff) = latest_gc_cuttoff {
meta = TimelineMetadata::new(
meta.disk_consistent_lsn(),
meta.prev_record_lsn(),
meta.ancestor_timeline(),
meta.ancestor_lsn(),
*latest_gc_cuttoff,
meta.initdb_lsn(),
meta.pg_version(),
);
update_meta = true;
}
if update_meta {
let metadata_bytes = meta.to_bytes()?;
std::fs::write(path, metadata_bytes)?;
}
Ok(())
}

View File

@@ -12,7 +12,7 @@
//! Example use:
//! ```
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
//! $ grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
//! $ firefox out.svg
//! ```
//!
@@ -62,7 +62,7 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
(keys, lsns)
}
pub fn main() -> Result<()> {
fn main() -> Result<()> {
// Parse layer filenames from stdin
let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
let stdin = io::stdin();

View File

@@ -6,7 +6,7 @@ use anyhow::Result;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::ops::Range;
use std::{fs, path::Path, str};
use std::{env, fs, path::Path, path::PathBuf, str, str::FromStr};
use pageserver::page_cache::PAGE_SZ;
use pageserver::repository::{Key, KEY_SIZE};
@@ -18,14 +18,12 @@ use pageserver::virtual_file::VirtualFile;
use utils::{bin_ser::BeSer, lsn::Lsn};
use crate::AnalyzeLayerMapCmd;
const MIN_HOLE_LENGTH: i128 = (128 * 1024 * 1024 / PAGE_SZ) as i128;
const DEFAULT_MAX_HOLES: usize = 10;
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
#[derive(PartialEq, Eq)]
pub struct Hole(Range<Key>);
struct Hole(Range<Key>);
impl Ord for Hole {
fn cmp(&self, other: &Self) -> Ordering {
@@ -41,11 +39,11 @@ impl PartialOrd for Hole {
}
}
pub(crate) struct LayerFile {
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
pub is_delta: bool,
pub holes: Vec<Hole>,
struct LayerFile {
key_range: Range<Key>,
lsn_range: Range<Lsn>,
is_delta: bool,
holes: Vec<Hole>,
}
impl LayerFile {
@@ -69,7 +67,7 @@ impl LayerFile {
}
}
pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
fn parse_filename(name: &str) -> Option<LayerFile> {
let split: Vec<&str> = name.split("__").collect();
if split.len() != 2 {
return None;
@@ -129,9 +127,18 @@ fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
Ok(holes)
}
pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
let storage_path = &cmd.path;
let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
fn main() -> Result<()> {
let args: Vec<String> = env::args().collect();
if args.len() < 2 {
println!("Usage: layer_map_analyzer PAGESERVER_DATA_DIR [MAX_HOLES]");
return Ok(());
}
let storage_path = PathBuf::from_str(&args[1])?;
let max_holes = if args.len() > 2 {
args[2].parse::<usize>().unwrap()
} else {
DEFAULT_MAX_HOLES
};
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
pageserver::virtual_file::init(10);

View File

@@ -9,7 +9,6 @@ use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use remote_storage::GenericRemoteStorage;
use tracing::*;
@@ -19,7 +18,9 @@ use pageserver::{
context::{DownloadBehavior, RequestContext},
http, page_cache, page_service, task_mgr,
task_mgr::TaskKind,
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
task_mgr::{
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
},
tenant::mgr,
virtual_file,
};
@@ -275,18 +276,7 @@ fn start_pageserver(
let pageserver_listener = tcp_listener::bind(pg_addr)?;
// Launch broker client
// The storage_broker::connect call needs to happen inside a tokio runtime thread.
let broker_client = WALRECEIVER_RUNTIME
.block_on(async {
// Note: we do not attempt connecting here (but validate endpoints sanity).
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)
})
.with_context(|| {
format!(
"create broker client for uri={:?} keepalive_interval={:?}",
&conf.broker_endpoint, conf.broker_keepalive_interval,
)
})?;
WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?;
// Initialize authentication for incoming connections
let http_auth;
@@ -335,118 +325,8 @@ fn start_pageserver(
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
// Startup staging or optimizing:
//
// We want to minimize downtime for `page_service` connections, and trying not to overload
// BACKGROUND_RUNTIME by doing initial compactions and initial logical sizes at the same time.
//
// init_done_rx will notify when all initial load operations have completed.
//
// background_jobs_can_start (same name used to hold off background jobs from starting at
// consumer side) will be dropped once we can start the background jobs. Currently it is behind
// completing all initial logical size calculations (init_logical_size_done_rx) and a timeout
// (background_task_maximum_delay).
let (init_done_tx, init_done_rx) = utils::completion::channel();
let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
let order = pageserver::InitializationOrder {
initial_tenant_load: Some(init_done_tx),
initial_logical_size_can_start: init_done_rx.clone(),
initial_logical_size_attempt: init_logical_size_done_tx,
background_jobs_can_start: background_jobs_barrier.clone(),
};
// Scan the local 'tenants/' directory and start loading the tenants
let init_started_at = std::time::Instant::now();
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
conf,
broker_client.clone(),
remote_storage.clone(),
order,
))?;
BACKGROUND_RUNTIME.spawn({
let init_done_rx = init_done_rx;
let shutdown_pageserver = shutdown_pageserver.clone();
let drive_init = async move {
// NOTE: unlike many futures in pageserver, this one is cancellation-safe
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
init_done_rx.wait().await;
// initial logical sizes can now start, as they were waiting on init_done_rx.
scopeguard::ScopeGuard::into_inner(guard);
let init_done = std::time::Instant::now();
let elapsed = init_done - init_started_at;
tracing::info!(
elapsed_millis = elapsed.as_millis(),
"Initial load completed"
);
let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
let timeout = conf.background_task_maximum_delay;
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
let init_sizes_done = tokio::select! {
_ = &mut init_sizes_done => {
let now = std::time::Instant::now();
tracing::info!(
from_init_done_millis = (now - init_done).as_millis(),
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed"
);
None
}
_ = tokio::time::sleep(timeout) => {
tracing::info!(
timeout_millis = timeout.as_millis(),
"Initial logical size timeout elapsed; starting background jobs"
);
Some(init_sizes_done)
}
};
scopeguard::ScopeGuard::into_inner(guard);
// allow background jobs to start
drop(background_jobs_can_start);
if let Some(init_sizes_done) = init_sizes_done {
// ending up here is not a bug; at the latest logical sizes will be queried by
// consumption metrics.
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
init_sizes_done.await;
scopeguard::ScopeGuard::into_inner(guard);
let now = std::time::Instant::now();
tracing::info!(
from_init_done_millis = (now - init_done).as_millis(),
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed after timeout (background jobs already started)"
);
}
};
async move {
let mut drive_init = std::pin::pin!(drive_init);
// just race these tasks
tokio::select! {
_ = shutdown_pageserver.cancelled() => {},
_ = &mut drive_init => {},
}
}
});
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
// shared state between the disk-usage backed eviction background task and the http endpoint
// that allows triggering disk-usage based eviction manually. note that the http endpoint
@@ -459,7 +339,6 @@ fn start_pageserver(
conf,
remote_storage.clone(),
disk_usage_eviction_state.clone(),
background_jobs_barrier.clone(),
)?;
}
@@ -472,7 +351,6 @@ fn start_pageserver(
conf,
launch_ts,
http_auth,
broker_client.clone(),
remote_storage,
disk_usage_eviction_state,
)?
@@ -497,7 +375,6 @@ fn start_pageserver(
);
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
let background_jobs_barrier = background_jobs_barrier;
let metrics_ctx = RequestContext::todo_child(
TaskKind::MetricsCollection,
// This task itself shouldn't download anything.
@@ -513,18 +390,6 @@ fn start_pageserver(
"consumption metrics collection",
true,
async move {
// first wait until background jobs are cleared to launch.
//
// this is because we only process active tenants and timelines, and the
// Timeline::get_current_logical_size will spawn the logical size calculation,
// which will not be rate-limited.
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = background_jobs_barrier.wait() => {}
};
pageserver::consumption_metrics::collect_metrics(
metric_collection_endpoint,
conf.metric_collection_interval,
@@ -562,7 +427,6 @@ fn start_pageserver(
async move {
page_service::libpq_listener_main(
conf,
broker_client,
pg_auth,
pageserver_listener,
conf.pg_auth_type,
@@ -573,8 +437,6 @@ fn start_pageserver(
);
}
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
// All started up! Now just sit and wait for shutdown signal.
ShutdownSignals::handle(|signal| match signal {
Signal::Quit => {
@@ -590,11 +452,6 @@ fn start_pageserver(
"Got {}. Terminating gracefully in fast shutdown mode",
signal.name()
);
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
unreachable!()
}

View File

@@ -0,0 +1,157 @@
//! A helper tool to manage pageserver binary files.
//! Accepts a file as an argument, attempts to parse it with all ways possible
//! and prints its interpreted context.
//!
//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
use std::{
path::{Path, PathBuf},
str::FromStr,
};
use anyhow::Context;
use clap::{value_parser, Arg, Command};
use pageserver::{
context::{DownloadBehavior, RequestContext},
page_cache,
task_mgr::TaskKind,
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
virtual_file,
};
use postgres_ffi::ControlFileData;
use utils::{lsn::Lsn, project_git_version};
project_git_version!(GIT_VERSION);
const METADATA_SUBCOMMAND: &str = "metadata";
fn main() -> anyhow::Result<()> {
let arg_matches = cli().get_matches();
match arg_matches.subcommand() {
Some((subcommand_name, subcommand_matches)) => {
let path = subcommand_matches
.get_one::<PathBuf>("metadata_path")
.context("'metadata_path' argument is missing")?
.to_path_buf();
anyhow::ensure!(
subcommand_name == METADATA_SUBCOMMAND,
"Unknown subcommand {subcommand_name}"
);
handle_metadata(&path, subcommand_matches)?;
}
None => {
let path = arg_matches
.get_one::<PathBuf>("path")
.context("'path' argument is missing")?
.to_path_buf();
println!(
"No subcommand specified, attempting to guess the format for file {}",
path.display()
);
if let Err(e) = read_pg_control_file(&path) {
println!(
"Failed to read input file as a pg control one: {e:#}\n\
Attempting to read it as layer file"
);
print_layerfile(&path)?;
}
}
};
Ok(())
}
fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
println!("{control_file:?}");
let control_file_initdb = Lsn(control_file.checkPoint);
println!(
"pg_initdb_lsn: {}, aligned: {}",
control_file_initdb,
control_file_initdb.align()
);
Ok(())
}
fn print_layerfile(path: &Path) -> anyhow::Result<()> {
// Basic initialization of things that don't change after startup
virtual_file::init(10);
page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
dump_layerfile_from_path(path, true, &ctx)
}
fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> {
let metadata_bytes = std::fs::read(path)?;
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
println!("Current metadata:\n{meta:?}");
let mut update_meta = false;
if let Some(disk_consistent_lsn) = arg_matches.get_one::<String>("disk_consistent_lsn") {
meta = TimelineMetadata::new(
Lsn::from_str(disk_consistent_lsn)?,
meta.prev_record_lsn(),
meta.ancestor_timeline(),
meta.ancestor_lsn(),
meta.latest_gc_cutoff_lsn(),
meta.initdb_lsn(),
meta.pg_version(),
);
update_meta = true;
}
if let Some(prev_record_lsn) = arg_matches.get_one::<String>("prev_record_lsn") {
meta = TimelineMetadata::new(
meta.disk_consistent_lsn(),
Some(Lsn::from_str(prev_record_lsn)?),
meta.ancestor_timeline(),
meta.ancestor_lsn(),
meta.latest_gc_cutoff_lsn(),
meta.initdb_lsn(),
meta.pg_version(),
);
update_meta = true;
}
if update_meta {
let metadata_bytes = meta.to_bytes()?;
std::fs::write(path, metadata_bytes)?;
}
Ok(())
}
fn cli() -> Command {
Command::new("Neon Pageserver binutils")
.about("Reads pageserver (and related) binary files management utility")
.version(GIT_VERSION)
.arg(
Arg::new("path")
.help("Input file path")
.value_parser(value_parser!(PathBuf))
.required(false),
)
.subcommand(
Command::new(METADATA_SUBCOMMAND)
.about("Read and update pageserver metadata file")
.arg(
Arg::new("metadata_path")
.help("Input metadata file path")
.value_parser(value_parser!(PathBuf))
.required(false),
)
.arg(
Arg::new("disk_consistent_lsn")
.long("disk_consistent_lsn")
.help("Replace disk consistent Lsn"),
)
.arg(
Arg::new("prev_record_lsn")
.long("prev_record_lsn")
.help("Replace previous record Lsn"),
),
)
}
#[test]
fn verify_cli() {
cli().debug_assert();
}

View File

@@ -0,0 +1,48 @@
//! The broker client instance of the pageserver, created during pageserver startup.
//! Used by each timelines' [`walreceiver`].
use crate::config::PageServerConf;
use anyhow::Context;
use once_cell::sync::OnceCell;
use storage_broker::BrokerClientChannel;
use tracing::*;
static BROKER_CLIENT: OnceCell<BrokerClientChannel> = OnceCell::new();
///
/// Initialize the broker client. This must be called once at page server startup.
///
pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
let broker_endpoint = conf.broker_endpoint.clone();
// Note: we do not attempt connecting here (but validate endpoints sanity).
let broker_client =
storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context(
format!(
"Failed to create broker client to {}",
&conf.broker_endpoint
),
)?;
if BROKER_CLIENT.set(broker_client).is_err() {
panic!("broker already initialized");
}
info!(
"Initialized broker client with endpoints: {}",
broker_endpoint
);
Ok(())
}
///
/// Get a handle to the broker client
///
pub fn get_broker_client() -> &'static BrokerClientChannel {
BROKER_CLIENT.get().expect("broker client not initialized")
}
pub fn is_broker_client_initialized() -> bool {
BROKER_CLIENT.get().is_some()
}

View File

@@ -63,7 +63,6 @@ pub mod defaults {
pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
///
/// Default built-in configuration file.
@@ -92,9 +91,8 @@ pub mod defaults {
#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
# [tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
@@ -110,7 +108,7 @@ pub mod defaults {
#min_resident_size_override = .. # in bytes
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
#gc_feedback = false
# [remote_storage]
"###
@@ -189,15 +187,6 @@ pub struct PageServerConf {
pub test_remote_failures: u64,
pub ondemand_download_behavior_treat_error_as_warn: bool,
/// How long will background tasks be delayed at most after initial load of tenants.
///
/// Our largest initialization completions are in the range of 100-200s, so perhaps 10s works
/// as we now isolate initial loading, initial logical size calculation and background tasks.
/// Smaller nodes will have background tasks "not running" for this long unless every timeline
/// has it's initial logical size calculated. Not running background tasks for some seconds is
/// not terrible.
pub background_task_maximum_delay: Duration,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -270,8 +259,6 @@ struct PageServerConfigBuilder {
test_remote_failures: BuilderValue<u64>,
ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
background_task_maximum_delay: BuilderValue<Duration>,
}
impl Default for PageServerConfigBuilder {
@@ -329,11 +316,6 @@ impl Default for PageServerConfigBuilder {
test_remote_failures: Set(0),
ondemand_download_behavior_treat_error_as_warn: Set(false),
background_task_maximum_delay: Set(humantime::parse_duration(
DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
)
.unwrap()),
}
}
}
@@ -458,10 +440,6 @@ impl PageServerConfigBuilder {
BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn);
}
pub fn background_task_maximum_delay(&mut self, delay: Duration) {
self.background_task_maximum_delay = BuilderValue::Set(delay);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_size_logical_size_queries = self
.concurrent_tenant_size_logical_size_queries
@@ -544,9 +522,6 @@ impl PageServerConfigBuilder {
.ok_or(anyhow!(
"missing ondemand_download_behavior_treat_error_as_warn"
))?,
background_task_maximum_delay: self
.background_task_maximum_delay
.ok_or(anyhow!("missing background_task_maximum_delay"))?,
})
}
}
@@ -735,7 +710,6 @@ impl PageServerConf {
)
},
"ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
"background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -823,8 +797,7 @@ impl PageServerConf {
)?);
}
if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
t_conf.max_lsn_wal_lag =
Some(deserialize_from_item("max_lsn_wal_lag", max_lsn_wal_lag)?);
t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?);
}
if let Some(trace_read_requests) = item.get("trace_read_requests") {
t_conf.trace_read_requests =
@@ -854,14 +827,6 @@ impl PageServerConf {
)?);
}
if let Some(gc_feedback) = item.get("gc_feedback") {
t_conf.gc_feedback = Some(
gc_feedback
.as_bool()
.with_context(|| "configure option gc_feedback is not a bool".to_string())?,
);
}
Ok(t_conf)
}
@@ -903,7 +868,6 @@ impl PageServerConf {
disk_usage_based_eviction: None,
test_remote_failures: 0,
ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: Duration::ZERO,
}
}
}
@@ -1063,7 +1027,6 @@ metric_collection_endpoint = 'http://localhost:80/metrics'
synthetic_size_calculation_interval = '333 s'
log_format = 'json'
background_task_maximum_delay = '334 s'
"#;
@@ -1122,9 +1085,6 @@ background_task_maximum_delay = '334 s'
disk_usage_based_eviction: None,
test_remote_failures: 0,
ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: humantime::parse_duration(
defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
)?,
},
"Correct defaults should be used when no config values are provided"
);
@@ -1179,7 +1139,6 @@ background_task_maximum_delay = '334 s'
disk_usage_based_eviction: None,
test_remote_failures: 0,
ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: Duration::from_secs(334),
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -88,7 +88,6 @@
use crate::task_mgr::TaskKind;
// The main structure of this module, see module-level comment.
#[derive(Clone, Debug)]
pub struct RequestContext {
task_kind: TaskKind,
download_behavior: DownloadBehavior,
@@ -96,7 +95,7 @@ pub struct RequestContext {
/// Desired behavior if the operation requires an on-demand download
/// to proceed.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[derive(Clone, Copy, PartialEq, Eq)]
pub enum DownloadBehavior {
/// Download the layer file. It can take a while.
Download,

View File

@@ -54,7 +54,6 @@ use serde::{Deserialize, Serialize};
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, instrument, warn, Instrument};
use utils::completion;
use utils::serde_percent::Percent;
use crate::{
@@ -83,7 +82,6 @@ pub fn launch_disk_usage_global_eviction_task(
conf: &'static PageServerConf,
storage: GenericRemoteStorage,
state: Arc<State>,
background_jobs_barrier: completion::Barrier,
) -> anyhow::Result<()> {
let Some(task_config) = &conf.disk_usage_based_eviction else {
info!("disk usage based eviction task not configured");
@@ -100,16 +98,14 @@ pub fn launch_disk_usage_global_eviction_task(
"disk usage based eviction",
false,
async move {
let cancel = task_mgr::shutdown_token();
// wait until initial load is complete, because we cannot evict from loading tenants.
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = background_jobs_barrier.wait() => { }
};
disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
.await;
disk_usage_eviction_task(
&state,
task_config,
storage,
&conf.tenants_path(),
task_mgr::shutdown_token(),
)
.await;
info!("disk usage based eviction task finishing");
Ok(())
},

View File

@@ -363,29 +363,11 @@ paths:
* MUST NOT ASSUME that the request has been lost, based on the observation
that a subsequent tenant status request returns 404. The request may
still be in flight. It must be retried.
The client SHOULD supply a `TenantConfig` for the tenant in the request body.
Settings specified in the config override the pageserver's defaults.
It is guaranteed that the config settings are applied before the pageserver
starts operating on the tenant. E.g., if the config specifies a specific
PITR interval for a tenant, then that setting will be in effect before the
pageserver starts the garbage collection loop. This enables a client to
guarantee a specific PITR setting across detach/attach cycles.
The pageserver will reject the request if it cannot parse the config, or
if there are any unknown fields in it.
If the client does not supply a config, the pageserver will use its defaults.
This behavior is deprecated: https://github.com/neondatabase/neon/issues/4282
requestBody:
required: false
content:
application/json:
schema:
$ref: "#/components/schemas/TenantAttachRequest"
responses:
"202":
description: Tenant attaching scheduled
"400":
description: Error when no tenant id found in path parameters
content:
application/json:
schema:
@@ -678,8 +660,6 @@ paths:
application/json:
schema:
type: object
required:
- new_timeline_id
properties:
new_timeline_id:
type: string
@@ -761,16 +741,13 @@ paths:
$ref: "#/components/schemas/Error"
post:
description: |
Create a tenant. Returns new tenant id on success.
Create a tenant. Returns new tenant id on success.\
If no new tenant id is specified in parameters, it would be generated. It's an error to recreate the same tenant.
Invalid fields in the tenant config will cause the request to be rejected with status 400.
requestBody:
content:
application/json:
schema:
$ref: "#/components/schemas/TenantCreateRequest"
$ref: "#/components/schemas/TenantCreateInfo"
responses:
"201":
description: New tenant created successfully
@@ -813,13 +790,11 @@ paths:
put:
description: |
Update tenant's config.
Invalid fields in the tenant config will cause the request to be rejected with status 400.
requestBody:
content:
application/json:
schema:
$ref: "#/components/schemas/TenantConfigRequest"
$ref: "#/components/schemas/TenantConfigInfo"
responses:
"200":
description: OK
@@ -871,7 +846,7 @@ paths:
content:
application/json:
schema:
$ref: "#/components/schemas/TenantConfigResponse"
$ref: "#/components/schemas/TenantConfig"
"400":
description: Malformed get tenanant config request
content:
@@ -928,58 +903,41 @@ components:
writing to the tenant's S3 state, so, DO NOT ATTACH the
tenant to any other pageserver, or we risk split-brain.
- `attached` means that the attach operation has completed,
successfully
- `failed` means that attach has failed. For reason check corresponding `reason` failed.
`failed` is the terminal state, retrying attach call wont resolve the issue.
For example this can be caused by s3 being unreachable. The retry may be implemented
with call to detach, though it would be better to not automate it and inspec failed state
manually before proceeding with a retry.
maybe successfully, maybe not. Perform a health check at
the Postgres level to determine healthiness of the tenant.
See the tenant `/attach` endpoint for more information.
type: object
required:
- slug
- data
properties:
slug:
type: string
enum: [ "maybe", "attached", "failed" ]
data:
- type: object
properties:
reason:
type: string
TenantCreateRequest:
allOf:
- $ref: '#/components/schemas/TenantConfig'
- type: object
required:
- new_tenant_id
properties:
new_tenant_id:
type: string
format: hex
TenantAttachRequest:
type: object
required:
- config
properties:
config:
$ref: '#/components/schemas/TenantConfig'
TenantConfigRequest:
allOf:
- $ref: '#/components/schemas/TenantConfig'
- type: object
required:
- tenant_id
properties:
tenant_id:
type: string
format: hex
TenantConfig:
type: string
enum: [ "maybe", "attached" ]
TenantCreateInfo:
type: object
properties:
new_tenant_id:
type: string
format: hex
tenant_id:
type: string
format: hex
gc_period:
type: string
gc_horizon:
type: integer
pitr_interval:
type: string
checkpoint_distance:
type: integer
checkpoint_timeout:
type: string
compaction_period:
type: string
compaction_threshold:
type: string
TenantConfigInfo:
type: object
properties:
tenant_id:
type: string
format: hex
gc_period:
type: string
gc_horizon:
@@ -1006,13 +964,13 @@ components:
type: integer
trace_read_requests:
type: boolean
TenantConfigResponse:
TenantConfig:
type: object
properties:
tenant_specific_overrides:
$ref: "#/components/schemas/TenantConfig"
$ref: "#/components/schemas/TenantConfigInfo"
effective_config:
$ref: "#/components/schemas/TenantConfig"
$ref: "#/components/schemas/TenantConfigInfo"
TimelineInfo:
type: object
required:

View File

@@ -1,6 +1,3 @@
//!
//! Management HTTP API
//!
use std::collections::HashMap;
use std::sync::Arc;
@@ -8,14 +5,12 @@ use anyhow::{anyhow, Context, Result};
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest};
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
use remote_storage::GenericRemoteStorage;
use storage_broker::BrokerClientChannel;
use tenant_size_model::{SizeResult, StorageModel};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::http::endpoint::request_span;
use utils::http::json::json_request_or_empty_body;
use utils::http::endpoint::RequestSpan;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
use super::models::{
@@ -24,13 +19,10 @@ use super::models::{
};
use crate::context::{DownloadBehavior, RequestContext};
use crate::disk_usage_eviction_task;
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
};
use crate::tenant::mgr::{TenantMapInsertError, TenantStateError};
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
@@ -49,6 +41,7 @@ use utils::{
};
// Imports only used for testing APIs
#[cfg(feature = "testing")]
use super::models::ConfigureFailpointsRequest;
struct State {
@@ -56,7 +49,6 @@ struct State {
auth: Option<Arc<JwtAuth>>,
allowlist_routes: Vec<Uri>,
remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
}
@@ -65,7 +57,6 @@ impl State {
conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
) -> anyhow::Result<Self> {
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
@@ -77,7 +68,6 @@ impl State {
auth,
allowlist_routes,
remote_storage,
broker_client,
disk_usage_eviction_state,
})
}
@@ -148,36 +138,6 @@ impl From<TenantStateError> for ApiError {
}
}
impl From<GetTenantError> for ApiError {
fn from(tse: GetTenantError) -> ApiError {
match tse {
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
e @ GetTenantError::NotActive(_) => {
// Why is this not `ApiError::NotFound`?
// Because we must be careful to never return 404 for a tenant if it does
// in fact exist locally. If we did, the caller could draw the conclusion
// that it can attach the tenant to another PS and we'd be in split-brain.
//
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
ApiError::InternalServerError(anyhow::Error::new(e))
}
}
}
}
impl From<SetNewTenantConfigError> for ApiError {
fn from(e: SetNewTenantConfigError) -> ApiError {
match e {
SetNewTenantConfigError::GetTenant(tid) => {
ApiError::NotFound(anyhow!("tenant {}", tid))
}
e @ SetNewTenantConfigError::Persist(_) => {
ApiError::InternalServerError(anyhow::Error::new(e))
}
}
}
}
impl From<crate::tenant::DeleteTimelineError> for ApiError {
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
use crate::tenant::DeleteTimelineError::*;
@@ -197,7 +157,7 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
match value {
// Report Precondition failed so client can distinguish between
// "tenant is missing" case from "timeline is missing"
Tenant(GetTenantError::NotFound(..)) => {
Tenant(TenantStateError::NotFound(..)) => {
ApiError::PreconditionFailed("Requested tenant is missing")
}
Tenant(t) => ApiError::from(t),
@@ -292,29 +252,23 @@ fn build_timeline_info_common(
}
// healthcheck handler
async fn status_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let config = get_config(&request);
json_response(StatusCode::OK, StatusResponse { id: config.id })
}
async fn timeline_create_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let request_data: TimelineCreateRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_id))?;
let new_timeline_id = request_data.new_timeline_id;
let new_timeline_id = request_data
.new_timeline_id
.unwrap_or_else(TimelineId::generate);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
let state = get_state(&request);
async {
let tenant = mgr::get_tenant(tenant_id, true).await?;
match tenant.create_timeline(
@@ -322,7 +276,6 @@ async fn timeline_create_handler(
request_data.ancestor_timeline_id.map(TimelineId::from),
request_data.ancestor_start_lsn,
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
state.broker_client.clone(),
&ctx,
)
.await {
@@ -336,14 +289,11 @@ async fn timeline_create_handler(
Err(err) => Err(ApiError::InternalServerError(err)),
}
}
.instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
.instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
.await
}
async fn timeline_list_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let include_non_incremental_logical_size: Option<bool> =
parse_query_param(&request, "include-non-incremental-logical-size")?;
@@ -377,10 +327,7 @@ async fn timeline_list_handler(
json_response(StatusCode::OK, response_data)
}
async fn timeline_detail_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let include_non_incremental_logical_size: Option<bool> =
@@ -414,10 +361,7 @@ async fn timeline_detail_handler(
json_response(StatusCode::OK, timeline_info)
}
async fn get_lsn_by_timestamp_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -441,19 +385,11 @@ async fn get_lsn_by_timestamp_handler(
json_response(StatusCode::OK, result)
}
async fn tenant_attach_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
// TODO makes sense to provide tenant config right away the same way as it handled in tenant_create
async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let maybe_body: Option<TenantAttachRequest> = json_request_or_empty_body(&mut request).await?;
let tenant_conf = match maybe_body {
Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?,
None => TenantConfOpt::default(),
};
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
info!("Handling tenant attach {tenant_id}");
@@ -461,16 +397,9 @@ async fn tenant_attach_handler(
let state = get_state(&request);
if let Some(remote_storage) = &state.remote_storage {
mgr::attach_tenant(
state.conf,
tenant_id,
tenant_conf,
state.broker_client.clone(),
remote_storage.clone(),
&ctx,
)
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
.await?;
mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx)
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
.await?;
} else {
return Err(ApiError::BadRequest(anyhow!(
"attach_tenant is not possible because pageserver was configured without remote storage"
@@ -480,10 +409,7 @@ async fn tenant_attach_handler(
json_response(StatusCode::ACCEPTED, ())
}
async fn timeline_delete_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -497,10 +423,7 @@ async fn timeline_delete_handler(
json_response(StatusCode::OK, ())
}
async fn tenant_detach_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
@@ -514,33 +437,21 @@ async fn tenant_detach_handler(
json_response(StatusCode::OK, ())
}
async fn tenant_load_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let state = get_state(&request);
mgr::load_tenant(
state.conf,
tenant_id,
state.broker_client.clone(),
state.remote_storage.clone(),
&ctx,
)
.instrument(info_span!("load", tenant = %tenant_id))
.await?;
mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx)
.instrument(info_span!("load", tenant = %tenant_id))
.await?;
json_response(StatusCode::ACCEPTED, ())
}
async fn tenant_ignore_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -553,10 +464,7 @@ async fn tenant_ignore_handler(
json_response(StatusCode::OK, ())
}
async fn tenant_list_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let response_data = mgr::list_tenants()
@@ -576,10 +484,7 @@ async fn tenant_list_handler(
json_response(StatusCode::OK, response_data)
}
async fn tenant_status(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -593,7 +498,7 @@ async fn tenant_status(
}
let state = tenant.current_state();
Result::<_, ApiError>::Ok(TenantInfo {
Ok(TenantInfo {
id: tenant_id,
state: state.clone(),
current_physical_size: Some(current_physical_size),
@@ -601,7 +506,8 @@ async fn tenant_status(
})
}
.instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
.await?;
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, tenant_info)
}
@@ -619,10 +525,7 @@ async fn tenant_status(
/// Note: we don't update the cached size and prometheus metric here.
/// The retention period might be different, and it's nice to have a method to just calculate it
/// without modifying anything anyway.
async fn tenant_size_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
@@ -687,10 +590,7 @@ async fn tenant_size_handler(
)
}
async fn layer_map_info_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn layer_map_info_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let reset: LayerAccessStatsReset =
@@ -704,10 +604,7 @@ async fn layer_map_info_handler(
json_response(StatusCode::OK, layer_map_info)
}
async fn layer_download_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn layer_download_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -730,10 +627,7 @@ async fn layer_download_handler(
}
}
async fn evict_timeline_layer_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn evict_timeline_layer_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -811,31 +705,26 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
Ok(response)
}
async fn tenant_create_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let request_data: TenantCreateRequest = json_request(&mut request).await?;
let target_tenant_id = request_data.new_tenant_id;
async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let _timer = STORAGE_TIME_GLOBAL
.get_metric_with_label_values(&[StorageTimeOperation::CreateTenant.into()])
.expect("bug")
.start_timer();
let tenant_conf =
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let request_data: TenantCreateRequest = json_request(&mut request).await?;
let tenant_conf = TenantConfOpt::try_from(&request_data).map_err(ApiError::BadRequest)?;
let target_tenant_id = request_data
.new_tenant_id
.map(TenantId::from)
.unwrap_or_else(TenantId::generate);
let state = get_state(&request);
let new_tenant = mgr::create_tenant(
state.conf,
tenant_conf,
target_tenant_id,
state.broker_client.clone(),
state.remote_storage.clone(),
&ctx,
)
@@ -854,17 +743,13 @@ async fn tenant_create_handler(
res.context("created tenant failed to become active")
.map_err(ApiError::InternalServerError)?;
}
json_response(
StatusCode::CREATED,
TenantCreateResponse(new_tenant.tenant_id()),
)
}
async fn get_tenant_config_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn get_tenant_config_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -890,14 +775,12 @@ async fn get_tenant_config_handler(
async fn update_tenant_config_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let request_data: TenantConfigRequest = json_request(&mut request).await?;
let tenant_id = request_data.tenant_id;
check_permission(&request, Some(tenant_id))?;
let tenant_conf =
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
let tenant_conf = TenantConfOpt::try_from(&request_data).map_err(ApiError::BadRequest)?;
let state = get_state(&request);
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
@@ -908,25 +791,21 @@ async fn update_tenant_config_handler(
}
/// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
async fn handle_tenant_break(
r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
#[cfg(feature = "testing")]
async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
.await
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
tenant.set_broken("broken from test".to_owned()).await;
tenant.set_broken("broken from test".to_owned());
json_response(StatusCode::OK, ())
}
async fn failpoints_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
#[cfg(feature = "testing")]
async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
if !fail::has_failpoints() {
return Err(ApiError::BadRequest(anyhow!(
"Cannot manage failpoints because pageserver was compiled without failpoints support"
@@ -959,10 +838,7 @@ async fn failpoints_handler(
}
// Run GC immediately on given timeline.
async fn timeline_gc_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -981,10 +857,8 @@ async fn timeline_gc_handler(
}
// Run compaction immediately on given timeline.
async fn timeline_compact_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
#[cfg(feature = "testing")]
async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -1005,10 +879,8 @@ async fn timeline_compact_handler(
}
// Run checkpoint immediately on given timeline.
async fn timeline_checkpoint_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
#[cfg(feature = "testing")]
async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -1032,7 +904,6 @@ async fn timeline_checkpoint_handler(
async fn timeline_download_remote_layers_handler_post(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1048,7 +919,6 @@ async fn timeline_download_remote_layers_handler_post(
async fn timeline_download_remote_layers_handler_get(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -1072,10 +942,7 @@ async fn active_timeline_of_active_tenant(
.map_err(ApiError::NotFound)
}
async fn always_panic_handler(
req: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
// Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook().
// For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it.
// Use catch_unwind to ensure that tokio nor hyper are distracted by our panic.
@@ -1086,10 +953,7 @@ async fn always_panic_handler(
json_response(StatusCode::NO_CONTENT, ())
}
async fn disk_usage_eviction_run(
mut r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn disk_usage_eviction_run(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&r, None)?;
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
@@ -1179,10 +1043,8 @@ async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
)
}
async fn post_tracing_event_handler(
mut r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
#[cfg(feature = "testing")]
async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
#[derive(Debug, serde::Deserialize)]
#[serde(rename_all = "lowercase")]
enum Level {
@@ -1212,90 +1074,10 @@ async fn post_tracing_event_handler(
json_response(StatusCode::OK, ())
}
/// Common functionality of all the HTTP API handlers.
///
/// - Adds a tracing span to each request (by `request_span`)
/// - Logs the request depending on the request method (by `request_span`)
/// - Logs the response if it was not successful (by `request_span`
/// - Shields the handler function from async cancellations. Hyper can drop the handler
/// Future if the connection to the client is lost, but most of the pageserver code is
/// not async cancellation safe. This converts the dropped future into a graceful cancellation
/// request with a CancellationToken.
async fn api_handler<R, H>(request: Request<Body>, handler: H) -> Result<Response<Body>, ApiError>
where
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
{
// Spawn a new task to handle the request, to protect the handler from unexpected
// async cancellations. Most pageserver functions are not async cancellation safe.
// We arm a drop-guard, so that if Hyper drops the Future, we signal the task
// with the cancellation token.
let token = CancellationToken::new();
let cancel_guard = token.clone().drop_guard();
let result = request_span(request, move |r| async {
let handle = tokio::spawn(
async {
let token_cloned = token.clone();
let result = handler(r, token).await;
if token_cloned.is_cancelled() {
info!("Cancelled request finished");
}
result
}
.in_current_span(),
);
match handle.await {
Ok(result) => result,
Err(e) => {
// The handler task panicked. We have a global panic handler that logs the
// panic with its backtrace, so no need to log that here. Only log a brief
// message to make it clear that we returned the error to the client.
error!("HTTP request handler task panicked: {e:#}");
// Don't return an Error here, because then fallback error handler that was
// installed in make_router() will print the error. Instead, construct the
// HTTP error response and return that.
Ok(
ApiError::InternalServerError(anyhow!("HTTP request handler task panicked"))
.into_response(),
)
}
}
})
.await;
cancel_guard.disarm();
result
}
/// Like api_handler, but returns an error response if the server is built without
/// the 'testing' feature.
async fn testing_api_handler<R, H>(
desc: &str,
request: Request<Body>,
handler: H,
) -> Result<Response<Body>, ApiError>
where
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
{
if cfg!(feature = "testing") {
api_handler(request, handler).await
} else {
std::future::ready(Err(ApiError::BadRequest(anyhow!(
"Cannot {desc} because pageserver was compiled without testing APIs",
))))
.await
}
}
pub fn make_router(
conf: &'static PageServerConf,
launch_ts: &'static LaunchTimestamp,
auth: Option<Arc<JwtAuth>>,
broker_client: BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
@@ -1320,99 +1102,121 @@ pub fn make_router(
.expect("construct launch timestamp header middleware"),
);
macro_rules! testing_api {
($handler_desc:literal, $handler:path $(,)?) => {{
#[cfg(not(feature = "testing"))]
async fn cfg_disabled(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
Err(ApiError::BadRequest(anyhow!(concat!(
"Cannot ",
$handler_desc,
" because pageserver was compiled without testing APIs",
))))
}
#[cfg(feature = "testing")]
let handler = $handler;
#[cfg(not(feature = "testing"))]
let handler = cfg_disabled;
move |r| RequestSpan(handler).handle(r)
}};
}
Ok(router
.data(Arc::new(
State::new(
conf,
auth,
remote_storage,
broker_client,
disk_usage_eviction_state,
)
.context("Failed to initialize router state")?,
State::new(conf, auth, remote_storage, disk_usage_eviction_state)
.context("Failed to initialize router state")?,
))
.get("/v1/status", |r| api_handler(r, status_handler))
.put("/v1/failpoints", |r| {
testing_api_handler("manage failpoints", r, failpoints_handler)
.get("/v1/status", |r| RequestSpan(status_handler).handle(r))
.put(
"/v1/failpoints",
testing_api!("manage failpoints", failpoints_handler),
)
.get("/v1/tenant", |r| RequestSpan(tenant_list_handler).handle(r))
.post("/v1/tenant", |r| {
RequestSpan(tenant_create_handler).handle(r)
})
.get("/v1/tenant/:tenant_id", |r| {
RequestSpan(tenant_status).handle(r)
})
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
.get("/v1/tenant/:tenant_id/synthetic_size", |r| {
api_handler(r, tenant_size_handler)
RequestSpan(tenant_size_handler).handle(r)
})
.put("/v1/tenant/config", |r| {
api_handler(r, update_tenant_config_handler)
RequestSpan(update_tenant_config_handler).handle(r)
})
.get("/v1/tenant/:tenant_id/config", |r| {
api_handler(r, get_tenant_config_handler)
RequestSpan(get_tenant_config_handler).handle(r)
})
.get("/v1/tenant/:tenant_id/timeline", |r| {
api_handler(r, timeline_list_handler)
RequestSpan(timeline_list_handler).handle(r)
})
.post("/v1/tenant/:tenant_id/timeline", |r| {
api_handler(r, timeline_create_handler)
RequestSpan(timeline_create_handler).handle(r)
})
.post("/v1/tenant/:tenant_id/attach", |r| {
api_handler(r, tenant_attach_handler)
RequestSpan(tenant_attach_handler).handle(r)
})
.post("/v1/tenant/:tenant_id/detach", |r| {
api_handler(r, tenant_detach_handler)
RequestSpan(tenant_detach_handler).handle(r)
})
.post("/v1/tenant/:tenant_id/load", |r| {
api_handler(r, tenant_load_handler)
RequestSpan(tenant_load_handler).handle(r)
})
.post("/v1/tenant/:tenant_id/ignore", |r| {
api_handler(r, tenant_ignore_handler)
RequestSpan(tenant_ignore_handler).handle(r)
})
.get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
api_handler(r, timeline_detail_handler)
RequestSpan(timeline_detail_handler).handle(r)
})
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
|r| api_handler(r, get_lsn_by_timestamp_handler),
|r| RequestSpan(get_lsn_by_timestamp_handler).handle(r),
)
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
api_handler(r, timeline_gc_handler)
})
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| {
testing_api_handler("run timeline compaction", r, timeline_compact_handler)
RequestSpan(timeline_gc_handler).handle(r)
})
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
testing_api!("run timeline compaction", timeline_compact_handler),
)
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
|r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
testing_api!("run timeline checkpoint", timeline_checkpoint_handler),
)
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|r| api_handler(r, timeline_download_remote_layers_handler_post),
|r| RequestSpan(timeline_download_remote_layers_handler_post).handle(r),
)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|r| api_handler(r, timeline_download_remote_layers_handler_get),
|r| RequestSpan(timeline_download_remote_layers_handler_get).handle(r),
)
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
api_handler(r, timeline_delete_handler)
RequestSpan(timeline_delete_handler).handle(r)
})
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
api_handler(r, layer_map_info_handler)
RequestSpan(layer_map_info_handler).handle(r)
})
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|r| api_handler(r, layer_download_handler),
|r| RequestSpan(layer_download_handler).handle(r),
)
.delete(
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|r| api_handler(r, evict_timeline_layer_handler),
|r| RequestSpan(evict_timeline_layer_handler).handle(r),
)
.put("/v1/disk_usage_eviction/run", |r| {
api_handler(r, disk_usage_eviction_run)
})
.put("/v1/tenant/:tenant_id/break", |r| {
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
})
.get("/v1/panic", |r| api_handler(r, always_panic_handler))
.post("/v1/tracing/event", |r| {
testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
RequestSpan(disk_usage_eviction_run).handle(r)
})
.put(
"/v1/tenant/:tenant_id/break",
testing_api!("set tenant state to broken", handle_tenant_break),
)
.get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
.post(
"/v1/tracing/event",
testing_api!("emit a tracing event", post_tracing_event_handler),
)
.any(handler_404))
}

View File

@@ -62,6 +62,46 @@ impl KeySpace {
KeyPartitioning { parts }
}
///
/// Calculate logical size of delta layers: total size of all blocks covered by it's key range
///
pub fn get_logical_size(&self, range: &Range<Key>) -> u64 {
let mut start_key = range.start;
let n_ranges = self.ranges.len();
let start_index = match self.ranges.binary_search_by_key(&start_key, |r| r.start) {
Ok(index) => index, // keyspace range starts with start_key
Err(index) => {
if index != 0 && self.ranges[index - 1].end > start_key {
index - 1 // previous keyspace range overlaps with specified
} else if index == n_ranges {
return 0; // no intersection with specified range
} else {
start_key = self.ranges[index].start;
index
}
}
};
let mut size = 0u64;
for i in start_index..n_ranges {
if self.ranges[i].start >= range.end {
break;
}
let end_key = if self.ranges[i].end < range.end {
self.ranges[i].end
} else {
range.end
};
let n_blocks = key_range_size(&(start_key..end_key));
if n_blocks != u32::MAX {
size += n_blocks as u64 * BLCKSZ as u64;
}
if i + 1 < n_ranges {
start_key = self.ranges[i + 1].start;
}
}
size
}
///
/// Check if key space contains overlapping range
///

View File

@@ -1,5 +1,6 @@
mod auth;
pub mod basebackup;
pub mod broker_client;
pub mod config;
pub mod consumption_metrics;
pub mod context;
@@ -35,7 +36,7 @@ use tracing::info;
/// backwards-compatible changes to the metadata format.
pub const STORAGE_FORMAT_VERSION: u16 = 3;
pub const DEFAULT_PG_VERSION: u32 = 15;
pub const DEFAULT_PG_VERSION: u32 = 14;
// Magic constants used to identify different kinds of files
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
@@ -45,7 +46,6 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
pub use crate::metrics::preinitialize_metrics;
#[tracing::instrument]
pub async fn shutdown_pageserver(exit_code: i32) {
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
@@ -58,6 +58,12 @@ pub async fn shutdown_pageserver(exit_code: i32) {
// the checkpoint and GC tasks.
tenant::mgr::shutdown_all_tenants().await;
// Stop syncing with remote storage.
//
// FIXME: Does this wait for the sync tasks to finish syncing what's queued up?
// Should it?
task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), None, None).await;
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
// FIXME: We should probably stop accepting commands like attach/detach earlier.
@@ -132,29 +138,6 @@ pub fn is_uninit_mark(path: &Path) -> bool {
}
}
/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
/// blocking.
///
/// The instances of this value exist only during startup, otherwise `None` is provided, meaning no
/// delaying is needed.
#[derive(Clone)]
pub struct InitializationOrder {
/// Each initial tenant load task carries this until completion.
pub initial_tenant_load: Option<utils::completion::Completion>,
/// Barrier for when we can start initial logical size calculations.
pub initial_logical_size_can_start: utils::completion::Barrier,
/// Each timeline owns a clone of this to be consumed on the initial logical size calculation
/// attempt. It is important to drop this once the attempt has completed.
pub initial_logical_size_attempt: utils::completion::Completion,
/// Barrier for when we can start any background jobs.
///
/// This can be broken up later on, but right now there is just one class of a background job.
pub background_jobs_can_start: utils::completion::Barrier,
}
#[cfg(test)]
mod backoff_defaults_tests {
use super::*;

View File

@@ -8,7 +8,6 @@ use metrics::{
use once_cell::sync::Lazy;
use pageserver_api::models::TenantState;
use strum::VariantNames;
use strum_macros::{EnumVariantNames, IntoStaticStr};
use utils::id::{TenantId, TimelineId};
/// Prometheus histogram buckets (in seconds) for operations in the critical
@@ -25,33 +24,16 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
];
// Metrics collected on operations on the storage repository.
#[derive(Debug, EnumVariantNames, IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub enum StorageTimeOperation {
#[strum(serialize = "layer flush")]
LayerFlush,
#[strum(serialize = "compact")]
Compact,
#[strum(serialize = "create images")]
CreateImages,
#[strum(serialize = "logical size")]
LogicalSize,
#[strum(serialize = "imitate logical size")]
ImitateLogicalSize,
#[strum(serialize = "load layer map")]
LoadLayerMap,
#[strum(serialize = "gc")]
Gc,
#[strum(serialize = "create tenant")]
CreateTenant,
}
const STORAGE_TIME_OPERATIONS: &[&str] = &[
"layer flush",
"compact",
"create images",
"init logical size",
"logical size",
"imitate logical size",
"load layer map",
"gc",
];
pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
register_counter_vec!(
@@ -84,16 +66,6 @@ pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_read_num_fs_layers",
"Number of persistent layers accessed for processing a read request, including those in the cache",
&["tenant_id", "timeline_id"],
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
)
.expect("failed to define a metric")
});
// Metrics collected on operations on the storage repository.
static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
@@ -105,25 +77,6 @@ static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_materialized_cache_hits_direct_total",
"Number of cache hits from materialized page cache without redo",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_getpage_get_reconstruct_data_seconds",
"Time spent in get_reconstruct_value_data",
&["tenant_id", "timeline_id"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
});
static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_materialized_cache_hits_total",
@@ -383,7 +336,6 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
0.001000, // 1000 usec
0.030, // 30 ms
1.000, // 1000 ms
30.000, // 30000 ms
];
const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
@@ -652,7 +604,7 @@ pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_wait_seconds",
"Time spent waiting for access to the Postgres WAL redo process",
"Time spent waiting for access to the WAL redo process",
redo_histogram_time_buckets!(),
)
.expect("failed to define a metric")
@@ -661,7 +613,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_records_histogram",
"Histogram of number of records replayed per redo in the Postgres WAL redo process",
"Histogram of number of records replayed per redo",
redo_histogram_count_buckets!(),
)
.expect("failed to define a metric")
@@ -670,7 +622,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_bytes_histogram",
"Histogram of number of records replayed per redo sent to Postgres",
"Histogram of number of records replayed per redo",
redo_bytes_histogram_count_buckets!(),
)
.expect("failed to define a metric")
@@ -720,9 +672,7 @@ pub struct StorageTimeMetrics {
}
impl StorageTimeMetrics {
pub fn new(operation: StorageTimeOperation, tenant_id: &str, timeline_id: &str) -> Self {
let operation: &'static str = operation.into();
pub fn new(operation: &str, tenant_id: &str, timeline_id: &str) -> Self {
let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
.get_metric_with_label_values(&[operation, tenant_id, timeline_id])
.unwrap();
@@ -753,9 +703,7 @@ pub struct TimelineMetrics {
tenant_id: String,
timeline_id: String,
pub reconstruct_time_histo: Histogram,
pub get_reconstruct_data_time_histo: Histogram,
pub materialized_page_cache_hit_counter: GenericCounter<AtomicU64>,
pub materialized_page_cache_hit_upon_request_counter: GenericCounter<AtomicU64>,
pub flush_time_histo: StorageTimeMetrics,
pub compact_time_histo: StorageTimeMetrics,
pub create_images_time_histo: StorageTimeMetrics,
@@ -766,7 +714,6 @@ pub struct TimelineMetrics {
pub last_record_gauge: IntGauge,
pub wait_lsn_time_histo: Histogram,
pub resident_physical_size_gauge: UIntGauge,
pub read_num_fs_layers: Histogram,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub num_persistent_files_created: IntCounter,
@@ -786,29 +733,19 @@ impl TimelineMetrics {
let reconstruct_time_histo = RECONSTRUCT_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let flush_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
let compact_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::Compact, &tenant_id, &timeline_id);
let flush_time_histo = StorageTimeMetrics::new("layer flush", &tenant_id, &timeline_id);
let compact_time_histo = StorageTimeMetrics::new("compact", &tenant_id, &timeline_id);
let create_images_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id);
let logical_size_histo =
StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id);
let imitate_logical_size_histo = StorageTimeMetrics::new(
StorageTimeOperation::ImitateLogicalSize,
&tenant_id,
&timeline_id,
);
StorageTimeMetrics::new("create images", &tenant_id, &timeline_id);
let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id);
let imitate_logical_size_histo =
StorageTimeMetrics::new("imitate logical size", &tenant_id, &timeline_id);
let load_layer_map_histo =
StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id);
let garbage_collect_histo =
StorageTimeMetrics::new(StorageTimeOperation::Gc, &tenant_id, &timeline_id);
StorageTimeMetrics::new("load layer map", &tenant_id, &timeline_id);
let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id);
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
@@ -830,12 +767,6 @@ impl TimelineMetrics {
let evictions = EVICTIONS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let read_num_fs_layers = READ_NUM_FS_LAYERS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let materialized_page_cache_hit_upon_request_counter = MATERIALIZED_PAGE_CACHE_HIT_DIRECT
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let evictions_with_low_residence_duration =
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
@@ -843,9 +774,7 @@ impl TimelineMetrics {
tenant_id,
timeline_id,
reconstruct_time_histo,
get_reconstruct_data_time_histo,
materialized_page_cache_hit_counter,
materialized_page_cache_hit_upon_request_counter,
flush_time_histo,
compact_time_histo,
create_images_time_histo,
@@ -863,7 +792,6 @@ impl TimelineMetrics {
evictions_with_low_residence_duration: std::sync::RwLock::new(
evictions_with_low_residence_duration,
),
read_num_fs_layers,
}
}
}
@@ -873,9 +801,7 @@ impl Drop for TimelineMetrics {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
let _ = MATERIALIZED_PAGE_CACHE_HIT_DIRECT.remove_label_values(&[tenant_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
@@ -883,13 +809,11 @@ impl Drop for TimelineMetrics {
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);
self.evictions_with_low_residence_duration
.write()
.unwrap()
.remove(tenant_id, timeline_id);
for op in StorageTimeOperation::VARIANTS {
for op in STORAGE_TIME_OPERATIONS {
let _ =
STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
let _ =

View File

@@ -50,9 +50,7 @@ use crate::import_datadir::import_wal_from_tar;
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant;
use crate::tenant::mgr;
use crate::tenant::mgr::GetTenantError;
use crate::tenant::{Tenant, Timeline};
use crate::trace::Tracer;
@@ -174,7 +172,6 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
///
pub async fn libpq_listener_main(
conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<JwtAuth>>,
listener: TcpListener,
auth_type: AuthType,
@@ -216,14 +213,7 @@ pub async fn libpq_listener_main(
None,
"serving compute connection task",
false,
page_service_conn_main(
conf,
broker_client.clone(),
local_auth,
socket,
auth_type,
connection_ctx,
),
page_service_conn_main(conf, local_auth, socket, auth_type, connection_ctx),
);
}
Err(err) => {
@@ -240,7 +230,6 @@ pub async fn libpq_listener_main(
async fn page_service_conn_main(
conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<JwtAuth>>,
socket: tokio::net::TcpStream,
auth_type: AuthType,
@@ -277,7 +266,7 @@ async fn page_service_conn_main(
// and create a child per-query context when it invokes process_query.
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
// and create the per-query context in process_query ourselves.
let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx);
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
match pgbackend
@@ -335,7 +324,6 @@ impl PageRequestMetrics {
struct PageServerHandler {
_conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<JwtAuth>>,
claims: Option<Claims>,
@@ -349,13 +337,11 @@ struct PageServerHandler {
impl PageServerHandler {
pub fn new(
conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<JwtAuth>>,
connection_ctx: RequestContext,
) -> Self {
PageServerHandler {
_conf: conf,
broker_client,
auth,
claims: None,
connection_ctx,
@@ -508,12 +494,7 @@ impl PageServerHandler {
let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
timeline
.import_basebackup_from_tar(
&mut copyin_reader,
base_lsn,
self.broker_client.clone(),
&ctx,
)
.import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx)
.await?;
// Read the end of the tar archive.
@@ -1150,9 +1131,7 @@ enum GetActiveTenantError {
wait_time: Duration,
},
#[error(transparent)]
NotFound(GetTenantError),
#[error(transparent)]
WaitTenantActive(tenant::WaitToBecomeActiveError),
Other(#[from] anyhow::Error),
}
impl From<GetActiveTenantError> for QueryError {
@@ -1161,8 +1140,7 @@ impl From<GetActiveTenantError> for QueryError {
GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
),
GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
GetActiveTenantError::Other(e) => QueryError::Other(e),
}
}
}
@@ -1178,16 +1156,13 @@ async fn get_active_tenant_with_timeout(
) -> Result<Arc<Tenant>, GetActiveTenantError> {
let tenant = match mgr::get_tenant(tenant_id, false).await {
Ok(tenant) => tenant,
Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
Err(GetTenantError::NotActive(_)) => {
unreachable!("we're calling get_tenant with active=false")
}
Err(e) => return Err(GetActiveTenantError::Other(e.into())),
};
let wait_time = Duration::from_secs(30);
match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
Ok(Ok(())) => Ok(tenant),
// no .context(), the error message is good enough and some tests depend on it
Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)),
Err(_) => {
let latest_state = tenant.current_state();
if latest_state == TenantState::Active {
@@ -1202,34 +1177,13 @@ async fn get_active_tenant_with_timeout(
}
}
#[derive(Debug, thiserror::Error)]
enum GetActiveTimelineError {
#[error(transparent)]
Tenant(GetActiveTenantError),
#[error(transparent)]
Timeline(anyhow::Error),
}
impl From<GetActiveTimelineError> for QueryError {
fn from(e: GetActiveTimelineError) -> Self {
match e {
GetActiveTimelineError::Tenant(e) => e.into(),
GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
}
}
}
/// Shorthand for getting a reference to a Timeline of an Active tenant.
async fn get_active_tenant_timeline(
tenant_id: TenantId,
timeline_id: TimelineId,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
.await
.map_err(GetActiveTimelineError::Tenant)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(GetActiveTimelineError::Timeline)?;
) -> Result<Arc<Timeline>, GetActiveTenantError> {
let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?;
let timeline = tenant.get_timeline(timeline_id, true)?;
Ok(timeline)
}

View File

@@ -1600,7 +1600,9 @@ pub fn create_test_timeline(
pg_version: u32,
ctx: &RequestContext,
) -> anyhow::Result<std::sync::Arc<Timeline>> {
let tline = tenant.create_test_timeline(timeline_id, Lsn(8), pg_version, ctx)?;
let tline = tenant
.create_empty_timeline(timeline_id, Lsn(8), pg_version, ctx)?
.initialize(ctx)?;
let mut m = tline.begin_modification(Lsn(8));
m.init_empty()?;
m.commit()?;

View File

@@ -476,35 +476,18 @@ pub async fn shutdown_tasks(
&& (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
{
task.cancel.cancel();
victim_tasks.push((
Arc::clone(task),
task.kind,
task_mut.tenant_id,
task_mut.timeline_id,
));
victim_tasks.push(Arc::clone(task));
}
}
}
let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none();
for (task, task_kind, tenant_id, timeline_id) in victim_tasks {
for task in victim_tasks {
let join_handle = {
let mut task_mut = task.mutable.lock().unwrap();
task_mut.join_handle.take()
};
if let Some(mut join_handle) = join_handle {
if log_all {
if tenant_id.is_none() {
// there are quite few of these
info!(name = task.name, kind = ?task_kind, "stopping global task");
} else {
// warn to catch these in tests; there shouldn't be any
warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
}
}
let completed = tokio::select! {
biased;
_ = &mut join_handle => { true },
_ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
// allow some time to elapse before logging to cut down the number of log

File diff suppressed because it is too large Load Diff

View File

@@ -9,7 +9,7 @@
//! may lead to a data loss.
//!
use anyhow::Context;
use pageserver_api::models;
use pageserver_api::models::{TenantConfigRequest, TenantCreateRequest};
use serde::{Deserialize, Serialize};
use std::num::NonZeroU64;
use std::time::Duration;
@@ -99,7 +99,6 @@ pub struct TenantConf {
// See the corresponding metric's help string.
#[serde(with = "humantime_serde")]
pub evictions_low_residence_duration_metric_threshold: Duration,
pub gc_feedback: bool,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -176,10 +175,6 @@ pub struct TenantConfOpt {
#[serde(with = "humantime_serde")]
#[serde(default)]
pub evictions_low_residence_duration_metric_threshold: Option<Duration>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub gc_feedback: Option<bool>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -247,7 +242,6 @@ impl TenantConfOpt {
evictions_low_residence_duration_metric_threshold: self
.evictions_low_residence_duration_metric_threshold
.unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
}
}
}
@@ -284,7 +278,6 @@ impl Default for TenantConf {
DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
)
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
gc_feedback: false,
}
}
}
@@ -299,77 +292,93 @@ fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn()
move || format!("Cannot parse `{field_name}` duration {value:?}")
}
impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
type Error = anyhow::Error;
fn try_from(request_data: &'_ models::TenantConfig) -> Result<Self, Self::Error> {
impl TenantConfOpt {
#[allow(clippy::too_many_arguments)]
fn from_request(
checkpoint_distance: Option<u64>,
checkpoint_timeout: &Option<String>,
compaction_target_size: Option<u64>,
compaction_period: &Option<String>,
compaction_threshold: Option<usize>,
gc_horizon: Option<u64>,
gc_period: &Option<String>,
image_creation_threshold: Option<usize>,
pitr_interval: &Option<String>,
walreceiver_connect_timeout: &Option<String>,
lagging_wal_timeout: &Option<String>,
max_lsn_wal_lag: Option<NonZeroU64>,
trace_read_requests: Option<bool>,
eviction_policy: &Option<serde_json::Value>,
min_resident_size_override: Option<u64>,
evictions_low_residence_duration_metric_threshold: &Option<String>,
) -> Result<Self, anyhow::Error> {
let mut tenant_conf = TenantConfOpt::default();
if let Some(gc_period) = &request_data.gc_period {
if let Some(gc_period) = &gc_period {
tenant_conf.gc_period = Some(
humantime::parse_duration(gc_period)
.with_context(bad_duration("gc_period", gc_period))?,
);
}
tenant_conf.gc_horizon = request_data.gc_horizon;
tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
tenant_conf.gc_horizon = gc_horizon;
tenant_conf.image_creation_threshold = image_creation_threshold;
if let Some(pitr_interval) = &request_data.pitr_interval {
if let Some(pitr_interval) = &pitr_interval {
tenant_conf.pitr_interval = Some(
humantime::parse_duration(pitr_interval)
.with_context(bad_duration("pitr_interval", pitr_interval))?,
);
}
if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
if let Some(walreceiver_connect_timeout) = &walreceiver_connect_timeout {
tenant_conf.walreceiver_connect_timeout = Some(
humantime::parse_duration(walreceiver_connect_timeout).with_context(
bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
)?,
);
}
if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
if let Some(lagging_wal_timeout) = &lagging_wal_timeout {
tenant_conf.lagging_wal_timeout = Some(
humantime::parse_duration(lagging_wal_timeout)
.with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
);
}
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
if let Some(max_lsn_wal_lag) = max_lsn_wal_lag {
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
}
if let Some(trace_read_requests) = request_data.trace_read_requests {
if let Some(trace_read_requests) = trace_read_requests {
tenant_conf.trace_read_requests = Some(trace_read_requests);
}
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
tenant_conf.checkpoint_distance = checkpoint_distance;
if let Some(checkpoint_timeout) = &checkpoint_timeout {
tenant_conf.checkpoint_timeout = Some(
humantime::parse_duration(checkpoint_timeout)
.with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
);
}
tenant_conf.compaction_target_size = request_data.compaction_target_size;
tenant_conf.compaction_threshold = request_data.compaction_threshold;
tenant_conf.compaction_target_size = compaction_target_size;
tenant_conf.compaction_threshold = compaction_threshold;
if let Some(compaction_period) = &request_data.compaction_period {
if let Some(compaction_period) = &compaction_period {
tenant_conf.compaction_period = Some(
humantime::parse_duration(compaction_period)
.with_context(bad_duration("compaction_period", compaction_period))?,
);
}
if let Some(eviction_policy) = &request_data.eviction_policy {
if let Some(eviction_policy) = &eviction_policy {
tenant_conf.eviction_policy = Some(
serde::Deserialize::deserialize(eviction_policy)
.context("parse field `eviction_policy`")?,
);
}
tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
tenant_conf.min_resident_size_override = min_resident_size_override;
if let Some(evictions_low_residence_duration_metric_threshold) =
&request_data.evictions_low_residence_duration_metric_threshold
&evictions_low_residence_duration_metric_threshold
{
tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
@@ -379,12 +388,61 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
))?,
);
}
tenant_conf.gc_feedback = request_data.gc_feedback;
Ok(tenant_conf)
}
}
impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt {
type Error = anyhow::Error;
fn try_from(request_data: &TenantCreateRequest) -> Result<Self, Self::Error> {
Self::from_request(
request_data.checkpoint_distance,
&request_data.checkpoint_timeout,
request_data.compaction_target_size,
&request_data.compaction_period,
request_data.compaction_threshold,
request_data.gc_horizon,
&request_data.gc_period,
request_data.image_creation_threshold,
&request_data.pitr_interval,
&request_data.walreceiver_connect_timeout,
&request_data.lagging_wal_timeout,
request_data.max_lsn_wal_lag,
request_data.trace_read_requests,
&request_data.eviction_policy,
request_data.min_resident_size_override,
&request_data.evictions_low_residence_duration_metric_threshold,
)
}
}
impl TryFrom<&'_ TenantConfigRequest> for TenantConfOpt {
type Error = anyhow::Error;
fn try_from(request_data: &TenantConfigRequest) -> Result<Self, Self::Error> {
Self::from_request(
request_data.checkpoint_distance,
&request_data.checkpoint_timeout,
request_data.compaction_target_size,
&request_data.compaction_period,
request_data.compaction_threshold,
request_data.gc_horizon,
&request_data.gc_period,
request_data.image_creation_threshold,
&request_data.pitr_interval,
&request_data.walreceiver_connect_timeout,
&request_data.lagging_wal_timeout,
request_data.max_lsn_wal_lag,
request_data.trace_read_requests,
&request_data.eviction_policy,
request_data.min_resident_size_override,
&request_data.evictions_low_residence_duration_metric_threshold,
)
}
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -51,9 +51,7 @@ use crate::keyspace::KeyPartitioning;
use crate::repository::Key;
use crate::tenant::storage_layer::InMemoryLayer;
use crate::tenant::storage_layer::Layer;
use anyhow::Context;
use anyhow::Result;
use std::collections::HashMap;
use std::collections::VecDeque;
use std::ops::Range;
use std::sync::Arc;
@@ -63,8 +61,6 @@ use historic_layer_coverage::BufferedHistoricLayerCoverage;
pub use historic_layer_coverage::Replacement;
use super::storage_layer::range_eq;
use super::storage_layer::PersistentLayerDesc;
use super::storage_layer::PersistentLayerKey;
///
/// LayerMap tracks what layers exist on a timeline.
@@ -90,16 +86,11 @@ pub struct LayerMap<L: ?Sized> {
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
/// Index of the historic layers optimized for search
historic: BufferedHistoricLayerCoverage<Arc<PersistentLayerDesc>>,
historic: BufferedHistoricLayerCoverage<Arc<L>>,
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
/// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and
/// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and
/// RemoteLayer will be removed.
mapping: HashMap<PersistentLayerKey, Arc<L>>,
l0_delta_layers: Vec<Arc<L>>,
}
impl<L: ?Sized> Default for LayerMap<L> {
@@ -110,7 +101,6 @@ impl<L: ?Sized> Default for LayerMap<L> {
frozen_layers: VecDeque::default(),
l0_delta_layers: Vec::default(),
historic: BufferedHistoricLayerCoverage::default(),
mapping: HashMap::default(),
}
}
}
@@ -135,9 +125,8 @@ where
///
/// Insert an on-disk layer.
///
// TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
self.layer_map.insert_historic_noflush(layer_desc, layer)
pub fn insert_historic(&mut self, layer: Arc<L>) {
self.layer_map.insert_historic_noflush(layer)
}
///
@@ -145,8 +134,8 @@ where
///
/// This should be called when the corresponding file on disk has been deleted.
///
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
self.layer_map.remove_historic_noflush(layer_desc, layer)
pub fn remove_historic(&mut self, layer: Arc<L>) {
self.layer_map.remove_historic_noflush(layer)
}
/// Replaces existing layer iff it is the `expected`.
@@ -161,15 +150,12 @@ where
/// that we can replace values only by updating a hashmap.
pub fn replace_historic(
&mut self,
expected_desc: PersistentLayerDesc,
expected: &Arc<L>,
new_desc: PersistentLayerDesc,
new: Arc<L>,
) -> anyhow::Result<Replacement<Arc<L>>> {
fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));
self.layer_map
.replace_historic_noflush(expected_desc, expected, new_desc, new)
self.layer_map.replace_historic_noflush(expected, new)
}
// We will flush on drop anyway, but this method makes it
@@ -244,7 +230,6 @@ where
(None, None) => None,
(None, Some(image)) => {
let lsn_floor = image.get_lsn_range().start;
let image = self.get_layer_from_mapping(&image.key()).clone();
Some(SearchResult {
layer: image,
lsn_floor,
@@ -252,7 +237,6 @@ where
}
(Some(delta), None) => {
let lsn_floor = delta.get_lsn_range().start;
let delta = self.get_layer_from_mapping(&delta.key()).clone();
Some(SearchResult {
layer: delta,
lsn_floor,
@@ -263,7 +247,6 @@ where
let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
let image_exact_match = img_lsn + 1 == end_lsn;
if image_is_newer || image_exact_match {
let image = self.get_layer_from_mapping(&image.key()).clone();
Some(SearchResult {
layer: image,
lsn_floor: img_lsn,
@@ -271,7 +254,6 @@ where
} else {
let lsn_floor =
std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
let delta = self.get_layer_from_mapping(&delta.key()).clone();
Some(SearchResult {
layer: delta,
lsn_floor,
@@ -291,33 +273,16 @@ where
///
/// Helper function for BatchedUpdates::insert_historic
///
/// TODO(chi): remove L generic so that we do not need to pass layer object.
pub(self) fn insert_historic_noflush(
&mut self,
layer_desc: PersistentLayerDesc,
layer: Arc<L>,
) {
self.mapping.insert(layer_desc.key(), layer.clone());
pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
if Self::is_l0(&layer) {
self.l0_delta_layers.push(layer_desc.clone().into());
}
self.historic.insert(
historic_layer_coverage::LayerKey::from(&*layer),
layer_desc.into(),
Arc::clone(&layer),
);
}
fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc<L> {
let layer = self
.mapping
.get(key)
.with_context(|| format!("{key:?}"))
.expect("inconsistent layer mapping");
layer
if Self::is_l0(&layer) {
self.l0_delta_layers.push(layer);
}
}
///
@@ -325,16 +290,14 @@ where
///
/// Helper function for BatchedUpdates::remove_historic
///
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
pub fn remove_historic_noflush(&mut self, layer: Arc<L>) {
self.historic
.remove(historic_layer_coverage::LayerKey::from(&*layer));
if Self::is_l0(&layer) {
let len_before = self.l0_delta_layers.len();
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
l0_delta_layers.retain(|other| {
!Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer)
});
self.l0_delta_layers = l0_delta_layers;
self.l0_delta_layers
.retain(|other| !Self::compare_arced_layers(other, &layer));
// this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
// there's a chance that the comparison fails at runtime due to it comparing (pointer,
// vtable) pairs.
@@ -344,14 +307,11 @@ where
"failed to locate removed historic layer from l0_delta_layers"
);
}
self.mapping.remove(&layer_desc.key());
}
pub(self) fn replace_historic_noflush(
&mut self,
expected_desc: PersistentLayerDesc,
expected: &Arc<L>,
new_desc: PersistentLayerDesc,
new: Arc<L>,
) -> anyhow::Result<Replacement<Arc<L>>> {
let key = historic_layer_coverage::LayerKey::from(&**expected);
@@ -372,9 +332,10 @@ where
let l0_index = if expected_l0 {
// find the index in case replace worked, we need to replace that as well
let pos = self.l0_delta_layers.iter().position(|slot| {
Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected)
});
let pos = self
.l0_delta_layers
.iter()
.position(|slot| Self::compare_arced_layers(slot, expected));
if pos.is_none() {
return Ok(Replacement::NotFound);
@@ -384,28 +345,16 @@ where
None
};
let new_desc = Arc::new(new_desc);
let replaced = self.historic.replace(&key, new_desc.clone(), |existing| {
**existing == expected_desc
let replaced = self.historic.replace(&key, new.clone(), |existing| {
Self::compare_arced_layers(existing, expected)
});
if let Replacement::Replaced { .. } = &replaced {
self.mapping.remove(&expected_desc.key());
self.mapping.insert(new_desc.key(), new);
if let Some(index) = l0_index {
self.l0_delta_layers[index] = new_desc;
self.l0_delta_layers[index] = new;
}
}
let replaced = match replaced {
Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered },
Replacement::NotFound => Replacement::NotFound,
Replacement::RemovalBuffered => Replacement::RemovalBuffered,
Replacement::Unexpected(x) => {
Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone())
}
};
Ok(replaced)
}
@@ -434,7 +383,7 @@ where
let start = key.start.to_i128();
let end = key.end.to_i128();
let layer_covers = |layer: Option<Arc<PersistentLayerDesc>>| match layer {
let layer_covers = |layer: Option<Arc<L>>| match layer {
Some(layer) => layer.get_lsn_range().start >= lsn.start,
None => false,
};
@@ -455,9 +404,7 @@ where
}
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
self.historic
.iter()
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
self.historic.iter()
}
///
@@ -489,24 +436,14 @@ where
// Loop through the change events and push intervals
for (change_key, change_val) in version.image_coverage.range(start..end) {
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
coverage.push((
kr,
current_val
.take()
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
));
coverage.push((kr, current_val.take()));
current_key = change_key;
current_val = change_val.clone();
}
// Add the final interval
let kr = Key::from_i128(current_key)..Key::from_i128(end);
coverage.push((
kr,
current_val
.take()
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
));
coverage.push((kr, current_val.take()));
Ok(coverage)
}
@@ -595,9 +532,7 @@ where
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
let lr = lsn.start..val.get_lsn_range().start;
if !kr.is_empty() {
let base_count =
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
as usize;
let base_count = Self::is_reimage_worthy(&val, key) as usize;
let new_limit = limit.map(|l| l - base_count);
let max_stacked_deltas_underneath =
self.count_deltas(&kr, &lr, new_limit)?;
@@ -620,9 +555,7 @@ where
let lr = lsn.start..val.get_lsn_range().start;
if !kr.is_empty() {
let base_count =
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
as usize;
let base_count = Self::is_reimage_worthy(&val, key) as usize;
let new_limit = limit.map(|l| l - base_count);
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
max_stacked_deltas = std::cmp::max(
@@ -773,11 +706,7 @@ where
/// Return all L0 delta layers
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
Ok(self
.l0_delta_layers
.iter()
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
.collect())
Ok(self.l0_delta_layers.clone())
}
/// debugging function to print out the contents of the layer map
@@ -880,17 +809,12 @@ mod tests {
let layer = LayerDescriptor::from(layer);
// same skeletan construction; see scenario below
let not_found = Arc::new(layer.clone());
let new_version = Arc::new(layer);
let not_found: Arc<dyn Layer> = Arc::new(layer.clone());
let new_version: Arc<dyn Layer> = Arc::new(layer);
let mut map = LayerMap::default();
let res = map.batch_update().replace_historic(
not_found.get_persistent_layer_desc(),
&not_found,
new_version.get_persistent_layer_desc(),
new_version,
);
let res = map.batch_update().replace_historic(&not_found, new_version);
assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
}
@@ -899,8 +823,8 @@ mod tests {
let name = LayerFileName::from_str(layer_name).unwrap();
let skeleton = LayerDescriptor::from(name);
let remote = Arc::new(skeleton.clone());
let downloaded = Arc::new(skeleton);
let remote: Arc<dyn Layer> = Arc::new(skeleton.clone());
let downloaded: Arc<dyn Layer> = Arc::new(skeleton);
let mut map = LayerMap::default();
@@ -910,18 +834,12 @@ mod tests {
let expected_in_counts = (1, usize::from(expected_l0));
map.batch_update()
.insert_historic(remote.get_persistent_layer_desc(), remote.clone());
map.batch_update().insert_historic(remote.clone());
assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
let replaced = map
.batch_update()
.replace_historic(
remote.get_persistent_layer_desc(),
&remote,
downloaded.get_persistent_layer_desc(),
downloaded.clone(),
)
.replace_historic(&remote, downloaded.clone())
.expect("name derived attributes are the same");
assert!(
matches!(replaced, Replacement::Replaced { .. }),
@@ -929,12 +847,11 @@ mod tests {
);
assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts);
map.batch_update()
.remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone());
map.batch_update().remove_historic(downloaded.clone());
assert_eq!(count_layer_in(&map, &downloaded), (0, 0));
}
fn count_layer_in<L: Layer + ?Sized>(map: &LayerMap<L>, layer: &Arc<L>) -> (usize, usize) {
fn count_layer_in(map: &LayerMap<dyn Layer>, layer: &Arc<dyn Layer>) -> (usize, usize) {
let historic = map
.iter_historic_layers()
.filter(|x| LayerMap::compare_arced_layers(x, layer))

View File

@@ -204,35 +204,6 @@ fn test_off_by_one() {
assert_eq!(version.image_coverage.query(5), None);
}
/// White-box regression test, checking for incorrect removal of node at key.end
#[test]
fn test_regression() {
let mut map = HistoricLayerCoverage::<String>::new();
map.insert(
LayerKey {
key: 0..5,
lsn: 0..5,
is_image: false,
},
"Layer 1".to_string(),
);
map.insert(
LayerKey {
key: 0..5,
lsn: 1..2,
is_image: false,
},
"Layer 2".to_string(),
);
// If an insertion operation improperly deletes the endpoint of a previous layer
// (which is more likely to happen with layers that collide on key.end), we will
// end up with an infinite layer, covering the entire keyspace. Here we assert
// that there's no layer at key 100 because we didn't insert any layer there.
let version = map.get_version(100).unwrap();
assert_eq!(version.delta_coverage.query(100), None);
}
/// Cover edge cases where layers begin or end on the same key
#[test]
fn test_key_collision() {

View File

@@ -1,8 +1,8 @@
use std::ops::Range;
// NOTE the `im` crate has 20x more downloads and also has
// persistent/immutable BTree. But it's bugged so rpds is a
// better choice https://github.com/neondatabase/neon/issues/3395
// TODO the `im` crate has 20x more downloads and also has
// persistent/immutable BTree. It also runs a bit faster but
// results are not the same on some tests.
use rpds::RedBlackTreeMapSync;
/// Data structure that can efficiently:
@@ -10,22 +10,19 @@ use rpds::RedBlackTreeMapSync;
/// - iterate the latest layers in a key range
/// - insert layers in non-decreasing lsn.start order
///
/// For a detailed explanation and justification of this approach, see:
/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing
///
/// NOTE The struct is parameterized over Value for easier
/// testing, but in practice it's some sort of layer.
/// The struct is parameterized over Value for easier
/// testing, but in practice it's some sort of layer.
pub struct LayerCoverage<Value> {
/// For every change in coverage (as we sweep the key space)
/// we store (lsn.end, value).
///
/// NOTE We use an immutable/persistent tree so that we can keep historic
/// versions of this coverage without cloning the whole thing and
/// incurring quadratic memory cost. See HistoricLayerCoverage.
/// We use an immutable/persistent tree so that we can keep historic
/// versions of this coverage without cloning the whole thing and
/// incurring quadratic memory cost. See HistoricLayerCoverage.
///
/// NOTE We use the Sync version of the map because we want Self to
/// be Sync. Using nonsync might be faster, if we can work with
/// that.
/// We use the Sync version of the map because we want Self to
/// be Sync. Using nonsync might be faster, if we can work with
/// that.
nodes: RedBlackTreeMapSync<i128, Option<(u64, Value)>>,
}
@@ -44,13 +41,6 @@ impl<Value: Clone> LayerCoverage<Value> {
/// Helper function to subdivide the key range without changing any values
///
/// This operation has no semantic effect by itself. It only helps us pin in
/// place the part of the coverage we don't want to change when inserting.
///
/// As an analogy, think of a polygon. If you add a vertex along one of the
/// segments, the polygon is still the same, but it behaves differently when
/// we move or delete one of the other points.
///
/// Complexity: O(log N)
fn add_node(&mut self, key: i128) {
let value = match self.nodes.range(..=key).last() {
@@ -84,7 +74,7 @@ impl<Value: Clone> LayerCoverage<Value> {
let mut to_update = Vec::new();
let mut to_remove = Vec::new();
let mut prev_covered = false;
for (k, node) in self.nodes.range(key) {
for (k, node) in self.nodes.range(key.clone()) {
let needs_cover = match node {
None => true,
Some((h, _)) => h < &lsn.end,
@@ -97,8 +87,9 @@ impl<Value: Clone> LayerCoverage<Value> {
}
prev_covered = needs_cover;
}
// TODO check if the nodes inserted at key.start and key.end are safe
// to remove. It's fine to keep them but they could be redundant.
if !prev_covered {
to_remove.push(key.end);
}
for k in to_update {
self.nodes.insert_mut(k, Some((lsn.end, value.clone())));
}

View File

@@ -10,7 +10,6 @@ use tokio::fs;
use anyhow::Context;
use once_cell::sync::Lazy;
use tokio::sync::RwLock;
use tokio::task::JoinSet;
use tracing::*;
use remote_storage::GenericRemoteStorage;
@@ -20,8 +19,8 @@ use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
use crate::tenant::{Tenant, TenantState};
use crate::IGNORED_TENANT_FILE_NAME;
use utils::fs_ext::PathExt;
use utils::id::{TenantId, TimelineId};
@@ -59,12 +58,10 @@ static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::
/// Initialize repositories with locally available timelines.
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
/// are scheduled for download and added to the tenant once download is completed.
#[instrument(skip_all)]
#[instrument(skip(conf, remote_storage))]
pub async fn init_tenant_mgr(
conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
init_order: InitializationOrder,
) -> anyhow::Result<()> {
// Scan local filesystem for attached tenants
let tenants_dir = conf.tenants_path();
@@ -119,9 +116,7 @@ pub async fn init_tenant_mgr(
match schedule_local_tenant_processing(
conf,
&tenant_dir_path,
broker_client.clone(),
remote_storage.clone(),
Some(init_order.clone()),
&ctx,
) {
Ok(tenant) => {
@@ -155,9 +150,7 @@ pub async fn init_tenant_mgr(
pub fn schedule_local_tenant_processing(
conf: &'static PageServerConf,
tenant_path: &Path,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
init_order: Option<InitializationOrder>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
anyhow::ensure!(
@@ -193,7 +186,7 @@ pub fn schedule_local_tenant_processing(
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
if let Some(remote_storage) = remote_storage {
match Tenant::spawn_attach(conf, tenant_id, broker_client, remote_storage, ctx) {
match Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) {
Ok(tenant) => tenant,
Err(e) => {
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
@@ -211,14 +204,7 @@ pub fn schedule_local_tenant_processing(
} else {
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
// Start loading the tenant into memory. It will initially be in Loading state.
Tenant::spawn_load(
conf,
tenant_id,
broker_client,
remote_storage,
init_order,
ctx,
)
Tenant::spawn_load(conf, tenant_id, remote_storage, ctx)
};
Ok(tenant)
}
@@ -233,7 +219,6 @@ pub fn schedule_local_tenant_processing(
/// That could be easily misinterpreted by control plane, the consumer of the
/// management API. For example, it could attach the tenant on a different pageserver.
/// We would then be in split-brain once this pageserver restarts.
#[instrument]
pub async fn shutdown_all_tenants() {
// Prevent new tenants from being created.
let tenants_to_shut_down = {
@@ -250,51 +235,39 @@ pub async fn shutdown_all_tenants() {
tenants_clone
}
TenantsMap::ShuttingDown(_) => {
// TODO: it is possible that detach and shutdown happen at the same time. as a
// result, during shutdown we do not wait for detach.
error!("already shutting down, this function isn't supposed to be called more than once");
return;
}
}
};
let mut join_set = JoinSet::new();
for (tenant_id, tenant) in tenants_to_shut_down {
join_set.spawn(
async move {
let freeze_and_flush = true;
match tenant.shutdown(freeze_and_flush).await {
Ok(()) => debug!("tenant successfully stopped"),
Err(super::ShutdownError::AlreadyStopping) => {
warn!("tenant was already shutting down")
}
}
}
.instrument(info_span!("shutdown", %tenant_id)),
);
}
let mut panicked = 0;
while let Some(res) = join_set.join_next().await {
match res {
Ok(()) => {}
Err(join_error) if join_error.is_cancelled() => {
unreachable!("we are not cancelling any of the futures");
}
Err(join_error) if join_error.is_panic() => {
// cannot really do anything, as this panic is likely a bug
panicked += 1;
}
Err(join_error) => {
warn!("unknown kind of JoinError: {join_error}");
}
let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len());
for (_, tenant) in tenants_to_shut_down {
if tenant.is_active() {
// updates tenant state, forbidding new GC and compaction iterations from starting
tenant.set_stopping();
tenants_to_freeze_and_flush.push(tenant);
}
}
if panicked > 0 {
warn!(panicked, "observed panicks while shutting down tenants");
// Shut down all existing walreceiver connections and stop accepting the new ones.
task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;
// Ok, no background tasks running anymore. Flush any remaining data in
// memory to disk.
//
// We assume that any incoming connections that might request pages from
// the tenant have already been terminated by the caller, so there
// should be no more activity in any of the repositories.
//
// On error, log it but continue with the shutdown for other tenants.
for tenant in tenants_to_freeze_and_flush {
let tenant_id = tenant.tenant_id();
debug!("shutdown tenant {tenant_id}");
if let Err(err) = tenant.freeze_and_flush().await {
error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
}
}
}
@@ -302,45 +275,31 @@ pub async fn create_tenant(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: TenantId,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, TenantMapInsertError> {
tenant_map_insert(tenant_id, || {
tenant_map_insert(tenant_id, |vacant_entry| {
// We're holding the tenants lock in write mode while doing local IO.
// If this section ever becomes contentious, introduce a new `TenantState::Creating`
// and do the work in that state.
let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Create)?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?;
let created_tenant =
schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233
schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?;
let crated_tenant_id = created_tenant.tenant_id();
anyhow::ensure!(
tenant_id == crated_tenant_id,
"loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})",
);
vacant_entry.insert(Arc::clone(&created_tenant));
Ok(created_tenant)
}).await
}
#[derive(Debug, thiserror::Error)]
pub enum SetNewTenantConfigError {
#[error(transparent)]
GetTenant(#[from] GetTenantError),
#[error(transparent)]
Persist(anyhow::Error),
}
pub async fn set_new_tenant_config(
conf: &'static PageServerConf,
new_tenant_conf: TenantConfOpt,
tenant_id: TenantId,
) -> Result<(), SetNewTenantConfigError> {
) -> Result<(), TenantStateError> {
info!("configuring tenant {tenant_id}");
let tenant = get_tenant(tenant_id, true).await?;
@@ -350,32 +309,23 @@ pub async fn set_new_tenant_config(
&tenant_config_path,
new_tenant_conf,
false,
)
.map_err(SetNewTenantConfigError::Persist)?;
)?;
tenant.set_new_tenant_config(new_tenant_conf);
Ok(())
}
#[derive(Debug, thiserror::Error)]
pub enum GetTenantError {
#[error("Tenant {0} not found")]
NotFound(TenantId),
#[error("Tenant {0} is not active")]
NotActive(TenantId),
}
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
pub async fn get_tenant(
tenant_id: TenantId,
active_only: bool,
) -> Result<Arc<Tenant>, GetTenantError> {
) -> Result<Arc<Tenant>, TenantStateError> {
let m = TENANTS.read().await;
let tenant = m
.get(&tenant_id)
.ok_or(GetTenantError::NotFound(tenant_id))?;
.ok_or(TenantStateError::NotFound(tenant_id))?;
if active_only && !tenant.is_active() {
Err(GetTenantError::NotActive(tenant_id))
Err(TenantStateError::NotActive(tenant_id))
} else {
Ok(Arc::clone(tenant))
}
@@ -384,7 +334,7 @@ pub async fn get_tenant(
#[derive(Debug, thiserror::Error)]
pub enum DeleteTimelineError {
#[error("Tenant {0}")]
Tenant(#[from] GetTenantError),
Tenant(#[from] TenantStateError),
#[error("Timeline {0}")]
Timeline(#[from] crate::tenant::DeleteTimelineError),
@@ -449,11 +399,10 @@ pub async fn detach_tenant(
pub async fn load_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || {
tenant_map_insert(tenant_id, |vacant_entry| {
let tenant_path = conf.tenant_path(&tenant_id);
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
if tenant_ignore_mark.exists() {
@@ -461,14 +410,14 @@ pub async fn load_tenant(
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
}
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage, ctx)
.with_context(|| {
format!("Failed to schedule tenant processing in path {tenant_path:?}")
})?;
Ok(new_tenant)
}).await?;
Ok(())
vacant_entry.insert(new_tenant);
Ok(())
}).await
}
pub async fn ignore_tenant(
@@ -517,36 +466,22 @@ pub async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapLis
pub async fn attach_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
tenant_conf: TenantConfOpt,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: GenericRemoteStorage,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || {
let tenant_dir = create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Attach)?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
// Without the attach marker, schedule_local_tenant_processing will treat the attached tenant as fully attached
let marker_file_exists = conf
.tenant_attaching_mark_file_path(&tenant_id)
.try_exists()
.context("check for attach marker file existence")?;
anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233
let attached_tenant_id = attached_tenant.tenant_id();
tenant_map_insert(tenant_id, |vacant_entry| {
let tenant_path = conf.tenant_path(&tenant_id);
anyhow::ensure!(
tenant_id == attached_tenant_id,
"loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {attached_tenant_id})",
!tenant_path.exists(),
"Cannot attach tenant {tenant_id}, local tenant directory already exists"
);
Ok(attached_tenant)
let tenant =
Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx).context("spawn_attach")?;
vacant_entry.insert(tenant);
Ok(())
})
.await?;
Ok(())
.await
}
#[derive(Debug, thiserror::Error)]
@@ -567,12 +502,12 @@ pub enum TenantMapInsertError {
///
/// NB: the closure should return quickly because the current implementation of tenants map
/// serializes access through an `RwLock`.
async fn tenant_map_insert<F>(
async fn tenant_map_insert<F, V>(
tenant_id: TenantId,
insert_fn: F,
) -> Result<Arc<Tenant>, TenantMapInsertError>
) -> Result<V, TenantMapInsertError>
where
F: FnOnce() -> anyhow::Result<Arc<Tenant>>,
F: FnOnce(hash_map::VacantEntry<TenantId, Arc<Tenant>>) -> anyhow::Result<V>,
{
let mut guard = TENANTS.write().await;
let m = match &mut *guard {
@@ -585,11 +520,8 @@ where
tenant_id,
e.get().current_state(),
)),
hash_map::Entry::Vacant(v) => match insert_fn() {
Ok(tenant) => {
v.insert(tenant.clone());
Ok(tenant)
}
hash_map::Entry::Vacant(v) => match insert_fn(v) {
Ok(v) => Ok(v),
Err(e) => Err(TenantMapInsertError::Closure(e)),
},
}
@@ -610,26 +542,25 @@ where
// The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
// tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
// avoid holding the lock for the entire process.
let tenant = {
TENANTS
.write()
.await
.get(&tenant_id)
.cloned()
.ok_or(TenantStateError::NotFound(tenant_id))?
};
let freeze_and_flush = false;
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
// that we can continue safely to cleanup.
match tenant.shutdown(freeze_and_flush).await {
Ok(()) => {}
Err(super::ShutdownError::AlreadyStopping) => {
return Err(TenantStateError::IsStopping(tenant_id))
{
let tenants_accessor = TENANTS.write().await;
match tenants_accessor.get(&tenant_id) {
Some(tenant) => match tenant.current_state() {
TenantState::Attaching
| TenantState::Loading
| TenantState::Broken { .. }
| TenantState::Active => tenant.set_stopping(),
TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)),
},
None => return Err(TenantStateError::NotFound(tenant_id)),
}
}
// shutdown all tenant and timeline tasks: gc, compaction, page service)
// No new tasks will be started for this tenant because it's in `Stopping` state.
// Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
match tenant_cleanup
.await
.with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
@@ -645,7 +576,7 @@ where
let tenants_accessor = TENANTS.read().await;
match tenants_accessor.get(&tenant_id) {
Some(tenant) => {
tenant.set_broken(e.to_string()).await;
tenant.set_broken(e.to_string());
}
None => {
warn!("Tenant {tenant_id} got removed from memory");
@@ -711,6 +642,7 @@ pub async fn immediate_gc(
Ok(wait_task_done)
}
#[cfg(feature = "testing")]
pub async fn immediate_compact(
tenant_id: TenantId,
timeline_id: TimelineId,

View File

@@ -19,8 +19,14 @@ fn parallel_worker(paths: &[PathBuf], next_path_idx: &AtomicUsize) -> io::Result
Ok(())
}
fn fsync_in_thread_pool(paths: &[PathBuf]) -> io::Result<()> {
// TODO: remove this function in favor of `par_fsync_async` once we asyncify everything.
pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
const PARALLEL_PATH_THRESHOLD: usize = 1;
if paths.len() <= PARALLEL_PATH_THRESHOLD {
for path in paths {
fsync_path(path)?;
}
return Ok(());
}
/// Use at most this number of threads.
/// Increasing this limit will
@@ -30,11 +36,11 @@ fn fsync_in_thread_pool(paths: &[PathBuf]) -> io::Result<()> {
let num_threads = paths.len().min(MAX_NUM_THREADS);
let next_path_idx = AtomicUsize::new(0);
std::thread::scope(|s| -> io::Result<()> {
crossbeam_utils::thread::scope(|s| -> io::Result<()> {
let mut handles = vec![];
// Spawn `num_threads - 1`, as the current thread is also a worker.
for _ in 1..num_threads {
handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx)));
handles.push(s.spawn(|_| parallel_worker(paths, &next_path_idx)));
}
parallel_worker(paths, &next_path_idx)?;
@@ -45,41 +51,5 @@ fn fsync_in_thread_pool(paths: &[PathBuf]) -> io::Result<()> {
Ok(())
})
}
/// Parallel fsync all files. Can be used in non-async context as it is using rayon thread pool.
pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
if paths.len() == 1 {
fsync_path(&paths[0])?;
return Ok(());
}
fsync_in_thread_pool(paths)
}
/// Parallel fsync asynchronously. If number of files are less than PARALLEL_PATH_THRESHOLD, fsync is done in the current
/// execution thread. Otherwise, we will spawn_blocking and run it in tokio.
pub async fn par_fsync_async(paths: &[PathBuf]) -> io::Result<()> {
const MAX_CONCURRENT_FSYNC: usize = 64;
let mut next = paths.iter().peekable();
let mut js = tokio::task::JoinSet::new();
loop {
while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() {
let next = next.next().expect("just peeked");
let next = next.to_owned();
js.spawn_blocking(move || fsync_path(&next));
}
// now the joinset has been filled up, wait for next to complete
if let Some(res) = js.join_next().await {
res??;
} else {
// last item had already completed
assert!(
next.peek().is_none(),
"joinset emptied, we shouldn't have more work"
);
return Ok(());
}
}
.unwrap()
}

View File

@@ -1264,7 +1264,9 @@ mod tests {
let harness = TenantHarness::create(test_name)?;
let (tenant, ctx) = runtime.block_on(harness.load());
// create an empty timeline directory
let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
let timeline =
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
let _ = timeline.initialize(&ctx).unwrap();
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
std::fs::create_dir_all(remote_fs_dir)?;

View File

@@ -4,7 +4,6 @@ pub mod delta_layer;
mod filename;
mod image_layer;
mod inmemory_layer;
mod layer_desc;
mod remote_layer;
use crate::config::PageServerConf;
@@ -38,7 +37,6 @@ pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
pub use image_layer::{ImageLayer, ImageLayerWriter};
pub use inmemory_layer::InMemoryLayer;
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub use remote_layer::RemoteLayer;
use super::layer_map::BatchedUpdates;
@@ -408,23 +406,14 @@ pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
/// An image layer is a snapshot of all the data in a key-range, at a single
/// LSN.
pub trait PersistentLayer: Layer {
/// Get the layer descriptor.
fn layer_desc(&self) -> &PersistentLayerDesc;
fn get_tenant_id(&self) -> TenantId {
self.layer_desc().tenant_id
}
fn get_tenant_id(&self) -> TenantId;
/// Identify the timeline this layer belongs to
fn get_timeline_id(&self) -> TimelineId {
self.layer_desc().timeline_id
}
fn get_timeline_id(&self) -> TimelineId;
/// File name used for this layer, both in the pageserver's local filesystem
/// state as well as in the remote storage.
fn filename(&self) -> LayerFileName {
self.layer_desc().filename()
}
fn filename(&self) -> LayerFileName;
// Path to the layer file in the local filesystem.
// `None` for `RemoteLayer`.
@@ -454,9 +443,7 @@ pub trait PersistentLayer: Layer {
///
/// Should not change over the lifetime of the layer object because
/// current_physical_size is computed as the som of this value.
fn file_size(&self) -> u64 {
self.layer_desc().file_size
}
fn file_size(&self) -> u64;
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
@@ -485,20 +472,6 @@ pub struct LayerDescriptor {
pub short_id: String,
}
impl LayerDescriptor {
/// `LayerDescriptor` is only used for testing purpose so it does not matter whether it is image / delta,
/// and the tenant / timeline id does not matter.
pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc {
PersistentLayerDesc::new_delta(
TenantId::from_array([0; 16]),
TimelineId::from_array([0; 16]),
self.key.clone(),
self.lsn.clone(),
233,
)
}
}
impl Layer for LayerDescriptor {
fn get_key_range(&self) -> Range<Key> {
self.key.clone()
@@ -569,7 +542,7 @@ impl From<LayerFileName> for LayerDescriptor {
///
/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
/// global config, and paths to layer files are constructed using the tenant/timeline
/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
/// path from the config. But in the 'pageserver_binutils' binary, we need to construct a Layer
/// struct for a file on disk, without having a page server running, so that we have no
/// config. In that case, we use the Path variant to hold the full path to the file on
/// disk.

View File

@@ -56,8 +56,8 @@ use utils::{
};
use super::{
DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
PathOrConf, PersistentLayerDesc,
DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter,
LayerKeyIter, PathOrConf,
};
///
@@ -89,10 +89,10 @@ impl From<&DeltaLayer> for Summary {
magic: DELTA_FILE_MAGIC,
format_version: STORAGE_FORMAT_VERSION,
tenant_id: layer.desc.tenant_id,
timeline_id: layer.desc.timeline_id,
key_range: layer.desc.key_range.clone(),
lsn_range: layer.desc.lsn_range.clone(),
tenant_id: layer.tenant_id,
timeline_id: layer.timeline_id,
key_range: layer.key_range.clone(),
lsn_range: layer.lsn_range.clone(),
index_start_blk: 0,
index_root_blk: 0,
@@ -110,7 +110,7 @@ const WILL_INIT: u64 = 1;
/// reading/deserializing records themselves.
///
#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
pub struct BlobRef(pub u64);
struct BlobRef(u64);
impl BlobRef {
pub fn will_init(&self) -> bool {
@@ -180,7 +180,12 @@ impl DeltaKey {
pub struct DeltaLayer {
path_or_conf: PathOrConf,
pub desc: PersistentLayerDesc,
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
pub file_size: u64,
access_stats: LayerAccessStats,
@@ -192,9 +197,9 @@ impl std::fmt::Debug for DeltaLayer {
use super::RangeDisplayDebug;
f.debug_struct("DeltaLayer")
.field("key_range", &RangeDisplayDebug(&self.desc.key_range))
.field("lsn_range", &self.desc.lsn_range)
.field("file_size", &self.desc.file_size)
.field("key_range", &RangeDisplayDebug(&self.key_range))
.field("lsn_range", &self.lsn_range)
.field("file_size", &self.file_size)
.field("inner", &self.inner)
.finish()
}
@@ -223,16 +228,30 @@ impl std::fmt::Debug for DeltaLayerInner {
}
impl Layer for DeltaLayer {
fn get_key_range(&self) -> Range<Key> {
self.key_range.clone()
}
fn get_lsn_range(&self) -> Range<Lsn> {
self.lsn_range.clone()
}
fn is_incremental(&self) -> bool {
true
}
fn short_id(&self) -> String {
self.filename().file_name()
}
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!(
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
self.desc.tenant_id,
self.desc.timeline_id,
self.desc.key_range.start,
self.desc.key_range.end,
self.desc.lsn_range.start,
self.desc.lsn_range.end
self.tenant_id,
self.timeline_id,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end
);
if !verbose {
@@ -305,10 +324,10 @@ impl Layer for DeltaLayer {
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
ensure!(lsn_range.start >= self.desc.lsn_range.start);
ensure!(lsn_range.start >= self.lsn_range.start);
let mut need_image = true;
ensure!(self.desc.key_range.contains(&key));
ensure!(self.key_range.contains(&key));
{
// Open the file and lock the metadata in memory
@@ -383,31 +402,19 @@ impl Layer for DeltaLayer {
Ok(ValueReconstructResult::Complete)
}
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_key_range(&self) -> Range<Key> {
self.layer_desc().key_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_lsn_range(&self) -> Range<Lsn> {
self.layer_desc().lsn_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn short_id(&self) -> String {
self.layer_desc().short_id()
}
}
impl PersistentLayer for DeltaLayer {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.desc
fn get_tenant_id(&self) -> TenantId {
self.tenant_id
}
fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
fn filename(&self) -> LayerFileName {
self.layer_name().into()
}
fn local_path(&self) -> Option<PathBuf> {
@@ -437,6 +444,10 @@ impl PersistentLayer for DeltaLayer {
Ok(())
}
fn file_size(&self) -> u64 {
self.file_size
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.filename().file_name();
let lsn_range = self.get_lsn_range();
@@ -445,7 +456,7 @@ impl PersistentLayer for DeltaLayer {
HistoricLayerInfo::Delta {
layer_file_name,
layer_file_size: self.desc.file_size,
layer_file_size: self.file_size,
lsn_start: lsn_range.start,
lsn_end: lsn_range.end,
remote: false,
@@ -591,13 +602,11 @@ impl DeltaLayer {
) -> DeltaLayer {
DeltaLayer {
path_or_conf: PathOrConf::Conf(conf),
desc: PersistentLayerDesc::new_delta(
tenant_id,
timeline_id,
filename.key_range.clone(),
filename.lsn_range.clone(),
file_size,
),
timeline_id,
tenant_id,
key_range: filename.key_range.clone(),
lsn_range: filename.lsn_range.clone(),
file_size,
access_stats,
inner: RwLock::new(DeltaLayerInner {
loaded: false,
@@ -610,7 +619,7 @@ impl DeltaLayer {
/// Create a DeltaLayer struct representing an existing file on disk.
///
/// This variant is only used for debugging purposes, by the 'pagectl' binary.
/// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
pub fn new_for_path(path: &Path, file: File) -> Result<Self> {
let mut summary_buf = Vec::new();
summary_buf.resize(PAGE_SZ, 0);
@@ -623,13 +632,11 @@ impl DeltaLayer {
Ok(DeltaLayer {
path_or_conf: PathOrConf::Path(path.to_path_buf()),
desc: PersistentLayerDesc::new_delta(
summary.tenant_id,
summary.timeline_id,
summary.key_range,
summary.lsn_range,
metadata.len(),
),
timeline_id: summary.timeline_id,
tenant_id: summary.tenant_id,
key_range: summary.key_range,
lsn_range: summary.lsn_range,
file_size: metadata.len(),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
@@ -641,14 +648,18 @@ impl DeltaLayer {
}
fn layer_name(&self) -> DeltaFileName {
self.desc.delta_file_name()
DeltaFileName {
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
}
}
/// Path to the layer file in pageserver workdir.
pub fn path(&self) -> PathBuf {
Self::path_for(
&self.path_or_conf,
self.desc.timeline_id,
self.desc.tenant_id,
self.timeline_id,
self.tenant_id,
&self.layer_name(),
)
}
@@ -792,13 +803,11 @@ impl DeltaLayerWriterInner {
// set inner.file here. The first read will have to re-open it.
let layer = DeltaLayer {
path_or_conf: PathOrConf::Conf(self.conf),
desc: PersistentLayerDesc::new_delta(
self.tenant_id,
self.timeline_id,
self.key_start..key_end,
self.lsn_range.clone(),
metadata.len(),
),
tenant_id: self.tenant_id,
timeline_id: self.timeline_id,
key_range: self.key_start..key_end,
lsn_range: self.lsn_range.clone(),
file_size: metadata.len(),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,

View File

@@ -9,8 +9,6 @@ use std::str::FromStr;
use utils::lsn::Lsn;
use super::PersistentLayerDesc;
// Note: Timeline::load_layer_map() relies on this sort order
#[derive(PartialEq, Eq, Clone, Hash)]
pub struct DeltaFileName {
@@ -155,7 +153,7 @@ impl Ord for ImageFileName {
impl ImageFileName {
pub fn lsn_as_range(&self) -> Range<Lsn> {
// Saves from having to copypaste this all over
PersistentLayerDesc::image_layer_lsn_range(self.lsn)
self.lsn..(self.lsn + 1)
}
}

View File

@@ -52,8 +52,8 @@ use utils::{
lsn::Lsn,
};
use super::filename::ImageFileName;
use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc};
use super::filename::{ImageFileName, LayerFileName};
use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf};
///
/// Header stored in the beginning of the file
@@ -84,9 +84,9 @@ impl From<&ImageLayer> for Summary {
Self {
magic: IMAGE_FILE_MAGIC,
format_version: STORAGE_FORMAT_VERSION,
tenant_id: layer.desc.tenant_id,
timeline_id: layer.desc.timeline_id,
key_range: layer.desc.key_range.clone(),
tenant_id: layer.tenant_id,
timeline_id: layer.timeline_id,
key_range: layer.key_range.clone(),
lsn: layer.lsn,
index_start_blk: 0,
@@ -104,9 +104,12 @@ impl From<&ImageLayer> for Summary {
/// and it needs to be loaded before using it in queries.
pub struct ImageLayer {
path_or_conf: PathOrConf,
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub key_range: Range<Key>,
pub file_size: u64,
pub desc: PersistentLayerDesc,
// This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
// This entry contains an image of all pages as of this LSN
pub lsn: Lsn,
access_stats: LayerAccessStats,
@@ -119,8 +122,8 @@ impl std::fmt::Debug for ImageLayer {
use super::RangeDisplayDebug;
f.debug_struct("ImageLayer")
.field("key_range", &RangeDisplayDebug(&self.desc.key_range))
.field("file_size", &self.desc.file_size)
.field("key_range", &RangeDisplayDebug(&self.key_range))
.field("file_size", &self.file_size)
.field("lsn", &self.lsn)
.field("inner", &self.inner)
.finish()
@@ -150,15 +153,27 @@ impl std::fmt::Debug for ImageLayerInner {
}
impl Layer for ImageLayer {
fn get_key_range(&self) -> Range<Key> {
self.key_range.clone()
}
fn get_lsn_range(&self) -> Range<Lsn> {
// End-bound is exclusive
self.lsn..(self.lsn + 1)
}
fn is_incremental(&self) -> bool {
false
}
fn short_id(&self) -> String {
self.filename().file_name()
}
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!(
"----- image layer for ten {} tli {} key {}-{} at {} ----",
self.desc.tenant_id,
self.desc.timeline_id,
self.desc.key_range.start,
self.desc.key_range.end,
self.lsn
self.tenant_id, self.timeline_id, self.key_range.start, self.key_range.end, self.lsn
);
if !verbose {
@@ -188,7 +203,7 @@ impl Layer for ImageLayer {
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
assert!(self.desc.key_range.contains(&key));
assert!(self.key_range.contains(&key));
assert!(lsn_range.start >= self.lsn);
assert!(lsn_range.end >= self.lsn);
@@ -215,37 +230,24 @@ impl Layer for ImageLayer {
Ok(ValueReconstructResult::Missing)
}
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_key_range(&self) -> Range<Key> {
self.layer_desc().key_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_lsn_range(&self) -> Range<Lsn> {
self.layer_desc().lsn_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn short_id(&self) -> String {
self.layer_desc().short_id()
}
}
impl PersistentLayer for ImageLayer {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.desc
fn filename(&self) -> LayerFileName {
self.layer_name().into()
}
fn local_path(&self) -> Option<PathBuf> {
Some(self.path())
}
fn get_tenant_id(&self) -> TenantId {
self.tenant_id
}
fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
unimplemented!();
}
@@ -256,13 +258,17 @@ impl PersistentLayer for ImageLayer {
Ok(())
}
fn file_size(&self) -> u64 {
self.file_size
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.filename().file_name();
let lsn_range = self.get_lsn_range();
HistoricLayerInfo::Image {
layer_file_name,
layer_file_size: self.desc.file_size,
layer_file_size: self.file_size,
lsn_start: lsn_range.start,
remote: false,
access_stats: self.access_stats.as_api_model(reset),
@@ -399,15 +405,11 @@ impl ImageLayer {
) -> ImageLayer {
ImageLayer {
path_or_conf: PathOrConf::Conf(conf),
desc: PersistentLayerDesc::new_img(
tenant_id,
timeline_id,
filename.key_range.clone(),
filename.lsn,
false,
file_size,
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
timeline_id,
tenant_id,
key_range: filename.key_range.clone(),
lsn: filename.lsn,
file_size,
access_stats,
inner: RwLock::new(ImageLayerInner {
loaded: false,
@@ -420,7 +422,7 @@ impl ImageLayer {
/// Create an ImageLayer struct representing an existing file on disk.
///
/// This variant is only used for debugging purposes, by the 'pagectl' binary.
/// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
pub fn new_for_path(path: &Path, file: File) -> Result<ImageLayer> {
let mut summary_buf = Vec::new();
summary_buf.resize(PAGE_SZ, 0);
@@ -431,15 +433,11 @@ impl ImageLayer {
.context("get file metadata to determine size")?;
Ok(ImageLayer {
path_or_conf: PathOrConf::Path(path.to_path_buf()),
desc: PersistentLayerDesc::new_img(
summary.tenant_id,
summary.timeline_id,
summary.key_range,
summary.lsn,
false,
metadata.len(),
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
timeline_id: summary.timeline_id,
tenant_id: summary.tenant_id,
key_range: summary.key_range,
lsn: summary.lsn,
file_size: metadata.len(),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: RwLock::new(ImageLayerInner {
file: None,
@@ -451,15 +449,18 @@ impl ImageLayer {
}
fn layer_name(&self) -> ImageFileName {
self.desc.image_file_name()
ImageFileName {
key_range: self.key_range.clone(),
lsn: self.lsn,
}
}
/// Path to the layer file in pageserver workdir.
pub fn path(&self) -> PathBuf {
Self::path_for(
&self.path_or_conf,
self.desc.timeline_id,
self.desc.tenant_id,
self.timeline_id,
self.tenant_id,
&self.layer_name(),
)
}
@@ -483,7 +484,6 @@ struct ImageLayerWriterInner {
tenant_id: TenantId,
key_range: Range<Key>,
lsn: Lsn,
is_incremental: bool,
blob_writer: WriteBlobWriter<VirtualFile>,
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
@@ -499,7 +499,6 @@ impl ImageLayerWriterInner {
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
is_incremental: bool,
) -> anyhow::Result<Self> {
// Create the file initially with a temporary filename.
// We'll atomically rename it to the final name when we're done.
@@ -534,7 +533,6 @@ impl ImageLayerWriterInner {
lsn,
tree: tree_builder,
blob_writer,
is_incremental,
};
Ok(writer)
@@ -590,22 +588,16 @@ impl ImageLayerWriterInner {
.metadata()
.context("get metadata to determine file size")?;
let desc = PersistentLayerDesc::new_img(
self.tenant_id,
self.timeline_id,
self.key_range.clone(),
self.lsn,
self.is_incremental, // for now, image layer ALWAYS covers the full range
metadata.len(),
);
// Note: Because we open the file in write-only mode, we cannot
// reuse the same VirtualFile for reading later. That's why we don't
// set inner.file here. The first read will have to re-open it.
let layer = ImageLayer {
path_or_conf: PathOrConf::Conf(self.conf),
desc,
timeline_id: self.timeline_id,
tenant_id: self.tenant_id,
key_range: self.key_range.clone(),
lsn: self.lsn,
file_size: metadata.len(),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: RwLock::new(ImageLayerInner {
loaded: false,
@@ -675,7 +667,6 @@ impl ImageLayerWriter {
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
is_incremental: bool,
) -> anyhow::Result<ImageLayerWriter> {
Ok(Self {
inner: Some(ImageLayerWriterInner::new(
@@ -684,7 +675,6 @@ impl ImageLayerWriter {
tenant_id,
key_range,
lsn,
is_incremental,
)?),
})
}

View File

@@ -1,176 +0,0 @@
use anyhow::Result;
use std::ops::Range;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::{context::RequestContext, repository::Key};
use super::{DeltaFileName, ImageFileName, LayerFileName};
/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
/// a unified way to generate layer information like file name.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct PersistentLayerDesc {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub key_range: Range<Key>,
/// For image layer, this is `[lsn, lsn+1)`.
pub lsn_range: Range<Lsn>,
/// Whether this is a delta layer.
pub is_delta: bool,
/// Whether this layer only contains page images for part of the keys in the range. In the current implementation, this should
/// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be
/// incremental.
pub is_incremental: bool,
/// File size
pub file_size: u64,
}
/// A unique identifier of a persistent layer within the context of one timeline.
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub struct PersistentLayerKey {
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
pub is_delta: bool,
}
impl PersistentLayerDesc {
pub fn key(&self) -> PersistentLayerKey {
PersistentLayerKey {
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
is_delta: self.is_delta,
}
}
pub fn short_id(&self) -> String {
self.filename().file_name()
}
pub fn new_img(
tenant_id: TenantId,
timeline_id: TimelineId,
key_range: Range<Key>,
lsn: Lsn,
is_incremental: bool,
file_size: u64,
) -> Self {
Self {
tenant_id,
timeline_id,
key_range,
lsn_range: Self::image_layer_lsn_range(lsn),
is_delta: false,
is_incremental,
file_size,
}
}
pub fn new_delta(
tenant_id: TenantId,
timeline_id: TimelineId,
key_range: Range<Key>,
lsn_range: Range<Lsn>,
file_size: u64,
) -> Self {
Self {
tenant_id,
timeline_id,
key_range,
lsn_range,
is_delta: true,
is_incremental: true,
file_size,
}
}
/// Get the LSN that the image layer covers.
pub fn image_layer_lsn(&self) -> Lsn {
assert!(!self.is_delta);
assert!(self.lsn_range.start + 1 == self.lsn_range.end);
self.lsn_range.start
}
/// Get the LSN range corresponding to a single image layer LSN.
pub fn image_layer_lsn_range(lsn: Lsn) -> Range<Lsn> {
lsn..(lsn + 1)
}
/// Get a delta file name for this layer.
///
/// Panic: if this is not a delta layer.
pub fn delta_file_name(&self) -> DeltaFileName {
assert!(self.is_delta);
DeltaFileName {
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
}
}
/// Get a delta file name for this layer.
///
/// Panic: if this is not an image layer, or the lsn range is invalid
pub fn image_file_name(&self) -> ImageFileName {
assert!(!self.is_delta);
assert!(self.lsn_range.start + 1 == self.lsn_range.end);
ImageFileName {
key_range: self.key_range.clone(),
lsn: self.lsn_range.start,
}
}
pub fn filename(&self) -> LayerFileName {
if self.is_delta {
self.delta_file_name().into()
} else {
self.image_file_name().into()
}
}
// TODO: remove this in the future once we refactor timeline APIs.
pub fn get_lsn_range(&self) -> Range<Lsn> {
self.lsn_range.clone()
}
pub fn get_key_range(&self) -> Range<Key> {
self.key_range.clone()
}
pub fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
pub fn get_tenant_id(&self) -> TenantId {
self.tenant_id
}
pub fn is_incremental(&self) -> bool {
self.is_incremental
}
pub fn is_delta(&self) -> bool {
self.is_delta
}
pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
println!(
"----- layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
self.tenant_id,
self.timeline_id,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end
);
Ok(())
}
pub fn file_size(&self) -> u64 {
self.file_size
}
}

View File

@@ -18,10 +18,11 @@ use utils::{
lsn::Lsn,
};
use super::filename::{DeltaFileName, ImageFileName};
use super::filename::{DeltaFileName, ImageFileName, LayerFileName};
use super::image_layer::ImageLayer;
use super::{
DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
DeltaLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
LayerResidenceStatus, PersistentLayer,
};
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
@@ -33,10 +34,19 @@ use super::{
///
/// See: [`crate::context::RequestContext`] for authorization to download
pub struct RemoteLayer {
pub desc: PersistentLayerDesc,
tenantid: TenantId,
timelineid: TimelineId,
key_range: Range<Key>,
lsn_range: Range<Lsn>,
pub file_name: LayerFileName,
pub layer_metadata: LayerFileMetadata,
is_delta: bool,
is_incremental: bool,
access_stats: LayerAccessStats,
pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
@@ -56,14 +66,22 @@ pub struct RemoteLayer {
impl std::fmt::Debug for RemoteLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("RemoteLayer")
.field("file_name", &self.desc.filename())
.field("file_name", &self.file_name)
.field("layer_metadata", &self.layer_metadata)
.field("is_incremental", &self.desc.is_incremental)
.field("is_incremental", &self.is_incremental)
.finish()
}
}
impl Layer for RemoteLayer {
fn get_key_range(&self) -> Range<Key> {
self.key_range.clone()
}
fn get_lsn_range(&self) -> Range<Lsn> {
self.lsn_range.clone()
}
fn get_value_reconstruct_data(
&self,
_key: Key,
@@ -77,45 +95,53 @@ impl Layer for RemoteLayer {
);
}
fn is_incremental(&self) -> bool {
self.is_incremental
}
/// debugging function to print out the contents of the layer
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
println!(
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
self.desc.tenant_id,
self.desc.timeline_id,
self.desc.key_range.start,
self.desc.key_range.end,
self.desc.lsn_range.start,
self.desc.lsn_range.end
self.tenantid,
self.timelineid,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end
);
Ok(())
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_key_range(&self) -> Range<Key> {
self.layer_desc().key_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_lsn_range(&self) -> Range<Lsn> {
self.layer_desc().lsn_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn short_id(&self) -> String {
self.layer_desc().short_id()
self.filename().file_name()
}
}
impl PersistentLayer for RemoteLayer {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.desc
fn get_tenant_id(&self) -> TenantId {
self.tenantid
}
fn get_timeline_id(&self) -> TimelineId {
self.timelineid
}
fn filename(&self) -> LayerFileName {
if self.is_delta {
DeltaFileName {
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
}
.into()
} else {
ImageFileName {
key_range: self.key_range.clone(),
lsn: self.lsn_range.start,
}
.into()
}
}
fn local_path(&self) -> Option<PathBuf> {
@@ -142,11 +168,15 @@ impl PersistentLayer for RemoteLayer {
true
}
fn file_size(&self) -> u64 {
self.layer_metadata.file_size()
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.filename().file_name();
let lsn_range = self.get_lsn_range();
if self.desc.is_delta {
if self.is_delta {
HistoricLayerInfo::Delta {
layer_file_name,
layer_file_size: self.layer_metadata.file_size(),
@@ -180,14 +210,13 @@ impl RemoteLayer {
access_stats: LayerAccessStats,
) -> RemoteLayer {
RemoteLayer {
desc: PersistentLayerDesc::new_img(
tenantid,
timelineid,
fname.key_range.clone(),
fname.lsn,
false,
layer_metadata.file_size(),
),
tenantid,
timelineid,
key_range: fname.key_range.clone(),
lsn_range: fname.lsn_as_range(),
is_delta: false,
is_incremental: false,
file_name: fname.to_owned().into(),
layer_metadata: layer_metadata.clone(),
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
@@ -203,13 +232,13 @@ impl RemoteLayer {
access_stats: LayerAccessStats,
) -> RemoteLayer {
RemoteLayer {
desc: PersistentLayerDesc::new_delta(
tenantid,
timelineid,
fname.key_range.clone(),
fname.lsn_range.clone(),
layer_metadata.file_size(),
),
tenantid,
timelineid,
key_range: fname.key_range.clone(),
lsn_range: fname.lsn_range.clone(),
is_delta: true,
is_incremental: true,
file_name: fname.to_owned().into(),
layer_metadata: layer_metadata.clone(),
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
@@ -227,12 +256,15 @@ impl RemoteLayer {
where
L: ?Sized + Layer,
{
if self.desc.is_delta {
let fname = self.desc.delta_file_name();
if self.is_delta {
let fname = DeltaFileName {
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
};
Arc::new(DeltaLayer::new(
conf,
self.desc.timeline_id,
self.desc.tenant_id,
self.timelineid,
self.tenantid,
&fname,
file_size,
self.access_stats.clone_for_residence_change(
@@ -241,11 +273,14 @@ impl RemoteLayer {
),
))
} else {
let fname = self.desc.image_file_name();
let fname = ImageFileName {
key_range: self.key_range.clone(),
lsn: self.lsn_range.start,
};
Arc::new(ImageLayer::new(
conf,
self.desc.timeline_id,
self.desc.tenant_id,
self.timelineid,
self.tenantid,
&fname,
file_size,
self.access_stats.clone_for_residence_change(

View File

@@ -9,17 +9,13 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::TENANT_TASK_EVENTS;
use crate::task_mgr;
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::mgr;
use crate::tenant::{Tenant, TenantState};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::completion;
use utils::id::TenantId;
/// Start per tenant background loops: compaction and gc.
pub fn start_background_loops(
tenant: &Arc<Tenant>,
background_jobs_can_start: Option<&completion::Barrier>,
) {
let tenant_id = tenant.tenant_id;
pub fn start_background_loops(tenant_id: TenantId) {
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Compaction,
@@ -27,20 +23,11 @@ pub fn start_background_loops(
None,
&format!("compactor for tenant {tenant_id}"),
false,
{
let tenant = Arc::clone(tenant);
let background_jobs_can_start = background_jobs_can_start.cloned();
async move {
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()) },
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
compaction_loop(tenant, cancel)
.instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
.await;
Ok(())
}
async move {
compaction_loop(tenant_id)
.instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
.await;
Ok(())
},
);
task_mgr::spawn(
@@ -50,20 +37,11 @@ pub fn start_background_loops(
None,
&format!("garbage collector for tenant {tenant_id}"),
false,
{
let tenant = Arc::clone(tenant);
let background_jobs_can_start = background_jobs_can_start.cloned();
async move {
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()) },
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
gc_loop(tenant, cancel)
.instrument(info_span!("gc_loop", tenant_id = %tenant_id))
.await;
Ok(())
}
async move {
gc_loop(tenant_id)
.instrument(info_span!("gc_loop", tenant_id = %tenant_id))
.await;
Ok(())
},
);
}
@@ -71,26 +49,27 @@ pub fn start_background_loops(
///
/// Compaction task's main loop
///
async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
async fn compaction_loop(tenant_id: TenantId) {
let wait_duration = Duration::from_secs(2);
info!("starting");
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let cancel = task_mgr::shutdown_token();
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
let mut first = true;
loop {
trace!("waking up");
tokio::select! {
let tenant = tokio::select! {
_ = cancel.cancelled() => {
info!("received cancellation request");
return;
},
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
ControlFlow::Break(()) => return,
ControlFlow::Continue(()) => (),
ControlFlow::Continue(tenant) => tenant,
},
}
};
let period = tenant.get_compaction_period();
@@ -140,29 +119,29 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
///
/// GC task's main loop
///
async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
async fn gc_loop(tenant_id: TenantId) {
let wait_duration = Duration::from_secs(2);
info!("starting");
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let cancel = task_mgr::shutdown_token();
// GC might require downloading, to find the cutoff LSN that corresponds to the
// cutoff specified as time.
let ctx =
RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let mut first = true;
loop {
trace!("waking up");
tokio::select! {
let tenant = tokio::select! {
_ = cancel.cancelled() => {
info!("received cancellation request");
return;
},
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
ControlFlow::Break(()) => return,
ControlFlow::Continue(()) => (),
ControlFlow::Continue(tenant) => tenant,
},
}
};
let period = tenant.get_gc_period();
@@ -182,9 +161,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
Duration::from_secs(10)
} else {
// Run gc
let res = tenant
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
.await;
let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await;
if let Err(e) = res {
error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
wait_duration
@@ -210,10 +187,23 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
trace!("GC loop stopped.");
}
async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
async fn wait_for_active_tenant(
tenant_id: TenantId,
wait: Duration,
) -> ControlFlow<(), Arc<Tenant>> {
let tenant = loop {
match mgr::get_tenant(tenant_id, false).await {
Ok(tenant) => break tenant,
Err(e) => {
error!("Failed to get a tenant {tenant_id}: {e:#}");
tokio::time::sleep(wait).await;
}
}
};
// if the tenant has a proper status already, no need to wait for anything
if tenant.current_state() == TenantState::Active {
ControlFlow::Continue(())
ControlFlow::Continue(tenant)
} else {
let mut tenant_state_updates = tenant.subscribe_for_state_updates();
loop {
@@ -223,7 +213,7 @@ async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
match new_state {
TenantState::Active => {
debug!("Tenant state changed to active, continuing the task loop");
return ControlFlow::Continue(());
return ControlFlow::Continue(tenant);
}
state => {
debug!("Not running the task loop, tenant is not active: {state:?}");

View File

@@ -31,6 +31,7 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak};
use std::time::{Duration, Instant, SystemTime};
use crate::broker_client::{get_broker_client, is_broker_client_initialized};
use crate::context::{DownloadBehavior, RequestContext};
use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
use crate::tenant::storage_layer::{
@@ -57,7 +58,6 @@ use pageserver_api::reltag::RelTag;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::to_pg_timestamp;
use utils::{
completion,
id::{TenantId, TimelineId},
lsn::{AtomicLsn, Lsn, RecordLsn},
seqwait::SeqWait,
@@ -196,9 +196,8 @@ pub struct Timeline {
/// Layer removal lock.
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
/// and [`Tenant::delete_timeline`].
pub(super) layer_removal_cs: tokio::sync::Mutex<()>,
// Needed to ensure that we can't create a branch at a point that was already garbage collected
pub latest_gc_cutoff_lsn: Rcu<Lsn>,
@@ -228,7 +227,7 @@ pub struct Timeline {
/// or None if WAL receiver has not received anything for this timeline
/// yet.
pub last_received_wal: Mutex<Option<WalReceiverInfo>>,
pub walreceiver: Mutex<Option<WalReceiver>>,
pub walreceiver: WalReceiver,
/// Relation size cache
pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
@@ -237,18 +236,7 @@ pub struct Timeline {
state: watch::Sender<TimelineState>,
/// Prevent two tasks from deleting the timeline at the same time. If held, the
/// timeline is being deleted. If 'true', the timeline has already been deleted.
pub delete_lock: tokio::sync::Mutex<bool>,
eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
/// Barrier to wait before doing initial logical size calculation. Used only during startup.
initial_logical_size_can_start: Option<completion::Barrier>,
/// Completion shared between all timelines loaded during startup; used to delay heavier
/// background tasks until some logical sizes have been calculated.
initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
}
/// Internal structure to hold all data needed for logical size calculation.
@@ -533,12 +521,7 @@ impl Timeline {
Some((cached_lsn, cached_img)) => {
match cached_lsn.cmp(&lsn) {
Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
Ordering::Equal => {
self.metrics
.materialized_page_cache_hit_upon_request_counter
.inc();
return Ok(cached_img); // exact LSN match, return the image
}
Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image
Ordering::Greater => {
unreachable!("the returned lsn should never be after the requested lsn")
}
@@ -553,10 +536,8 @@ impl Timeline {
img: cached_page_img,
};
let timer = self.metrics.get_reconstruct_data_time_histo.start_timer();
self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
.await?;
timer.stop_and_record();
self.metrics
.reconstruct_time_histo
@@ -641,27 +622,17 @@ impl Timeline {
.await
{
Ok(()) => Ok(()),
Err(e) => {
// don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
seqwait_error => {
drop(_timer);
let walreceiver_status = {
match &*self.walreceiver.lock().unwrap() {
None => "stopping or stopped".to_string(),
Some(walreceiver) => match walreceiver.status() {
Some(status) => status.to_human_readable_string(),
None => "Not active".to_string(),
},
}
};
Err(anyhow::Error::new(e).context({
format!(
"Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}",
lsn,
self.get_last_record_lsn(),
self.get_disk_consistent_lsn(),
walreceiver_status,
)
}))
let walreceiver_status = self.walreceiver.status().await;
seqwait_error.with_context(|| format!(
"Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, {}",
lsn,
self.get_last_record_lsn(),
self.get_disk_consistent_lsn(),
walreceiver_status.map(|status| status.to_human_readable_string())
.unwrap_or_else(|| "WalReceiver status: Not active".to_string()),
))
}
}
}
@@ -689,7 +660,7 @@ impl Timeline {
}
/// Outermost timeline compaction operation; downloads needed layers.
pub async fn compact(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> {
const ROUNDS: usize = 2;
let last_record_lsn = self.get_last_record_lsn();
@@ -778,7 +749,7 @@ impl Timeline {
}
/// Compaction which might need to be retried after downloading remote layers.
async fn compact_inner(self: &Arc<Self>, ctx: &RequestContext) -> Result<(), CompactionError> {
async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> {
//
// High level strategy for compaction / image creation:
//
@@ -813,7 +784,7 @@ impl Timeline {
// Below are functions compact_level0() and create_image_layers()
// but they are a bit ad hoc and don't quite work like it's explained
// above. Rewrite it.
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
let layer_removal_cs = self.layer_removal_cs.lock().await;
// Is the timeline being deleted?
let state = *self.state.borrow();
if state == TimelineState::Stopping {
@@ -847,7 +818,7 @@ impl Timeline {
// 3. Compact
let timer = self.metrics.compact_time_histo.start_timer();
self.compact_level0(layer_removal_cs.clone(), target_file_size, ctx)
self.compact_level0(&layer_removal_cs, target_file_size, ctx)
.await?;
timer.stop_and_record();
}
@@ -936,15 +907,18 @@ impl Timeline {
Ok(())
}
pub fn activate(
self: &Arc<Self>,
broker_client: BrokerClientChannel,
background_jobs_can_start: Option<&completion::Barrier>,
ctx: &RequestContext,
) {
self.launch_wal_receiver(ctx, broker_client);
pub fn activate(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
if is_broker_client_initialized() {
self.launch_wal_receiver(ctx, get_broker_client().clone())?;
} else if cfg!(test) {
info!("not launching WAL receiver because broker client hasn't been initialized");
} else {
anyhow::bail!("broker client not initialized");
}
self.set_state(TimelineState::Active);
self.launch_eviction_task(background_jobs_can_start);
self.launch_eviction_task();
Ok(())
}
pub fn set_state(&self, new_state: TimelineState) {
@@ -962,14 +936,6 @@ impl Timeline {
error!("Not activating a Stopping timeline");
}
(_, new_state) => {
if matches!(new_state, TimelineState::Stopping | TimelineState::Broken) {
// drop the copmletion guard, if any; it might be holding off the completion
// forever needlessly
self.initial_logical_size_attempt
.lock()
.unwrap_or_else(|e| e.into_inner())
.take();
}
self.state.send_replace(new_state);
}
}
@@ -989,7 +955,7 @@ impl Timeline {
pub async fn wait_to_become_active(
&self,
_ctx: &RequestContext, // Prepare for use by cancellation
_ctx: &RequestContext, /* Prepare for use by cancellation */
) -> Result<(), TimelineState> {
let mut receiver = self.state.subscribe();
loop {
@@ -1211,12 +1177,7 @@ impl Timeline {
),
});
let replaced = match batch_updates.replace_historic(
local_layer.layer_desc().clone(),
local_layer,
new_remote_layer.layer_desc().clone(),
new_remote_layer,
)? {
let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer)? {
Replacement::Replaced { .. } => {
if let Err(e) = local_layer.delete_resident_layer_file() {
error!("failed to remove layer file on evict after replacement: {e:#?}");
@@ -1324,13 +1285,6 @@ impl Timeline {
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
}
fn get_gc_feedback(&self) -> bool {
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
.gc_feedback
.unwrap_or(self.conf.default_tenant_conf.gc_feedback)
}
pub(super) fn tenant_conf_updated(&self) {
// NB: Most tenant conf options are read by background loops, so,
// changes will automatically be picked up.
@@ -1365,8 +1319,6 @@ impl Timeline {
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
remote_client: Option<RemoteTimelineClient>,
pg_version: u32,
initial_logical_size_can_start: Option<completion::Barrier>,
initial_logical_size_attempt: Option<completion::Completion>,
) -> Arc<Self> {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
let (state, _) = watch::channel(TimelineState::Loading);
@@ -1375,7 +1327,15 @@ impl Timeline {
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
let tenant_conf_guard = tenant_conf.read().unwrap();
let wal_connect_timeout = tenant_conf_guard
.walreceiver_connect_timeout
.unwrap_or(conf.default_tenant_conf.walreceiver_connect_timeout);
let lagging_wal_timeout = tenant_conf_guard
.lagging_wal_timeout
.unwrap_or(conf.default_tenant_conf.lagging_wal_timeout);
let max_lsn_wal_lag = tenant_conf_guard
.max_lsn_wal_lag
.unwrap_or(conf.default_tenant_conf.max_lsn_wal_lag);
let evictions_low_residence_duration_metric_threshold =
Self::get_evictions_low_residence_duration_metric_threshold(
&tenant_conf_guard,
@@ -1384,6 +1344,18 @@ impl Timeline {
drop(tenant_conf_guard);
Arc::new_cyclic(|myself| {
let walreceiver = WalReceiver::new(
TenantTimelineId::new(tenant_id, timeline_id),
Weak::clone(myself),
WalReceiverConf {
wal_connect_timeout,
lagging_wal_timeout,
max_lsn_wal_lag,
auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
availability_zone: conf.availability_zone.clone(),
},
);
let mut result = Timeline {
conf,
tenant_conf,
@@ -1395,7 +1367,7 @@ impl Timeline {
wanted_image_layers: Mutex::new(None),
walredo_mgr,
walreceiver: Mutex::new(None),
walreceiver,
remote_client: remote_client.map(Arc::new),
@@ -1460,10 +1432,6 @@ impl Timeline {
eviction_task_timeline_state: tokio::sync::Mutex::new(
EvictionTaskTimelineState::default(),
),
delete_lock: tokio::sync::Mutex::new(false),
initial_logical_size_can_start,
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
};
result.repartition_threshold = result.get_checkpoint_distance() / 10;
result
@@ -1519,49 +1487,17 @@ impl Timeline {
*flush_loop_state = FlushLoopState::Running;
}
/// Creates and starts the wal receiver.
///
/// This function is expected to be called at most once per Timeline's lifecycle
/// when the timeline is activated.
fn launch_wal_receiver(
self: &Arc<Self>,
pub(super) fn launch_wal_receiver(
&self,
ctx: &RequestContext,
broker_client: BrokerClientChannel,
) {
) -> anyhow::Result<()> {
info!(
"launching WAL receiver for timeline {} of tenant {}",
self.timeline_id, self.tenant_id
);
let tenant_conf_guard = self.tenant_conf.read().unwrap();
let wal_connect_timeout = tenant_conf_guard
.walreceiver_connect_timeout
.unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
let lagging_wal_timeout = tenant_conf_guard
.lagging_wal_timeout
.unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
let max_lsn_wal_lag = tenant_conf_guard
.max_lsn_wal_lag
.unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
drop(tenant_conf_guard);
let mut guard = self.walreceiver.lock().unwrap();
assert!(
guard.is_none(),
"multiple launches / re-launches of WAL receiver are not supported"
);
*guard = Some(WalReceiver::start(
Arc::clone(self),
WalReceiverConf {
wal_connect_timeout,
lagging_wal_timeout,
max_lsn_wal_lag,
auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
availability_zone: self.conf.availability_zone.clone(),
},
broker_client,
ctx,
));
self.walreceiver.start(ctx, broker_client)?;
Ok(())
}
///
@@ -1612,7 +1548,7 @@ impl Timeline {
trace!("found layer {}", layer.path().display());
total_physical_size += file_size;
updates.insert_historic(layer.layer_desc().clone(), Arc::new(layer));
updates.insert_historic(Arc::new(layer));
num_layers += 1;
} else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
// Create a DeltaLayer struct for each delta file.
@@ -1644,7 +1580,7 @@ impl Timeline {
trace!("found layer {}", layer.path().display());
total_physical_size += file_size;
updates.insert_historic(layer.layer_desc().clone(), Arc::new(layer));
updates.insert_historic(Arc::new(layer));
num_layers += 1;
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
// ignore these
@@ -1743,7 +1679,7 @@ impl Timeline {
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
} else {
self.metrics.resident_physical_size_gauge.sub(local_size);
updates.remove_historic(local_layer.layer_desc().clone(), local_layer);
updates.remove_historic(local_layer);
// fall-through to adding the remote layer
}
} else {
@@ -1782,7 +1718,7 @@ impl Timeline {
);
let remote_layer = Arc::new(remote_layer);
updates.insert_historic(remote_layer.layer_desc().clone(), remote_layer);
updates.insert_historic(remote_layer);
}
LayerFileName::Delta(deltafilename) => {
// Create a RemoteLayer for the delta file.
@@ -1809,7 +1745,7 @@ impl Timeline {
),
);
let remote_layer = Arc::new(remote_layer);
updates.insert_historic(remote_layer.layer_desc().clone(), remote_layer);
updates.insert_historic(remote_layer);
}
}
}
@@ -1952,30 +1888,9 @@ impl Timeline {
false,
// NB: don't log errors here, task_mgr will do that.
async move {
let cancel = task_mgr::shutdown_token();
// in case we were created during pageserver initialization, wait for
// initialization to complete before proceeding. startup time init runs on the same
// runtime.
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {}
};
// hold off background tasks from starting until all timelines get to try at least
// once initial logical size calculation; though retry will rarely be useful.
// holding off is done because heavier tasks execute blockingly on the same
// runtime.
//
// dropping this at every outcome is probably better than trying to cling on to it,
// delay will be terminated by a timeout regardless.
let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
// no extra cancellation here, because nothing really waits for this to complete compared
// no cancellation here, because nothing really waits for this to complete compared
// to spawn_ondemand_logical_size_calculation.
let cancel = CancellationToken::new();
let calculated_size = match self_clone
.logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
.await
@@ -2240,7 +2155,7 @@ impl Timeline {
fn delete_historic_layer(
&self,
// we cannot remove layers otherwise, since gc and compaction will race
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
layer: Arc<dyn PersistentLayer>,
updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
) -> anyhow::Result<()> {
@@ -2257,7 +2172,7 @@ impl Timeline {
// won't be needed for page reconstruction for this timeline,
// and mark what we can't delete yet as deleted from the layer
// map index without actually rebuilding the index.
updates.remove_historic(layer.layer_desc().clone(), layer);
updates.remove_historic(layer);
Ok(())
}
@@ -2319,9 +2234,6 @@ impl Timeline {
let mut timeline_owned;
let mut timeline = self;
let mut read_count =
scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64));
// For debugging purposes, collect the path of layers that we traversed
// through. It's included in the error message if we fail to find the key.
let mut traversal_path = Vec::<TraversalPathItem>::new();
@@ -2456,7 +2368,6 @@ impl Timeline {
Err(e) => return Err(PageReconstructError::from(e)),
};
cont_lsn = lsn_floor;
// metrics: open_layer does not count as fs access, so we are not updating `read_count`
traversal_path.push((
result,
cont_lsn,
@@ -2483,7 +2394,6 @@ impl Timeline {
Err(e) => return Err(PageReconstructError::from(e)),
};
cont_lsn = lsn_floor;
// metrics: open_layer does not count as fs access, so we are not updating `read_count`
traversal_path.push((
result,
cont_lsn,
@@ -2518,7 +2428,6 @@ impl Timeline {
Err(e) => return Err(PageReconstructError::from(e)),
};
cont_lsn = lsn_floor;
*read_count += 1;
traversal_path.push((
result,
cont_lsn,
@@ -2584,7 +2493,7 @@ impl Timeline {
(DownloadBehavior::Error, false) => {
return Err(PageReconstructError::NeedsDownload(
TenantTimelineId::new(self.tenant_id, self.timeline_id),
remote_layer.filename(),
remote_layer.file_name.clone(),
))
}
}
@@ -2710,7 +2619,7 @@ impl Timeline {
/// Layer flusher task's main loop.
async fn flush_loop(
self: &Arc<Self>,
&self,
mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>,
ctx: &RequestContext,
) {
@@ -2799,9 +2708,9 @@ impl Timeline {
}
/// Flush one frozen in-memory layer to disk, as a new delta layer.
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))]
#[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))]
async fn flush_frozen_layer(
self: &Arc<Self>,
&self,
frozen_layer: Arc<InMemoryLayer>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
@@ -2821,16 +2730,7 @@ impl Timeline {
.await?
} else {
// normal case, write out a L0 delta layer file.
let this = self.clone();
let frozen_layer = frozen_layer.clone();
let span = tracing::info_span!("blocking");
let (delta_path, metadata) = tokio::task::spawn_blocking(move || {
let _g = span.entered();
this.create_delta_layer(&frozen_layer)
})
.await
.context("create_delta_layer spawn_blocking")
.and_then(|res| res)?;
let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?;
HashMap::from([(delta_path, metadata)])
};
@@ -2934,7 +2834,7 @@ impl Timeline {
// Write out the given frozen in-memory layer as a new L0 delta file
fn create_delta_layer(
self: &Arc<Self>,
&self,
frozen_layer: &InMemoryLayer,
) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> {
// Write it out
@@ -2950,13 +2850,10 @@ impl Timeline {
// TODO: If we're running inside 'flush_frozen_layers' and there are multiple
// files to flush, it might be better to first write them all, and then fsync
// them all in parallel.
// First sync the delta layer. We still use par_fsync here to keep everything consistent. Feel free to replace
// this with a single fsync in future refactors.
par_fsync::par_fsync(&[new_delta_path.clone()]).context("fsync of delta layer")?;
// Then sync the parent directory.
par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
.context("fsync of timeline dir")?;
par_fsync::par_fsync(&[
new_delta_path.clone(),
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
])?;
// Add it to the layer map
let l = Arc::new(new_delta);
@@ -2967,7 +2864,7 @@ impl Timeline {
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
batch_updates.insert_historic(l.layer_desc().clone(), l);
batch_updates.insert_historic(l);
batch_updates.flush();
// update the timeline's physical size
@@ -3117,7 +3014,6 @@ impl Timeline {
self.tenant_id,
&img_range,
lsn,
false, // image layer always covers the full range
)?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
@@ -3181,22 +3077,17 @@ impl Timeline {
let all_paths = image_layers
.iter()
.map(|layer| layer.path())
.chain(std::iter::once(
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
))
.collect::<Vec<_>>();
par_fsync::par_fsync_async(&all_paths)
.await
.context("fsync of newly created layer files")?;
par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
.await
.context("fsync of timeline dir")?;
par_fsync::par_fsync(&all_paths).context("fsync of newly created layer files")?;
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
let mut layers = self.layers.write().unwrap();
let mut updates = layers.batch_update();
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
for l in image_layers {
let path = l.filename();
let metadata = timeline_path
@@ -3215,7 +3106,7 @@ impl Timeline {
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
updates.insert_historic(l.layer_desc().clone(), l);
updates.insert_historic(l);
}
updates.flush();
drop(layers);
@@ -3255,9 +3146,9 @@ impl Timeline {
/// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
/// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
/// start of level0 files compaction, the on-demand download should be revisited as well.
fn compact_level0_phase1(
async fn compact_level0_phase1(
&self,
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
target_file_size: u64,
ctx: &RequestContext,
) -> Result<CompactLevel0Phase1Result, CompactionError> {
@@ -3570,13 +3461,13 @@ impl Timeline {
if !new_layers.is_empty() {
let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();
// also sync the directory
layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id));
// Fsync all the layer files and directory using multiple threads to
// minimize latency.
par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;
par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
.context("fsync of timeline dir")?;
layer_paths.pop().unwrap();
}
@@ -3593,26 +3484,17 @@ impl Timeline {
/// as Level 1 files.
///
async fn compact_level0(
self: &Arc<Self>,
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
&self,
layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
target_file_size: u64,
ctx: &RequestContext,
) -> Result<(), CompactionError> {
let this = self.clone();
let ctx_inner = ctx.clone();
let layer_removal_cs_inner = layer_removal_cs.clone();
let span = tracing::info_span!("blocking");
let CompactLevel0Phase1Result {
new_layers,
deltas_to_compact,
} = tokio::task::spawn_blocking(move || {
let _g = span.entered();
this.compact_level0_phase1(layer_removal_cs_inner, target_file_size, &ctx_inner)
})
.await
.context("compact_level0_phase1 spawn_blocking")
.map_err(CompactionError::Other)
.and_then(|res| res)?;
} = self
.compact_level0_phase1(layer_removal_cs, target_file_size, ctx)
.await?;
if new_layers.is_empty() && deltas_to_compact.is_empty() {
// nothing to do
@@ -3662,7 +3544,7 @@ impl Timeline {
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
updates.insert_historic(x.layer_desc().clone(), x);
updates.insert_historic(x);
}
// Now that we have reshuffled the data to set of new delta layers, we can
@@ -3670,7 +3552,7 @@ impl Timeline {
let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
for l in deltas_to_compact {
layer_names_to_delete.push(l.filename());
self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates)?;
self.delete_historic_layer(layer_removal_cs, l, &mut updates)?;
}
updates.flush();
drop(layers);
@@ -3785,16 +3667,15 @@ impl Timeline {
/// within a layer file. We can only remove the whole file if it's fully
/// obsolete.
///
pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
pub(super) async fn gc(&self, ctx: &RequestContext) -> anyhow::Result<GcResult> {
let timer = self.metrics.garbage_collect_histo.start_timer();
fail_point!("before-timeline-gc");
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
let layer_removal_cs = self.layer_removal_cs.lock().await;
// Is the timeline being deleted?
let state = *self.state.borrow();
if state == TimelineState::Stopping {
// there's a global allowed_error for this
anyhow::bail!("timeline is Stopping");
}
@@ -3811,11 +3692,12 @@ impl Timeline {
let res = self
.gc_timeline(
layer_removal_cs.clone(),
&layer_removal_cs,
horizon_cutoff,
pitr_cutoff,
retain_lsns,
new_gc_cutoff,
ctx,
)
.instrument(
info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff),
@@ -3830,11 +3712,12 @@ impl Timeline {
async fn gc_timeline(
&self,
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
horizon_cutoff: Lsn,
pitr_cutoff: Lsn,
retain_lsns: Vec<Lsn>,
new_gc_cutoff: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<GcResult> {
let now = SystemTime::now();
let mut result: GcResult = GcResult::default();
@@ -3881,6 +3764,15 @@ impl Timeline {
let mut layers_to_remove = Vec::new();
let mut wanted_image_layers = KeySpaceRandomAccum::default();
// Do not collect keyspace for Unit tests
let gc_keyspace = if ctx.task_kind() == TaskKind::GarbageCollector {
Some(
self.collect_keyspace(self.get_last_record_lsn(), ctx)
.await?,
)
} else {
None
};
// Scan all layers in the timeline (remote or on-disk).
//
@@ -3970,8 +3862,14 @@ impl Timeline {
// delta layers. Image layers can form "stairs" preventing old image from been deleted.
// But image layers are in any case less sparse than delta layers. Also we need some
// protection from replacing recent image layers with new one after each GC iteration.
if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&*l) {
wanted_image_layers.add_range(l.get_key_range());
if l.is_incremental() && !LayerMap::is_l0(&*l) {
if let Some(keyspace) = &gc_keyspace {
let layer_logical_size = keyspace.get_logical_size(&l.get_key_range());
let layer_age = new_gc_cutoff.0 - l.get_lsn_range().start.0;
if layer_logical_size <= layer_age {
wanted_image_layers.add_range(l.get_key_range());
}
}
}
result.layers_not_updated += 1;
continue 'outer;
@@ -4003,11 +3901,7 @@ impl Timeline {
{
for doomed_layer in layers_to_remove {
layer_names_to_delete.push(doomed_layer.filename());
self.delete_historic_layer(
layer_removal_cs.clone(),
doomed_layer,
&mut updates,
)?; // FIXME: schedule succeeded deletions before returning?
self.delete_historic_layer(layer_removal_cs, doomed_layer, &mut updates)?; // FIXME: schedule succeeded deletions before returning?
result.layers_removed += 1;
}
}
@@ -4179,7 +4073,7 @@ impl Timeline {
// Does retries + exponential back-off internally.
// When this fails, don't layer further retry attempts here.
let result = remote_client
.download_layer_file(&remote_layer.filename(), &remote_layer.layer_metadata)
.download_layer_file(&remote_layer.file_name, &remote_layer.layer_metadata)
.await;
if let Ok(size) = &result {
@@ -4197,7 +4091,7 @@ impl Timeline {
{
use crate::tenant::layer_map::Replacement;
let l: Arc<dyn PersistentLayer> = remote_layer.clone();
let failure = match updates.replace_historic(l.layer_desc().clone(), &l, new_layer.layer_desc().clone(), new_layer) {
let failure = match updates.replace_historic(&l, new_layer) {
Ok(Replacement::Replaced { .. }) => false,
Ok(Replacement::NotFound) => {
// TODO: the downloaded file should probably be removed, otherwise
@@ -4594,6 +4488,12 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
use utils::tracing_span_assert;
pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
tracing_span_assert::MultiNameExtractor<2>,
> = once_cell::sync::Lazy::new(|| {
tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"])
});
pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<
tracing_span_assert::MultiNameExtractor<2>,
> = once_cell::sync::Lazy::new(|| {
@@ -4601,7 +4501,7 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
});
match tracing_span_assert::check_fields_present([
&*super::TENANT_ID_EXTRACTOR,
&*TENANT_ID_EXTRACTOR,
&*TIMELINE_ID_EXTRACTOR,
]) {
Ok(()) => (),

View File

@@ -34,8 +34,6 @@ use crate::{
},
};
use utils::completion;
use super::Timeline;
#[derive(Default)]
@@ -49,12 +47,8 @@ pub struct EvictionTaskTenantState {
}
impl Timeline {
pub(super) fn launch_eviction_task(
self: &Arc<Self>,
background_tasks_can_start: Option<&completion::Barrier>,
) {
pub(super) fn launch_eviction_task(self: &Arc<Self>) {
let self_clone = Arc::clone(self);
let background_tasks_can_start = background_tasks_can_start.cloned();
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Eviction,
@@ -63,13 +57,7 @@ impl Timeline {
&format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
false,
async move {
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); }
_ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
};
self_clone.eviction_task(cancel).await;
self_clone.eviction_task(task_mgr::shutdown_token()).await;
info!("eviction task finishing");
Ok(())
},

View File

@@ -25,19 +25,20 @@ mod walreceiver_connection;
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::timeline::walreceiver::connection_manager::{
connection_manager_loop_step, ConnectionManagerState,
};
use anyhow::Context;
use std::future::Future;
use std::num::NonZeroU64;
use std::ops::ControlFlow;
use std::sync::Arc;
use std::sync::atomic::{self, AtomicBool};
use std::sync::{Arc, Weak};
use std::time::Duration;
use storage_broker::BrokerClientChannel;
use tokio::select;
use tokio::sync::watch;
use tokio::sync::{watch, RwLock};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -61,23 +62,46 @@ pub struct WalReceiverConf {
pub struct WalReceiver {
timeline: TenantTimelineId,
manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
timeline_ref: Weak<Timeline>,
conf: WalReceiverConf,
started: AtomicBool,
manager_status: Arc<RwLock<Option<ConnectionManagerStatus>>>,
}
impl WalReceiver {
pub fn start(
timeline: Arc<Timeline>,
pub fn new(
timeline: TenantTimelineId,
timeline_ref: Weak<Timeline>,
conf: WalReceiverConf,
mut broker_client: BrokerClientChannel,
ctx: &RequestContext,
) -> Self {
Self {
timeline,
timeline_ref,
conf,
started: AtomicBool::new(false),
manager_status: Arc::new(RwLock::new(None)),
}
}
pub fn start(
&self,
ctx: &RequestContext,
mut broker_client: BrokerClientChannel,
) -> anyhow::Result<()> {
if self.started.load(atomic::Ordering::Acquire) {
anyhow::bail!("Wal receiver is already started");
}
let timeline = self.timeline_ref.upgrade().with_context(|| {
format!("walreceiver start on a dropped timeline {}", self.timeline)
})?;
let tenant_id = timeline.tenant_id;
let timeline_id = timeline.timeline_id;
let walreceiver_ctx =
ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
let loop_status = Arc::new(std::sync::RwLock::new(None));
let manager_status = Arc::clone(&loop_status);
let wal_receiver_conf = self.conf.clone();
let loop_status = Arc::clone(&self.manager_status);
task_mgr::spawn(
WALRECEIVER_RUNTIME.handle(),
TaskKind::WalReceiverManager,
@@ -86,16 +110,15 @@ impl WalReceiver {
&format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
false,
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
debug!("WAL receiver manager started, connecting to broker");
info!("WAL receiver manager started, connecting to broker");
let mut connection_manager_state = ConnectionManagerState::new(
timeline,
conf,
wal_receiver_conf,
);
loop {
select! {
_ = task_mgr::shutdown_watcher() => {
trace!("WAL receiver shutdown requested, shutting down");
info!("WAL receiver shutdown requested, shutting down");
break;
},
loop_step_result = connection_manager_loop_step(
@@ -106,7 +129,7 @@ impl WalReceiver {
) => match loop_step_result {
ControlFlow::Continue(()) => continue,
ControlFlow::Break(()) => {
trace!("Connection manager loop ended, shutting down");
info!("Connection manager loop ended, shutting down");
break;
}
},
@@ -114,29 +137,29 @@ impl WalReceiver {
}
connection_manager_state.shutdown().await;
*loop_status.write().unwrap() = None;
*loop_status.write().await = None;
Ok(())
}
.instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id))
.instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id))
);
Self {
timeline: TenantTimelineId::new(tenant_id, timeline_id),
manager_status,
}
self.started.store(true, atomic::Ordering::Release);
Ok(())
}
pub async fn stop(self) {
pub async fn stop(&self) {
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.timeline.tenant_id),
Some(self.timeline.timeline_id),
)
.await;
self.started.store(false, atomic::Ordering::Release);
}
pub(super) fn status(&self) -> Option<ConnectionManagerStatus> {
self.manager_status.read().unwrap().clone()
pub(super) async fn status(&self) -> Option<ConnectionManagerStatus> {
self.manager_status.read().await.clone()
}
}
@@ -200,19 +223,29 @@ impl<E: Clone> TaskHandle<E> {
TaskEvent::End(match self.join_handle.as_mut() {
Some(jh) => {
if !jh.is_finished() {
// See: https://github.com/neondatabase/neon/issues/2885
trace!("sender is dropped while join handle is still alive");
// Barring any implementation errors in this module, we can
// only arrive here while the task that executes the future
// passed to `Self::spawn()` is still execution. Cf the comment
// in Self::spawn().
//
// This was logging at warning level in earlier versions, presumably
// to leave some breadcrumbs in case we had an implementation
// error that would would make us get stuck in `jh.await`.
//
// There hasn't been such a bug so far.
// But in a busy system, e.g., during pageserver restart,
// we arrive here often enough that the warning-level logs
// became a distraction.
// So, tone them down to info-level.
//
// XXX: rewrite this module to eliminate the race condition.
info!("sender is dropped while join handle is still alive");
}
let res = match jh.await {
Ok(res) => res,
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => {
// already logged
Ok(())
}
Err(je) => Err(anyhow::Error::new(je).context("join walreceiver task")),
};
let res = jh
.await
.map_err(|e| anyhow::anyhow!("Failed to join task: {e}"))
.and_then(|x| x);
// For cancellation-safety, drop join_handle only after successful .await.
self.join_handle = None;
@@ -235,12 +268,12 @@ impl<E: Clone> TaskHandle<E> {
match jh.await {
Ok(Ok(())) => debug!("Shutdown success"),
Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => {
// already logged
}
Err(je) => {
error!("Shutdown task join error: {je}")
Err(join_error) => {
if join_error.is_cancelled() {
error!("Shutdown task was cancelled");
} else {
error!("Shutdown task join error: {join_error}")
}
}
}
}

View File

@@ -18,7 +18,7 @@ use crate::metrics::{
WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
};
use crate::task_mgr::TaskKind;
use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
use crate::tenant::Timeline;
use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
use pageserver_api::models::TimelineState;
@@ -28,7 +28,8 @@ use storage_broker::proto::SubscribeSafekeeperInfoRequest;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use storage_broker::BrokerClientChannel;
use storage_broker::Streaming;
use tokio::select;
use tokio::sync::RwLock;
use tokio::{select, sync::watch};
use tracing::*;
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
@@ -47,19 +48,16 @@ pub(super) async fn connection_manager_loop_step(
broker_client: &mut BrokerClientChannel,
connection_manager_state: &mut ConnectionManagerState,
ctx: &RequestContext,
manager_status: &std::sync::RwLock<Option<ConnectionManagerStatus>>,
manager_status: &RwLock<Option<ConnectionManagerStatus>>,
) -> ControlFlow<(), ()> {
match connection_manager_state
let mut timeline_state_updates = connection_manager_state
.timeline
.wait_to_become_active(ctx)
.await
{
Ok(()) => {}
Err(new_state) => {
debug!(
?new_state,
"state changed, stopping wal connection manager loop"
);
.subscribe_for_state_updates();
match wait_for_active_timeline(&mut timeline_state_updates).await {
ControlFlow::Continue(()) => {}
ControlFlow::Break(()) => {
info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop");
return ControlFlow::Break(());
}
}
@@ -74,15 +72,11 @@ pub(super) async fn connection_manager_loop_step(
timeline_id: connection_manager_state.timeline.timeline_id,
};
let mut timeline_state_updates = connection_manager_state
.timeline
.subscribe_for_state_updates();
// Subscribe to the broker updates. Stream shares underlying TCP connection
// with other streams on this client (other connection managers). When
// object goes out of scope, stream finishes in drop() automatically.
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
debug!("Subscribed for broker timeline updates");
info!("Subscribed for broker timeline updates");
loop {
let time_until_next_retry = connection_manager_state.time_until_next_retry();
@@ -154,12 +148,12 @@ pub(super) async fn connection_manager_loop_step(
// we're already active as walreceiver, no need to reactivate
TimelineState::Active => continue,
TimelineState::Broken | TimelineState::Stopping => {
debug!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
return ControlFlow::Break(());
}
TimelineState::Loading => {
warn!("timeline transitioned back to Loading state, that should not happen");
return ControlFlow::Continue(());
return ControlFlow::Continue(new_state);
}
}
}
@@ -167,11 +161,12 @@ pub(super) async fn connection_manager_loop_step(
}
}
} => match new_event {
ControlFlow::Continue(()) => {
ControlFlow::Continue(new_state) => {
info!("observed timeline state change, new state is {new_state:?}");
return ControlFlow::Continue(());
}
ControlFlow::Break(()) => {
debug!("Timeline is no longer active, stopping wal connection manager loop");
info!("Timeline dropped state updates sender, stopping wal connection manager loop");
return ControlFlow::Break(());
}
},
@@ -196,7 +191,35 @@ pub(super) async fn connection_manager_loop_step(
.change_connection(new_candidate, ctx)
.await
}
*manager_status.write().unwrap() = Some(connection_manager_state.manager_status());
*manager_status.write().await = Some(connection_manager_state.manager_status());
}
}
async fn wait_for_active_timeline(
timeline_state_updates: &mut watch::Receiver<TimelineState>,
) -> ControlFlow<(), ()> {
let current_state = *timeline_state_updates.borrow();
if current_state == TimelineState::Active {
return ControlFlow::Continue(());
}
loop {
match timeline_state_updates.changed().await {
Ok(()) => {
let new_state = *timeline_state_updates.borrow();
match new_state {
TimelineState::Active => {
debug!("Timeline state changed to active, continuing the walreceiver connection manager");
return ControlFlow::Continue(());
}
state => {
debug!("Not running the walreceiver connection manager, timeline is not active: {state:?}");
continue;
}
}
}
Err(_sender_dropped_error) => return ControlFlow::Break(()),
}
}
}
@@ -392,6 +415,7 @@ impl ConnectionManagerState {
self.drop_old_connection(true).await;
let id = self.id;
let node_id = new_sk.safekeeper_id;
let connect_timeout = self.conf.wal_connect_timeout;
let timeline = Arc::clone(&self.timeline);
@@ -399,13 +423,9 @@ impl ConnectionManagerState {
TaskKind::WalReceiverConnectionHandler,
DownloadBehavior::Download,
);
let span = info_span!("connection", %node_id);
let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
let res = super::walreceiver_connection::handle_walreceiver_connection(
super::walreceiver_connection::handle_walreceiver_connection(
timeline,
new_sk.wal_source_connconf,
events_sender,
@@ -414,23 +434,12 @@ impl ConnectionManagerState {
ctx,
node_id,
)
.await;
match res {
Ok(()) => Ok(()),
Err(e) => {
use super::walreceiver_connection::ExpectedError;
if e.is_expected() {
info!("walreceiver connection handling ended: {e:#}");
Ok(())
} else {
// give out an error to have task_mgr give it a really verbose logging
Err(e).context("walreceiver connection handling failure")
}
}
}
.await
.context("walreceiver connection handling failure")
}
.instrument(span)
.instrument(
info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id),
)
});
let now = Utc::now().naive_utc();
@@ -1324,8 +1333,9 @@ mod tests {
async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState {
let (tenant, ctx) = harness.load().await;
let timeline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
.create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
.expect("Failed to create an empty timeline for dummy wal connection manager");
let timeline = timeline.initialize(&ctx).unwrap();
ConnectionManagerState {
id: TenantTimelineId {

View File

@@ -21,16 +21,16 @@ use postgres_types::PgLsn;
use tokio::{select, sync::watch, time};
use tokio_postgres::{replication::ReplicationStream, Client};
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, trace, warn, Instrument};
use tracing::{debug, error, info, trace, warn};
use super::TaskStateUpdate;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS};
use crate::{
context::RequestContext,
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
task_mgr,
task_mgr::TaskKind,
task_mgr::WALRECEIVER_RUNTIME,
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
tenant::{Timeline, WalReceiverInfo},
walingest::WalIngest,
walrecord::DecodedWALRecord,
};
@@ -81,8 +81,13 @@ pub(super) async fn handle_walreceiver_connection(
config.application_name("pageserver");
config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
Ok(client_and_conn) => client_and_conn?,
Err(_elapsed) => {
Ok(Ok(client_and_conn)) => client_and_conn,
Ok(Err(conn_err)) => {
let expected_error = ignore_expected_errors(conn_err)?;
info!("DB connection stream finished: {expected_error}");
return Ok(());
}
Err(_) => {
// Timing out to connect to a safekeeper node could happen long time, due to
// many reasons that pageserver cannot control.
// Do not produce an error, but make it visible, that timeouts happen by logging the `event.
@@ -92,7 +97,7 @@ pub(super) async fn handle_walreceiver_connection(
}
};
debug!("connected!");
info!("connected!");
let mut connection_status = WalConnectionStatus {
is_connected: true,
has_processed_wal: false,
@@ -122,25 +127,20 @@ pub(super) async fn handle_walreceiver_connection(
"walreceiver connection",
false,
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
select! {
connection_result = connection => match connection_result {
Ok(()) => debug!("Walreceiver db connection closed"),
Ok(()) => info!("Walreceiver db connection closed"),
Err(connection_error) => {
if connection_error.is_expected() {
// silence, because most likely we've already exited the outer call
// with a similar error.
} else {
warn!("Connection aborted: {connection_error:#}")
if let Err(e) = ignore_expected_errors(connection_error) {
warn!("Connection aborted: {e:#}")
}
}
},
_ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
// Future: replace connection_cancellation with connection_ctx cancellation
_ = connection_cancellation.cancelled() => info!("Connection cancelled"),
}
Ok(())
}
.instrument(tracing::info_span!("poller")),
},
);
// Immediately increment the gauge, then create a job to decrement it on task exit.
@@ -203,13 +203,20 @@ pub(super) async fn handle_walreceiver_connection(
while let Some(replication_message) = {
select! {
_ = cancellation.cancelled() => {
debug!("walreceiver interrupted");
info!("walreceiver interrupted");
None
}
replication_message = physical_stream.next() => replication_message,
}
} {
let replication_message = replication_message?;
let replication_message = match replication_message {
Ok(message) => message,
Err(replication_error) => {
let expected_error = ignore_expected_errors(replication_error)?;
info!("Replication stream finished: {expected_error}");
return Ok(());
}
};
let now = Utc::now().naive_utc();
let last_rec_lsn_before_msg = last_rec_lsn;
@@ -254,6 +261,8 @@ pub(super) async fn handle_walreceiver_connection(
let mut decoded = DecodedWALRecord::default();
let mut modification = timeline.begin_modification(endlsn);
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
// let _enter = info_span!("processing record", lsn = %lsn).entered();
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hitting a deadlock.
@@ -412,50 +421,31 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
}
}
/// Trait for avoid reporting walreceiver specific expected or "normal" or "ok" errors.
pub(super) trait ExpectedError {
/// Test if this error is an ok error.
///
/// We don't want to report connectivity problems as real errors towards connection manager because
/// 1. they happen frequently enough to make server logs hard to read and
/// 2. the connection manager can retry other safekeeper.
///
/// If this function returns `true`, it's such an error.
/// The caller should log it at info level and then report to connection manager that we're done handling this connection.
/// Connection manager will then handle reconnections.
///
/// If this function returns an `false` the error should be propagated and the connection manager
/// will log the error at ERROR level.
fn is_expected(&self) -> bool;
}
impl ExpectedError for postgres::Error {
fn is_expected(&self) -> bool {
self.is_closed()
|| self
.source()
.and_then(|source| source.downcast_ref::<std::io::Error>())
.map(is_expected_io_error)
.unwrap_or(false)
|| self
.as_db_error()
.filter(|db_error| {
db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
&& db_error.message().contains("ending streaming")
})
.is_some()
}
}
impl ExpectedError for anyhow::Error {
fn is_expected(&self) -> bool {
let head = self.downcast_ref::<postgres::Error>();
let tail = self
.chain()
.filter_map(|e| e.downcast_ref::<postgres::Error>());
// check if self or any of the chained/sourced errors are expected
head.into_iter().chain(tail).any(|e| e.is_expected())
/// We don't want to report connectivity problems as real errors towards connection manager because
/// 1. they happen frequently enough to make server logs hard to read and
/// 2. the connection manager can retry other safekeeper.
///
/// If this function returns `Ok(pg_error)`, it's such an error.
/// The caller should log it at info level and then report to connection manager that we're done handling this connection.
/// Connection manager will then handle reconnections.
///
/// If this function returns an `Err()`, the caller can bubble it up using `?`.
/// The connection manager will log the error at ERROR level.
fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result<postgres::Error> {
if pg_error.is_closed()
|| pg_error
.source()
.and_then(|source| source.downcast_ref::<std::io::Error>())
.map(is_expected_io_error)
.unwrap_or(false)
{
return Ok(pg_error);
} else if let Some(db_error) = pg_error.as_db_error() {
if db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
&& db_error.message().contains("ending streaming")
{
return Ok(pg_error);
}
}
Err(pg_error).context("connection error")
}

View File

@@ -379,6 +379,17 @@ impl XlXactParsedRecord {
});
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
let nmsgs = buf.get_i32_le();
for _i in 0..nmsgs {
let sizeof_shared_invalidation_message = 0;
buf.advance(sizeof_shared_invalidation_message);
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
xid = buf.get_u32_le();
trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
}
if xinfo & postgres_ffi::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 {
let nitems = buf.get_i32_le();
@@ -386,23 +397,7 @@ impl XlXactParsedRecord {
"XLOG_XACT_COMMIT-XACT_XINFO_HAS_DROPPED_STAT nitems {}",
nitems
);
let sizeof_xl_xact_stats_item = 12;
buf.advance((nitems * sizeof_xl_xact_stats_item).try_into().unwrap());
}
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
let nmsgs = buf.get_i32_le();
let sizeof_shared_invalidation_message = 16;
buf.advance(
(nmsgs * sizeof_shared_invalidation_message)
.try_into()
.unwrap(),
);
}
if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
xid = buf.get_u32_le();
debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid);
//FIXME: do we need to handle dropped stats here?
}
XlXactParsedRecord {

View File

@@ -1,26 +0,0 @@
EXTENSION = hnsw
EXTVERSION = 0.1.0
MODULE_big = hnsw
DATA = $(wildcard *--*.sql)
OBJS = hnsw.o hnswalg.o
TESTS = $(wildcard test/sql/*.sql)
REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS))
REGRESS_OPTS = --inputdir=test --load-extension=hnsw
# For auto-vectorization:
# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html
PG_CFLAGS += -O3
PG_CXXFLAGS += -O3 -std=c++11
PG_LDFLAGS += -lstdc++
all: $(EXTENSION)--$(EXTVERSION).sql
PG_CONFIG ?= pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
dist:
mkdir -p dist
git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master

View File

@@ -1,25 +0,0 @@
# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors
This ANN extension of Postgres is based
on [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw),
the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper:
[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html),
<br>
Dmitry Baranchuk, Artem Babenko, Yury Malkov
# Postgres extension
HNSW index is hold in memory (built on demand) and it's maxial size is limited
by `maxelements` index parameter. Another required parameter is nubmer of dimensions (if it is not specified in column type).
Optional parameter `ef` specifies number of neighbors which are considered during index construction and search (corresponds `efConstruction` and `efSearch` parameters
described in the article).
# Example of usage:
```
create extension hnsw;
create table embeddings(id integer primary key, payload real[]);
create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32);
select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100;
```

View File

@@ -1,29 +0,0 @@
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION hnsw" to load this file. \quit
-- functions
CREATE FUNCTION l2_distance(real[], real[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
-- operators
CREATE OPERATOR <-> (
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance,
COMMUTATOR = '<->'
);
-- access method
CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler
AS 'MODULE_PATHNAME' LANGUAGE C;
CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler;
COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method';
-- opclasses
CREATE OPERATOR CLASS knn_ops
DEFAULT FOR TYPE real[] USING hnsw AS
OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops;

View File

@@ -1,551 +0,0 @@
#include "postgres.h"
#include "access/amapi.h"
#include "access/generic_xlog.h"
#include "access/relation.h"
#include "access/reloptions.h"
#include "access/tableam.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
#include "nodes/execnodes.h"
#include "storage/bufmgr.h"
#include "utils/guc.h"
#include "utils/selfuncs.h"
#include <math.h>
#include <float.h>
#include "hnsw.h"
PG_MODULE_MAGIC;
typedef struct {
int32 vl_len_; /* varlena header (do not touch directly!) */
int dims;
int maxelements;
int efConstruction;
int efSearch;
int M;
} HnswOptions;
static relopt_kind hnsw_relopt_kind;
typedef struct {
HierarchicalNSW* hnsw;
size_t curr;
size_t n_results;
ItemPointer results;
} HnswScanOpaqueData;
typedef HnswScanOpaqueData* HnswScanOpaque;
typedef struct {
Oid relid;
uint32 status;
HierarchicalNSW* hnsw;
} HnswHashEntry;
#define SH_PREFIX hnsw_index
#define SH_ELEMENT_TYPE HnswHashEntry
#define SH_KEY_TYPE Oid
#define SH_KEY relid
#define SH_STORE_HASH
#define SH_GET_HASH(tb, a) ((a)->relid)
#define SH_HASH_KEY(tb, key) (key)
#define SH_EQUAL(tb, a, b) ((a) == (b))
#define SH_SCOPE static inline
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"
#define INDEX_HASH_SIZE 11
#define DEFAULT_EF_SEARCH 64
PGDLLEXPORT void _PG_init(void);
static hnsw_index_hash *hnsw_indexes;
/*
* Initialize index options and variables
*/
void
_PG_init(void)
{
hnsw_relopt_kind = add_reloption_kind();
add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions",
0, 0, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements",
0, 0, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex",
100, 0, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction",
16, 1, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search",
64, 1, INT_MAX, AccessExclusiveLock);
hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL);
}
static void
hnsw_build_callback(Relation index, ItemPointer tid, Datum *values,
bool *isnull, bool tupleIsAlive, void *state)
{
HierarchicalNSW* hnsw = (HierarchicalNSW*) state;
ArrayType* array;
int n_items;
label_t label = 0;
/* Skip nulls */
if (isnull[0])
return;
array = DatumGetArrayTypeP(values[0]);
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
if (n_items != hnsw_dimensions(hnsw))
{
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
n_items, hnsw_dimensions(hnsw));
}
memcpy(&label, tid, sizeof(*tid));
hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label);
}
static void
hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel)
{
IndexInfo* indexInfo = BuildIndexInfo(indexRel);
Assert(indexInfo->ii_NumIndexAttrs == 1);
table_index_build_scan(heapRel, indexRel, indexInfo,
true, true, hnsw_build_callback, (void *) hnsw, NULL);
}
static HierarchicalNSW*
hnsw_get_index(Relation indexRel, Relation heapRel)
{
HierarchicalNSW* hnsw;
Oid indexoid = RelationGetRelid(indexRel);
HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid);
if (entry == NULL)
{
size_t dims, maxelements;
size_t M;
size_t maxM;
size_t size_links_level0;
size_t size_data_per_element;
size_t data_size;
dsm_handle handle = indexoid << 1; /* make it even */
void* impl_private = NULL;
void* mapped_address = NULL;
Size mapped_size = 0;
Size shmem_size;
bool exists = true;
bool found;
HnswOptions *opts = (HnswOptions *) indexRel->rd_options;
if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) {
elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified");
}
dims = opts->dims;
maxelements = opts->maxelements;
M = opts->M;
maxM = M * 2;
data_size = dims * sizeof(coord_t);
size_links_level0 = (maxM + 1) * sizeof(idx_t);
size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
shmem_size = hnsw_sizeof() + maxelements * size_data_per_element;
/* first try to attach to existed index */
if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
&mapped_address, &mapped_size, DEBUG1))
{
/* index doesn't exists: try to create it */
if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private,
&mapped_address, &mapped_size, DEBUG1))
{
/* We can do it under shared lock, so some other backend may
* try to initialize index. If create is failed because index already
* created by somebody else, then try to attach to it once again
*/
if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
&mapped_address, &mapped_size, ERROR))
{
return NULL;
}
}
else
{
exists = false;
}
}
Assert(mapped_size == shmem_size);
hnsw = (HierarchicalNSW*)mapped_address;
if (!exists)
{
hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction);
hnsw_populate(hnsw, indexRel, heapRel);
}
entry = hnsw_index_insert(hnsw_indexes, indexoid, &found);
Assert(!found);
entry->hnsw = hnsw;
}
else
{
hnsw = entry->hnsw;
}
return hnsw;
}
/*
* Start or restart an index scan
*/
static IndexScanDesc
hnsw_beginscan(Relation index, int nkeys, int norderbys)
{
IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys);
HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData));
Relation heap = relation_open(index->rd_index->indrelid, NoLock);
so->hnsw = hnsw_get_index(index, heap);
relation_close(heap, NoLock);
so->curr = 0;
so->n_results = 0;
so->results = NULL;
scan->opaque = so;
return scan;
}
/*
* Start or restart an index scan
*/
static void
hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys)
{
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
if (so->results)
{
pfree(so->results);
so->results = NULL;
}
so->curr = 0;
if (orderbys && scan->numberOfOrderBys > 0)
memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData));
}
/*
* Fetch the next tuple in the given scan
*/
static bool
hnsw_gettuple(IndexScanDesc scan, ScanDirection dir)
{
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
/*
* Index can be used to scan backward, but Postgres doesn't support
* backward scan on operators
*/
Assert(ScanDirectionIsForward(dir));
if (so->curr == 0)
{
Datum value;
ArrayType* array;
int n_items;
size_t n_results;
label_t* results;
HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options;
size_t efSearch = opts ? opts->efSearch : DEFAULT_EF_SEARCH;
/* Safety check */
if (scan->orderByData == NULL)
elog(ERROR, "cannot scan HNSW index without order");
/* No items will match if null */
if (scan->orderByData->sk_flags & SK_ISNULL)
return false;
value = scan->orderByData->sk_argument;
array = DatumGetArrayTypeP(value);
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
if (n_items != hnsw_dimensions(so->hnsw))
{
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
n_items, hnsw_dimensions(so->hnsw));
}
if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results))
elog(ERROR, "HNSW index search failed");
so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData));
so->n_results = n_results;
for (size_t i = 0; i < n_results; i++)
{
memcpy(&so->results[i], &results[i], sizeof(so->results[i]));
}
free(results);
}
if (so->curr >= so->n_results)
{
return false;
}
else
{
scan->xs_heaptid = so->results[so->curr++];
scan->xs_recheckorderby = false;
return true;
}
}
/*
* End a scan and release resources
*/
static void
hnsw_endscan(IndexScanDesc scan)
{
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
if (so->results)
pfree(so->results);
pfree(so);
scan->opaque = NULL;
}
/*
* Estimate the cost of an index scan
*/
static void
hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count,
Cost *indexStartupCost, Cost *indexTotalCost,
Selectivity *indexSelectivity, double *indexCorrelation
,double *indexPages
)
{
GenericCosts costs;
/* Never use index without order */
if (path->indexorderbys == NULL)
{
*indexStartupCost = DBL_MAX;
*indexTotalCost = DBL_MAX;
*indexSelectivity = 0;
*indexCorrelation = 0;
*indexPages = 0;
return;
}
MemSet(&costs, 0, sizeof(costs));
genericcostestimate(root, path, loop_count, &costs);
/* Startup cost and total cost are same */
*indexStartupCost = costs.indexTotalCost;
*indexTotalCost = costs.indexTotalCost;
*indexSelectivity = costs.indexSelectivity;
*indexCorrelation = costs.indexCorrelation;
*indexPages = costs.numIndexPages;
}
/*
* Parse and validate the reloptions
*/
static bytea *
hnsw_options(Datum reloptions, bool validate)
{
static const relopt_parse_elt tab[] = {
{"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)},
{"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)},
{"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)},
{"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)},
{"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)}
};
return (bytea *) build_reloptions(reloptions, validate,
hnsw_relopt_kind,
sizeof(HnswOptions),
tab, lengthof(tab));
}
/*
* Validate catalog entries for the specified operator class
*/
static bool
hnsw_validate(Oid opclassoid)
{
return true;
}
/*
* Build the index for a logged table
*/
static IndexBuildResult *
hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo)
{
HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
result->heap_tuples = result->index_tuples = hnsw_count(hnsw);
return result;
}
/*
* Insert a tuple into the index
*/
static bool
hnsw_insert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid,
Relation heap, IndexUniqueCheck checkUnique,
bool indexUnchanged,
IndexInfo *indexInfo)
{
HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
Datum value;
ArrayType* array;
int n_items;
label_t label = 0;
/* Skip nulls */
if (isnull[0])
return false;
/* Detoast value */
value = PointerGetDatum(PG_DETOAST_DATUM(values[0]));
array = DatumGetArrayTypeP(value);
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
if (n_items != hnsw_dimensions(hnsw))
{
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
n_items, hnsw_dimensions(hnsw));
}
memcpy(&label, heap_tid, sizeof(*heap_tid));
if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label))
elog(ERROR, "HNSW index insert failed");
return true;
}
/*
* Build the index for an unlogged table
*/
static void
hnsw_buildempty(Relation index)
{
/* index will be constructed on dema nd when accessed */
}
/*
* Clean up after a VACUUM operation
*/
static IndexBulkDeleteResult *
hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
Relation rel = info->index;
if (stats == NULL)
return NULL;
stats->num_pages = RelationGetNumberOfBlocks(rel);
return stats;
}
/*
* Bulk delete tuples from the index
*/
static IndexBulkDeleteResult *
hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state)
{
if (stats == NULL)
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
return stats;
}
/*
* Define index handler
*
* See https://www.postgresql.org/docs/current/index-api.html
*/
PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler);
Datum
hnsw_handler(PG_FUNCTION_ARGS)
{
IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
amroutine->amstrategies = 0;
amroutine->amsupport = 0;
amroutine->amoptsprocnum = 0;
amroutine->amcanorder = false;
amroutine->amcanorderbyop = true;
amroutine->amcanbackward = false; /* can change direction mid-scan */
amroutine->amcanunique = false;
amroutine->amcanmulticol = false;
amroutine->amoptionalkey = true;
amroutine->amsearcharray = false;
amroutine->amsearchnulls = false;
amroutine->amstorage = false;
amroutine->amclusterable = false;
amroutine->ampredlocks = false;
amroutine->amcanparallel = false;
amroutine->amcaninclude = false;
amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */
amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL;
amroutine->amkeytype = InvalidOid;
/* Interface functions */
amroutine->ambuild = hnsw_build;
amroutine->ambuildempty = hnsw_buildempty;
amroutine->aminsert = hnsw_insert;
amroutine->ambulkdelete = hnsw_bulkdelete;
amroutine->amvacuumcleanup = hnsw_vacuumcleanup;
amroutine->amcanreturn = NULL; /* tuple not included in heapsort */
amroutine->amcostestimate = hnsw_costestimate;
amroutine->amoptions = hnsw_options;
amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */
amroutine->ambuildphasename = NULL;
amroutine->amvalidate = hnsw_validate;
amroutine->amadjustmembers = NULL;
amroutine->ambeginscan = hnsw_beginscan;
amroutine->amrescan = hnsw_rescan;
amroutine->amgettuple = hnsw_gettuple;
amroutine->amgetbitmap = NULL;
amroutine->amendscan = hnsw_endscan;
amroutine->ammarkpos = NULL;
amroutine->amrestrpos = NULL;
/* Interface functions to support parallel index scans */
amroutine->amestimateparallelscan = NULL;
amroutine->aminitparallelscan = NULL;
amroutine->amparallelrescan = NULL;
PG_RETURN_POINTER(amroutine);
}
/*
* Get the L2 distance between vectors
*/
PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance);
Datum
l2_distance(PG_FUNCTION_ARGS)
{
ArrayType *a = PG_GETARG_ARRAYTYPE_P(0);
ArrayType *b = PG_GETARG_ARRAYTYPE_P(1);
int a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a));
int b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b));
dist_t distance = 0.0;
dist_t diff;
coord_t *ax = (coord_t*)ARR_DATA_PTR(a);
coord_t *bx = (coord_t*)ARR_DATA_PTR(b);
if (a_dim != b_dim)
{
ereport(ERROR,
(errcode(ERRCODE_DATA_EXCEPTION),
errmsg("different array dimensions %d and %d", a_dim, b_dim)));
}
for (int i = 0; i < a_dim; i++)
{
diff = ax[i] - bx[i];
distance += diff * diff;
}
PG_RETURN_FLOAT4((dist_t)sqrt(distance));
}

View File

@@ -1,5 +0,0 @@
comment = 'hNsw index'
default_version = '0.1.0'
module_pathname = '$libdir/hnsw'
relocatable = true
trusted = true

View File

@@ -1,15 +0,0 @@
#pragma once
typedef float coord_t;
typedef float dist_t;
typedef uint32_t idx_t;
typedef uint64_t label_t;
typedef struct HierarchicalNSW HierarchicalNSW;
bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results);
bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label);
void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
int hnsw_dimensions(HierarchicalNSW* hnsw);
size_t hnsw_count(HierarchicalNSW* hnsw);
size_t hnsw_sizeof(void);

View File

@@ -1,379 +0,0 @@
#include "hnswalg.h"
#if defined(__GNUC__)
#define PORTABLE_ALIGN32 __attribute__((aligned(32)))
#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint)
#else
#define PORTABLE_ALIGN32 __declspec(align(32))
#define PREFETCH(addr,hint)
#endif
HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_)
{
dim = dim_;
data_size = dim * sizeof(coord_t);
efConstruction = efConstruction_;
maxelements = maxelements_;
M = M_;
maxM = maxM_;
size_links_level0 = (maxM + 1) * sizeof(idx_t);
size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
offset_data = size_links_level0;
offset_label = offset_data + data_size;
enterpoint_node = 0;
cur_element_count = 0;
#ifdef __x86_64__
use_avx2 = __builtin_cpu_supports("avx2");
#endif
}
std::priority_queue<std::pair<dist_t, idx_t>> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef)
{
std::vector<uint32_t> visited;
visited.resize((cur_element_count + 31) >> 5);
std::priority_queue<std::pair<dist_t, idx_t >> topResults;
std::priority_queue<std::pair<dist_t, idx_t >> candidateSet;
dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node));
topResults.emplace(dist, enterpoint_node);
candidateSet.emplace(-dist, enterpoint_node);
visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31);
dist_t lowerBound = dist;
while (!candidateSet.empty())
{
std::pair<dist_t, idx_t> curr_el_pair = candidateSet.top();
if (-curr_el_pair.first > lowerBound)
break;
candidateSet.pop();
idx_t curNodeNum = curr_el_pair.second;
idx_t* data = get_linklist0(curNodeNum);
size_t size = *data++;
PREFETCH(getDataByInternalId(*data), 0);
for (size_t j = 0; j < size; ++j) {
size_t tnum = *(data + j);
PREFETCH(getDataByInternalId(*(data + j + 1)), 0);
if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) {
visited[tnum >> 5] |= 1 << (tnum & 31);
dist = fstdistfunc(point, getDataByInternalId(tnum));
if (topResults.top().first > dist || topResults.size() < ef) {
candidateSet.emplace(-dist, tnum);
PREFETCH(get_linklist0(candidateSet.top().second), 0);
topResults.emplace(dist, tnum);
if (topResults.size() > ef)
topResults.pop();
lowerBound = topResults.top().first;
}
}
}
}
return topResults;
}
void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN)
{
if (topResults.size() < NN)
return;
std::priority_queue<std::pair<dist_t, idx_t>> resultSet;
std::vector<std::pair<dist_t, idx_t>> returnlist;
while (topResults.size() > 0) {
resultSet.emplace(-topResults.top().first, topResults.top().second);
topResults.pop();
}
while (resultSet.size()) {
if (returnlist.size() >= NN)
break;
std::pair<dist_t, idx_t> curen = resultSet.top();
dist_t dist_to_query = -curen.first;
resultSet.pop();
bool good = true;
for (std::pair<dist_t, idx_t> curen2 : returnlist) {
dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second),
getDataByInternalId(curen.second));
if (curdist < dist_to_query) {
good = false;
break;
}
}
if (good) returnlist.push_back(curen);
}
for (std::pair<dist_t, idx_t> elem : returnlist)
topResults.emplace(-elem.first, elem.second);
}
void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c,
std::priority_queue<std::pair<dist_t, idx_t>> topResults)
{
getNeighborsByHeuristic(topResults, M);
std::vector<idx_t> res;
res.reserve(M);
while (topResults.size() > 0) {
res.push_back(topResults.top().second);
topResults.pop();
}
{
idx_t* data = get_linklist0(cur_c);
if (*data)
throw std::runtime_error("Should be blank");
*data++ = res.size();
for (size_t idx = 0; idx < res.size(); idx++) {
if (data[idx])
throw std::runtime_error("Should be blank");
data[idx] = res[idx];
}
}
for (size_t idx = 0; idx < res.size(); idx++) {
if (res[idx] == cur_c)
throw std::runtime_error("Connection to the same element");
size_t resMmax = maxM;
idx_t *ll_other = get_linklist0(res[idx]);
idx_t sz_link_list_other = *ll_other;
if (sz_link_list_other > resMmax || sz_link_list_other < 0)
throw std::runtime_error("Bad sz_link_list_other");
if (sz_link_list_other < resMmax) {
idx_t *data = ll_other + 1;
data[sz_link_list_other] = cur_c;
*ll_other = sz_link_list_other + 1;
} else {
// finding the "weakest" element to replace it with the new one
idx_t *data = ll_other + 1;
dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx]));
// Heuristic:
std::priority_queue<std::pair<dist_t, idx_t>> candidates;
candidates.emplace(d_max, cur_c);
for (size_t j = 0; j < sz_link_list_other; j++)
candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]);
getNeighborsByHeuristic(candidates, resMmax);
size_t indx = 0;
while (!candidates.empty()) {
data[indx] = candidates.top().second;
candidates.pop();
indx++;
}
*ll_other = indx;
}
}
}
void HierarchicalNSW::addPoint(const coord_t *point, label_t label)
{
if (cur_element_count >= maxelements) {
throw std::runtime_error("The number of elements exceeds the specified limit");
}
idx_t cur_c = cur_element_count++;
memset((char *) get_linklist0(cur_c), 0, size_data_per_element);
memcpy(getDataByInternalId(cur_c), point, data_size);
memcpy(getExternalLabel(cur_c), &label, sizeof label);
// Do nothing for the first element
if (cur_c != 0) {
std::priority_queue <std::pair<dist_t, idx_t>> topResults = searchBaseLayer(point, efConstruction);
mutuallyConnectNewElement(point, cur_c, topResults);
}
};
std::priority_queue<std::pair<dist_t, label_t>> HierarchicalNSW::searchKnn(const coord_t *query, size_t k)
{
std::priority_queue<std::pair<dist_t, label_t>> topResults;
auto topCandidates = searchBaseLayer(query, k);
while (topCandidates.size() > k) {
topCandidates.pop();
}
while (!topCandidates.empty()) {
std::pair<dist_t, idx_t> rez = topCandidates.top();
label_t label;
memcpy(&label, getExternalLabel(rez.second), sizeof(label));
topResults.push(std::pair<dist_t, label_t>(rez.first, label));
topCandidates.pop();
}
return topResults;
};
dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n)
{
dist_t distance = 0.0;
for (size_t i = 0; i < n; i++)
{
dist_t diff = x[i] - y[i];
distance += diff * diff;
}
return distance;
}
#ifdef __x86_64__
#include <immintrin.h>
__attribute__((target("avx2")))
dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n)
{
const size_t TmpResSz = sizeof(__m256) / sizeof(float);
float PORTABLE_ALIGN32 TmpRes[TmpResSz];
size_t qty16 = n / 16;
const float *pEnd1 = x + (qty16 * 16);
__m256 diff, v1, v2;
__m256 sum = _mm256_set1_ps(0);
while (x < pEnd1) {
v1 = _mm256_loadu_ps(x);
x += 8;
v2 = _mm256_loadu_ps(y);
y += 8;
diff = _mm256_sub_ps(v1, v2);
sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
v1 = _mm256_loadu_ps(x);
x += 8;
v2 = _mm256_loadu_ps(y);
y += 8;
diff = _mm256_sub_ps(v1, v2);
sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
}
_mm256_store_ps(TmpRes, sum);
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
return (res);
}
dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n)
{
const size_t TmpResSz = sizeof(__m128) / sizeof(float);
float PORTABLE_ALIGN32 TmpRes[TmpResSz];
size_t qty16 = n / 16;
const float *pEnd1 = x + (qty16 * 16);
__m128 diff, v1, v2;
__m128 sum = _mm_set1_ps(0);
while (x < pEnd1) {
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
}
_mm_store_ps(TmpRes, sum);
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
return res;
}
#endif
dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y)
{
#ifndef __x86_64__
return fstdistfunc_scalar(x, y, dim);
#else
if(use_avx2)
return fstdistfunc_avx2(x, y, dim);
return fstdistfunc_sse(x, y, dim);
#endif
}
bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results)
{
try
{
auto result = hnsw->searchKnn(point, efSearch);
size_t nResults = result.size();
*results = (label_t*)malloc(nResults*sizeof(label_t));
for (size_t i = nResults; i-- != 0;)
{
(*results)[i] = result.top().second;
result.pop();
}
*n_results = nResults;
return true;
}
catch (std::exception& x)
{
return false;
}
}
bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label)
{
try
{
hnsw->addPoint(point, label);
return true;
}
catch (std::exception& x)
{
fprintf(stderr, "Catch %s\n", x.what());
return false;
}
}
void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction)
{
new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction);
}
int hnsw_dimensions(HierarchicalNSW* hnsw)
{
return (int)hnsw->dim;
}
size_t hnsw_count(HierarchicalNSW* hnsw)
{
return hnsw->cur_element_count;
}
size_t hnsw_sizeof(void)
{
return sizeof(HierarchicalNSW);
}

View File

@@ -1,69 +0,0 @@
#pragma once
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <unordered_map>
#include <unordered_set>
#include <map>
#include <cmath>
#include <queue>
#include <stdexcept>
extern "C" {
#include "hnsw.h"
}
struct HierarchicalNSW
{
size_t maxelements;
size_t cur_element_count;
idx_t enterpoint_node;
size_t dim;
size_t data_size;
size_t offset_data;
size_t offset_label;
size_t size_data_per_element;
size_t M;
size_t maxM;
size_t size_links_level0;
size_t efConstruction;
#ifdef __x86_64__
bool use_avx2;
#endif
char data_level0_memory[0]; // varying size
public:
HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
~HierarchicalNSW();
inline coord_t *getDataByInternalId(idx_t internal_id) const {
return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data];
}
inline idx_t *get_linklist0(idx_t internal_id) const {
return (idx_t*)&data_level0_memory[internal_id * size_data_per_element];
}
inline label_t *getExternalLabel(idx_t internal_id) const {
return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label];
}
std::priority_queue<std::pair<dist_t, idx_t>> searchBaseLayer(const coord_t *x, size_t ef);
void getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN);
void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue<std::pair<dist_t, idx_t>> topResults);
void addPoint(const coord_t *point, label_t label);
std::priority_queue<std::pair<dist_t, label_t>> searchKnn(const coord_t *query_data, size_t k);
dist_t fstdistfunc(const coord_t *x, const coord_t *y);
};

View File

@@ -1,28 +0,0 @@
SET enable_seqscan = off;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);
explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
QUERY PLAN
--------------------------------------------------------------------
Index Scan using t_val_idx on t (cost=4.02..8.06 rows=3 width=36)
Order By: (val <-> '{3,3,3}'::real[])
(2 rows)
SELECT * FROM t ORDER BY val <-> array[3,3,3];
val
---------
{1,2,3}
{1,2,4}
{1,1,1}
{0,0,0}
(4 rows)
SELECT COUNT(*) FROM t;
count
-------
5
(1 row)
DROP TABLE t;

View File

@@ -1,13 +0,0 @@
SET enable_seqscan = off;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);
explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
SELECT * FROM t ORDER BY val <-> array[3,3,3];
SELECT COUNT(*) FROM t;
DROP TABLE t;

Some files were not shown because too many files have changed in this diff Show More