Compare commits

..

1 Commits

Author SHA1 Message Date
Heikki Linnakangas
f011af063d Increase timeout when stopping a service in tests.
I've seen a few test failures with error:

    pageserver stop failed: pageserver with pid 115851 did not stop in 10 seconds

These have all been with tests that use real S3. Pageserver shutdown
waits for all in-memory layers to be flushed to disk and uploaded to
remote storage, so I think it's reasonable that that might take longer
than 10 s if there's some kind of a network hiccup.
2023-05-14 17:51:14 +03:00
334 changed files with 9640 additions and 24564 deletions

View File

@@ -12,11 +12,5 @@ opt-level = 3
# Turn on a small amount of optimization in Development mode. # Turn on a small amount of optimization in Development mode.
opt-level = 1 opt-level = 1
[build]
# This is only present for local builds, as it will be overridden
# by the RUSTDOCFLAGS env var in CI.
rustdocflags = ["-Arustdoc::private_intra_doc_links"]
[alias] [alias]
build_testing = ["build", "--features", "testing"] build_testing = ["build", "--features", "testing"]
neon = ["run", "--bin", "neon_local"]

View File

@@ -21,5 +21,4 @@
!workspace_hack/ !workspace_hack/
!neon_local/ !neon_local/
!scripts/ninstall.sh !scripts/ninstall.sh
!scripts/combine_control_files.py
!vm-cgconfig.conf !vm-cgconfig.conf

View File

@@ -57,14 +57,14 @@ runs:
if ! which allure; then if ! which allure; then
ALLURE_ZIP=allure-${ALLURE_VERSION}.zip ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP} wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
echo "${ALLURE_ZIP_SHA256} ${ALLURE_ZIP}" | sha256sum --check echo "${ALLURE_ZIP_MD5} ${ALLURE_ZIP}" | md5sum -c
unzip -q ${ALLURE_ZIP} unzip -q ${ALLURE_ZIP}
echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
rm -f ${ALLURE_ZIP} rm -f ${ALLURE_ZIP}
fi fi
env: env:
ALLURE_VERSION: 2.22.1 ALLURE_VERSION: 2.22.0
ALLURE_ZIP_SHA256: fdc7a62d94b14c5e0bf25198ae1feded6b005fdbed864b4d3cb4e5e901720b0b ALLURE_ZIP_MD5: d5c9f0989b896482536956340a7d5ec9
# Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
- name: Acquire lock - name: Acquire lock
@@ -105,7 +105,7 @@ runs:
# Get previously uploaded data for this run # Get previously uploaded data for this run
ZSTD_NBTHREADS=0 ZSTD_NBTHREADS=0
S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[]?.Key') S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[].Key')
if [ -z "$S3_FILEPATHS" ]; then if [ -z "$S3_FILEPATHS" ]; then
# There's no previously uploaded data for this $GITHUB_RUN_ID # There's no previously uploaded data for this $GITHUB_RUN_ID
exit 0 exit 0
@@ -147,8 +147,6 @@ runs:
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
- name: Release lock - name: Release lock
if: always() if: always()
shell: bash -euxo pipefail {0} shell: bash -euxo pipefail {0}

View File

@@ -36,14 +36,18 @@ inputs:
description: 'Region name for real s3 tests' description: 'Region name for real s3 tests'
required: false required: false
default: '' default: ''
real_s3_access_key_id:
description: 'Access key id'
required: false
default: ''
real_s3_secret_access_key:
description: 'Secret access key'
required: false
default: ''
rerun_flaky: rerun_flaky:
description: 'Whether to rerun flaky tests' description: 'Whether to rerun flaky tests'
required: false required: false
default: 'false' default: 'false'
pg_version:
description: 'Postgres version to use for tests'
required: false
default: 'v14'
runs: runs:
using: "composite" using: "composite"
@@ -63,12 +67,12 @@ runs:
path: /tmp/neon-previous path: /tmp/neon-previous
prefix: latest prefix: latest
- name: Download compatibility snapshot - name: Download compatibility snapshot for Postgres 14
if: inputs.build_type != 'remote' if: inputs.build_type != 'remote'
uses: ./.github/actions/download uses: ./.github/actions/download
with: with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }} name: compatibility-snapshot-${{ inputs.build_type }}-pg14
path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} path: /tmp/compatibility_snapshot_pg14
prefix: latest prefix: latest
- name: Checkout - name: Checkout
@@ -96,18 +100,19 @@ runs:
COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install
TEST_OUTPUT: /tmp/test_output TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: ${{ inputs.build_type }} BUILD_TYPE: ${{ inputs.build_type }}
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
RERUN_FLAKY: ${{ inputs.rerun_flaky }} RERUN_FLAKY: ${{ inputs.rerun_flaky }}
PG_VERSION: ${{ inputs.pg_version }}
shell: bash -euxo pipefail {0} shell: bash -euxo pipefail {0}
run: | run: |
# PLATFORM will be embedded in the perf test report # PLATFORM will be embedded in the perf test report
# and it is needed to distinguish different environments # and it is needed to distinguish different environments
export PLATFORM=${PLATFORM:-github-actions-selfhosted} export PLATFORM=${PLATFORM:-github-actions-selfhosted}
export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
export DEFAULT_PG_VERSION=${PG_VERSION#v} export DEFAULT_PG_VERSION=${DEFAULT_PG_VERSION:-14}
if [ "${BUILD_TYPE}" = "remote" ]; then if [ "${BUILD_TYPE}" = "remote" ]; then
export REMOTE_ENV=1 export REMOTE_ENV=1
@@ -150,14 +155,6 @@ runs:
EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS" EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
fi fi
# We use pytest-split plugin to run benchmarks in parallel on different CI runners
if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
mkdir -p $TEST_OUTPUT
poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
fi
if [[ "${{ inputs.build_type }}" == "debug" ]]; then if [[ "${{ inputs.build_type }}" == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ "${{ inputs.build_type }}" == "release" ]]; then elif [[ "${{ inputs.build_type }}" == "release" ]]; then
@@ -195,13 +192,13 @@ runs:
scripts/generate_and_push_perf_report.sh scripts/generate_and_push_perf_report.sh
fi fi
- name: Upload compatibility snapshot - name: Upload compatibility snapshot for Postgres 14
if: github.ref_name == 'release' if: github.ref_name == 'release'
uses: ./.github/actions/upload uses: ./.github/actions/upload
with: with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }} name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
# Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/ path: /tmp/test_output/compatibility_snapshot_pg14/
prefix: latest prefix: latest
- name: Upload test results - name: Upload test results
@@ -209,4 +206,4 @@ runs:
uses: ./.github/actions/allure-report-store uses: ./.github/actions/allure-report-store
with: with:
report-dir: /tmp/test_output/allure/results report-dir: /tmp/test_output/allure/results
unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }} unique-key: ${{ inputs.test_selection }}-${{ inputs.build_type }}

View File

@@ -1,6 +1,6 @@
## Problem ## Describe your changes
## Summary of changes ## Issue ticket number and link
## Checklist before requesting a review ## Checklist before requesting a review

View File

@@ -1,55 +0,0 @@
name: Handle `approved-for-ci-run` label
# This workflow helps to run CI pipeline for PRs made by external contributors (from forks).
on:
pull_request:
types:
# Default types that triggers a workflow ([1]):
# - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
- opened
- synchronize
- reopened
# Types that we want to handle in addition to keep labels tidy:
- closed
# Actual magic happens here:
- labeled
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
jobs:
remove-label:
# Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
# The PR should be reviewed and labelled manually again.
runs-on: [ ubuntu-latest ]
if: |
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
create-branch:
# Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
runs-on: [ ubuntu-latest ]
if: |
github.event.action == 'labeled' &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
- uses: actions/checkout@v3
with:
ref: main
- run: gh pr checkout "${PR_NUMBER}"
- run: git checkout -b "ci-run/pr-${PR_NUMBER}"
- run: git push --force origin "ci-run/pr-${PR_NUMBER}"

View File

@@ -16,12 +16,12 @@ on:
workflow_dispatch: # adds ability to run this manually workflow_dispatch: # adds ability to run this manually
inputs: inputs:
region_id: region_id:
description: 'Project region id. If not set, the default region will be used' description: 'Use a particular region. If not set the default region will be used'
required: false required: false
default: 'aws-us-east-2' default: 'aws-us-east-2'
save_perf_report: save_perf_report:
type: boolean type: boolean
description: 'Publish perf report. If not set, the report will be published only for the main branch' description: 'Publish perf report or not. If not set, the report is published only for the main branch'
required: false required: false
defaults: defaults:
@@ -125,14 +125,13 @@ jobs:
matrix='{ matrix='{
"platform": [ "platform": [
"neon-captest-new", "neon-captest-new",
"neon-captest-reuse", "neon-captest-reuse"
"neonvm-captest-new"
], ],
"db_size": [ "10gb" ], "db_size": [ "10gb" ],
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, "include": [
{ "platform": "neon-captest-new", "db_size": "50gb" }, { "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" }, { "platform": "neon-captest-new", "db_size": "50gb" }
{ "platform": "neonvm-captest-new", "db_size": "50gb" }] ]
}' }'
if [ "$(date +%A)" = "Saturday" ]; then if [ "$(date +%A)" = "Saturday" ]; then
@@ -180,8 +179,7 @@ jobs:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init options: --init
# Increase timeout to 8h, default timeout is 6h timeout-minutes: 360 # 6h
timeout-minutes: 480
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3
@@ -199,7 +197,7 @@ jobs:
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Create Neon Project - name: Create Neon Project
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform) if: contains(fromJson('["neon-captest-new", "neon-captest-freetier"]'), matrix.platform)
id: create-neon-project id: create-neon-project
uses: ./.github/actions/neon-project-create uses: ./.github/actions/neon-project-create
with: with:
@@ -207,7 +205,6 @@ jobs:
postgres_version: ${{ env.DEFAULT_PG_VERSION }} postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }} api_key: ${{ secrets.NEON_STAGING_API_KEY }}
compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }} compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
- name: Set up Connection String - name: Set up Connection String
id: set-up-connstr id: set-up-connstr
@@ -216,7 +213,7 @@ jobs:
neon-captest-reuse) neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;; ;;
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier) neon-captest-new | neon-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }} CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;; ;;
rds-aurora) rds-aurora)
@@ -226,7 +223,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
;; ;;
*) *)
echo >&2 "Unknown PLATFORM=${PLATFORM}" echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-freetier', 'rds-aurora', or 'rds-postgres'"
exit 1 exit 1
;; ;;
esac esac
@@ -322,6 +319,8 @@ jobs:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init options: --init
timeout-minutes: 360 # 6h
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3
@@ -413,6 +412,8 @@ jobs:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init options: --init
timeout-minutes: 360 # 6h
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3
@@ -498,6 +499,8 @@ jobs:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init options: --init
timeout-minutes: 360 # 6h
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3

View File

@@ -5,7 +5,6 @@ on:
branches: branches:
- main - main
- release - release
- ci-run/pr-*
pull_request: pull_request:
defaults: defaults:
@@ -128,11 +127,6 @@ jobs:
- name: Run cargo clippy (release) - name: Run cargo clippy (release)
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
- name: Check documentation generation
run: cargo doc --workspace --no-deps --document-private-items
env:
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
# Use `${{ !cancelled() }}` to run quick tests after the longer clippy run # Use `${{ !cancelled() }}` to run quick tests after the longer clippy run
- name: Check formatting - name: Check formatting
if: ${{ !cancelled() }} if: ${{ !cancelled() }}
@@ -161,7 +155,7 @@ jobs:
build_type: [ debug, release ] build_type: [ debug, release ]
env: env:
BUILD_TYPE: ${{ matrix.build_type }} BUILD_TYPE: ${{ matrix.build_type }}
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} GIT_VERSION: ${{ github.sha }}
steps: steps:
- name: Fix git ownership - name: Fix git ownership
@@ -180,27 +174,6 @@ jobs:
submodules: true submodules: true
fetch-depth: 1 fetch-depth: 1
- name: Check Postgres submodules revision
shell: bash -euo pipefail {0}
run: |
# This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally).
# Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603
FAILED=false
for postgres in postgres-v14 postgres-v15; do
expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
actual=$(git rev-parse "HEAD:vendor/${postgres}")
if [ "${expected}" != "${actual}" ]; then
echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'"
FAILED=true
fi
done
if [ "${FAILED}" = "true" ]; then
echo >&2 "Please update vendors/revisions.json if these changes are intentional"
exit 1
fi
- name: Set pg 14 revision for caching - name: Set pg 14 revision for caching
id: pg_v14_rev id: pg_v14_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
@@ -291,7 +264,7 @@ jobs:
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
export REMOTE_STORAGE_S3_REGION=eu-central-1 export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact
- name: Install rust binaries - name: Install rust binaries
run: | run: |
@@ -351,8 +324,7 @@ jobs:
runs-on: [ self-hosted, gen3, large ] runs-on: [ self-hosted, gen3, large ]
container: container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
# Default shared memory is 64mb options: --init
options: --init --shm-size=512mb
needs: [ build-neon ] needs: [ build-neon ]
strategy: strategy:
fail-fast: false fail-fast: false
@@ -373,11 +345,13 @@ jobs:
test_selection: regress test_selection: regress
needs_postgres_source: true needs_postgres_source: true
run_with_real_s3: true run_with_real_s3: true
real_s3_bucket: neon-github-ci-tests real_s3_bucket: ci-tests-s3
real_s3_region: eu-central-1 real_s3_region: us-west-2
real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
rerun_flaky: true rerun_flaky: true
pg_version: ${{ matrix.pg_version }}
env: env:
DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
@@ -389,18 +363,19 @@ jobs:
runs-on: [ self-hosted, gen3, small ] runs-on: [ self-hosted, gen3, small ]
container: container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
# Default shared memory is 64mb options: --init
options: --init --shm-size=512mb
needs: [ build-neon ] needs: [ build-neon ]
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
pytest_split_group: [ 1, 2, 3, 4 ]
build_type: [ release ] build_type: [ release ]
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v3 uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
- name: Pytest benchmarks - name: Pytest benchmarks
uses: ./.github/actions/run-python-test-set uses: ./.github/actions/run-python-test-set
@@ -409,11 +384,9 @@ jobs:
test_selection: performance test_selection: performance
run_in_parallel: false run_in_parallel: false
save_perf_report: ${{ github.ref_name == 'main' }} save_perf_report: ${{ github.ref_name == 'main' }}
extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
env: env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
# XXX: no coverage data handling here, since benchmarks are run on release builds, # XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones # while coverage is currently collected for the debug ones
@@ -434,7 +407,9 @@ jobs:
uses: ./.github/actions/allure-report-generate uses: ./.github/actions/allure-report-generate
- uses: actions/github-script@v6 - uses: actions/github-script@v6
if: ${{ !cancelled() }} if: >
!cancelled() &&
github.event_name == 'pull_request'
with: with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5 retries: 5
@@ -444,7 +419,7 @@ jobs:
reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}", reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
} }
const script = require("./scripts/comment-test-report.js") const script = require("./scripts/pr-comment-test-report.js")
await script({ await script({
github, github,
context, context,
@@ -515,48 +490,37 @@ jobs:
- name: Merge coverage data - name: Merge coverage data
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
- name: Build coverage report - name: Build and upload coverage report
env:
COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }}
run: | run: |
scripts/coverage --dir=/tmp/coverage \ COMMIT_SHA=${{ github.event.pull_request.head.sha }}
report \ COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA
scripts/coverage \
--dir=/tmp/coverage report \
--input-objects=/tmp/coverage/binaries.list \ --input-objects=/tmp/coverage/binaries.list \
--commit-url=${COMMIT_URL} \ --commit-url=$COMMIT_URL \
--format=github --format=github
scripts/coverage --dir=/tmp/coverage \ REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA
report \
--input-objects=/tmp/coverage/binaries.list \
--format=lcov
- name: Upload coverage report scripts/git-upload \
id: upload-coverage-report --repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \
env: --message="Add code coverage for $COMMIT_URL" \
BUCKET: neon-github-public-dev copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA}
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html # Add link to the coverage report to the commit
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
- uses: actions/github-script@v6 -H "Accept: application/vnd.github.v3+json" \
env: --user "${{ secrets.CI_ACCESS_TOKEN }}" \
REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }} --data \
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} "{
with: \"state\": \"success\",
script: | \"context\": \"neon-coverage\",
const { REPORT_URL, COMMIT_SHA } = process.env \"description\": \"Coverage report is ready\",
\"target_url\": \"$REPORT_URL\"
await github.rest.repos.createCommitStatus({ }"
owner: context.repo.owner,
repo: context.repo.repo,
sha: `${COMMIT_SHA}`,
state: 'success',
target_url: `${REPORT_URL}`,
context: 'Code coverage report',
})
trigger-e2e-tests: trigger-e2e-tests:
runs-on: [ self-hosted, gen3, small ] runs-on: [ self-hosted, gen3, small ]
@@ -641,7 +605,7 @@ jobs:
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context . --context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg GIT_VERSION=${{ github.sha }}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}} --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -650,6 +614,48 @@ jobs:
- name: Cleanup ECR folder - name: Cleanup ECR folder
run: rm -rf ~/.ecr run: rm -rf ~/.ecr
neon-image-depot:
# For testing this will run side-by-side for a few merges.
# This action is not really optimized yet, but gets the job done
runs-on: [ self-hosted, gen3, large ]
needs: [ tag ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
permissions:
contents: read
id-token: write
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Setup go
uses: actions/setup-go@v3
with:
go-version: '1.19'
- name: Set up Depot CLI
uses: depot/setup-action@v1
- name: Install Crane & ECR helper
run: go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
- name: Configure ECR login
run: |
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Build and push
uses: depot/build-push-action@v1
with:
# if no depot.json file is at the root of your repo, you must specify the project id
project: nrdv0s4kcs
push: true
tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
compute-tools-image: compute-tools-image:
runs-on: [ self-hosted, gen3, large ] runs-on: [ self-hosted, gen3, large ]
needs: [ tag ] needs: [ tag ]
@@ -685,8 +691,7 @@ jobs:
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context . --context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg GIT_VERSION=${{ github.sha }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-tools --dockerfile Dockerfile.compute-tools
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -698,11 +703,7 @@ jobs:
compute-node-image: compute-node-image:
runs-on: [ self-hosted, gen3, large ] runs-on: [ self-hosted, gen3, large ]
container: container: gcr.io/kaniko-project/executor:v1.9.2-debug
image: gcr.io/kaniko-project/executor:v1.9.2-debug
# Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
# Should be prevented by https://github.com/neondatabase/neon/issues/4281
options: --add-host=download.osgeo.org:140.211.15.30
needs: [ tag ] needs: [ tag ]
strategy: strategy:
fail-fast: false fail-fast: false
@@ -742,42 +743,12 @@ jobs:
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context . --context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg GIT_VERSION=${{ github.sha }}
--build-arg PG_VERSION=${{ matrix.version }} --build-arg PG_VERSION=${{ matrix.version }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-node --dockerfile Dockerfile.compute-node
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--cleanup
# Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min)
# During the transition period we need to have extensions in both places (in S3 and in compute-node image),
# so we won't build extension twice, but extract them from compute-node.
#
# For now we use extensions image only for new custom extensions
- name: Kaniko build extensions only
run: |
# Kaniko is supposed to clean up after itself if --cleanup flag is set, but it doesn't.
# Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
# it still fails with error:
# error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
#
# Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--context . \
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} \
--build-arg PG_VERSION=${{ matrix.version }} \
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
--dockerfile Dockerfile.compute-node \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--cleanup \
--target postgres-extensions
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder - name: Cleanup ECR folder
@@ -794,7 +765,7 @@ jobs:
run: run:
shell: sh -eu {0} shell: sh -eu {0}
env: env:
VM_BUILDER_VERSION: v0.13.1 VM_BUILDER_VERSION: v0.4.6
steps: steps:
- name: Checkout - name: Checkout
@@ -804,18 +775,21 @@ jobs:
- name: Downloading vm-builder - name: Downloading vm-builder
run: | run: |
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
chmod +x vm-builder chmod +x vm-builder
# Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
# it won't have the proper authentication (written at v0.6.0)
- name: Pulling compute-node image - name: Pulling compute-node image
run: | run: |
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Building VM compute-node rootfs
run: |
docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node .
- name: Build vm image - name: Build vm image
run: | run: |
./vm-builder -enable-file-cache -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} # note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Pushing vm-compute-node image - name: Pushing vm-compute-node image
run: | run: |
@@ -896,10 +870,8 @@ jobs:
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest
- name: Push images to production ECR - name: Push images to production ECR
if: | if: |
@@ -910,10 +882,8 @@ jobs:
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest
- name: Configure Docker Hub login - name: Configure Docker Hub login
run: | run: |
@@ -935,65 +905,16 @@ jobs:
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest
- name: Cleanup ECR folder - name: Cleanup ECR folder
run: rm -rf ~/.ecr run: rm -rf ~/.ecr
upload-postgres-extensions-to-s3:
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
needs: [ tag, promote-images ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15 ]
env:
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
steps:
- name: Pull postgres-extensions image
run: |
docker pull ${EXTENSIONS_IMAGE}
- name: Create postgres-extensions container
id: create-container
run: |
EID=$(docker create ${EXTENSIONS_IMAGE} true)
echo "EID=${EID}" >> $GITHUB_OUTPUT
- name: Extract postgres-extensions from container
run: |
rm -rf ./extensions-to-upload # Just in case
mkdir -p extensions-to-upload
docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
- name: Upload postgres-extensions to S3
run: |
for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
done
- name: Cleanup
if: ${{ always() && steps.create-container.outputs.EID }}
run: |
docker rm ${{ steps.create-container.outputs.EID }} || true
deploy: deploy:
runs-on: [ self-hosted, gen3, small ] runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ] needs: [ promote-images, tag, regress-tests ]
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
steps: steps:
- name: Fix git ownership - name: Fix git ownership
@@ -1025,24 +946,10 @@ jobs:
exit 1 exit 1
fi fi
- name: Create git tag
if: github.ref_name == 'release'
uses: actions/github-script@v6
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
github.rest.git.createRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
sha: context.sha,
})
promote-compatibility-data: promote-compatibility-data:
runs-on: [ self-hosted, gen3, small ] runs-on: [ self-hosted, gen3, small ]
container: container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init options: --init
needs: [ promote-images, tag, regress-tests ] needs: [ promote-images, tag, regress-tests ]
if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
@@ -1053,13 +960,11 @@ jobs:
PREFIX: artifacts/latest PREFIX: artifacts/latest
run: | run: |
# Update compatibility snapshot for the release # Update compatibility snapshot for the release
for pg_version in v14 v15; do for build_type in debug release; do
for build_type in debug release; do OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst
NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst
time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME} time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
done
done done
# Update Neon artifact for the release (reuse already uploaded artifact) # Update Neon artifact for the release (reuse already uploaded artifact)

View File

@@ -3,8 +3,7 @@ name: Check neon with extra platform builds
on: on:
push: push:
branches: branches:
- main - main
- ci-run/pr-*
pull_request: pull_request:
defaults: defaults:

View File

@@ -3,7 +3,6 @@ name: Create Release Branch
on: on:
schedule: schedule:
- cron: '0 10 * * 2' - cron: '0 10 * * 2'
workflow_dispatch:
jobs: jobs:
create_release_branch: create_release_branch:

View File

@@ -2,7 +2,7 @@
Howdy! Usual good software engineering practices apply. Write Howdy! Usual good software engineering practices apply. Write
tests. Write comments. Follow standard Rust coding practices where tests. Write comments. Follow standard Rust coding practices where
possible. Use `cargo fmt` and `cargo clippy` to tidy up formatting. possible. Use 'cargo fmt' and 'clippy' to tidy up formatting.
There are soft spots in the code, which could use cleanup, There are soft spots in the code, which could use cleanup,
refactoring, additional comments, and so forth. Let's try to raise the refactoring, additional comments, and so forth. Let's try to raise the

1179
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -3,26 +3,12 @@ members = [
"compute_tools", "compute_tools",
"control_plane", "control_plane",
"pageserver", "pageserver",
"pageserver/ctl",
"proxy", "proxy",
"safekeeper", "safekeeper",
"storage_broker", "storage_broker",
"workspace_hack", "workspace_hack",
"trace", "trace",
"libs/compute_api", "libs/*",
"libs/pageserver_api",
"libs/postgres_ffi",
"libs/safekeeper_api",
"libs/utils",
"libs/consumption_metrics",
"libs/postgres_backend",
"libs/pq_proto",
"libs/tenant_size_model",
"libs/metrics",
"libs/postgres_connection",
"libs/remote_storage",
"libs/tracing-utils",
"libs/postgres_ffi/wal_craft",
] ]
[workspace.package] [workspace.package]
@@ -32,14 +18,12 @@ license = "Apache-2.0"
## All dependency versions, used in the project ## All dependency versions, used in the project
[workspace.dependencies] [workspace.dependencies]
anyhow = { version = "1.0", features = ["backtrace"] } anyhow = { version = "1.0", features = ["backtrace"] }
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
flate2 = "1.0.26"
async-stream = "0.3" async-stream = "0.3"
async-trait = "0.1" async-trait = "0.1"
aws-config = { version = "0.55", default-features = false, features=["rustls"] } atty = "0.2.14"
aws-sdk-s3 = "0.27" aws-config = { version = "0.51.0", default-features = false, features=["rustls"] }
aws-smithy-http = "0.55" aws-sdk-s3 = "0.21.0"
aws-credential-types = "0.55" aws-smithy-http = "0.51.0"
aws-types = "0.55" aws-types = "0.55"
base64 = "0.13.0" base64 = "0.13.0"
bincode = "1.3" bincode = "1.3"
@@ -84,9 +68,9 @@ notify = "5.0.0"
num_cpus = "1.15" num_cpus = "1.15"
num-traits = "0.2.15" num-traits = "0.2.15"
once_cell = "1.13" once_cell = "1.13"
opentelemetry = "0.19.0" opentelemetry = "0.18.0"
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.11.0" opentelemetry-semantic-conventions = "0.10.0"
parking_lot = "0.12" parking_lot = "0.12"
pin-project-lite = "0.2" pin-project-lite = "0.2"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
@@ -94,9 +78,8 @@ prost = "0.11"
rand = "0.8" rand = "0.8"
regex = "1.4" regex = "1.4"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] } reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
reqwest-middleware = "0.2.0" reqwest-middleware = "0.2.0"
reqwest-retry = "0.2.2"
routerify = "3" routerify = "3"
rpds = "0.13" rpds = "0.13"
rustls = "0.20" rustls = "0.20"
@@ -123,15 +106,14 @@ tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.9.0" tokio-postgres-rustls = "0.9.0"
tokio-rustls = "0.23" tokio-rustls = "0.23"
tokio-stream = "0.1" tokio-stream = "0.1"
tokio-tar = "0.3"
tokio-util = { version = "0.7", features = ["io"] } tokio-util = { version = "0.7", features = ["io"] }
toml = "0.7" toml = "0.7"
toml_edit = "0.19" toml_edit = "0.19"
tonic = {version = "0.9", features = ["tls", "tls-roots"]} tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tracing = "0.1" tracing = "0.1"
tracing-error = "0.2.0" tracing-error = "0.2.0"
tracing-opentelemetry = "0.19.0" tracing-opentelemetry = "0.18.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] } tracing-subscriber = { version = "0.3", features = ["env-filter"] }
url = "2.2" url = "2.2"
uuid = { version = "1.2", features = ["v4", "serde"] } uuid = { version = "1.2", features = ["v4", "serde"] }
walkdir = "2.3.2" walkdir = "2.3.2"
@@ -143,11 +125,12 @@ env_logger = "0.10"
log = "0.4" log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
## Other git libraries ## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -172,7 +155,7 @@ utils = { version = "0.1", path = "./libs/utils/" }
workspace_hack = { version = "0.1", path = "./workspace_hack/" } workspace_hack = { version = "0.1", path = "./workspace_hack/" }
## Build dependencies ## Build dependencies
criterion = "0.5.1" criterion = "0.4"
rcgen = "0.10" rcgen = "0.10"
rstest = "0.17" rstest = "0.17"
tempfile = "3.4" tempfile = "3.4"
@@ -182,7 +165,12 @@ tonic-build = "0.9"
# This is only needed for proxy's tests. # This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead. # TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
# Changes the MAX_THREADS limit from 4096 to 32768.
# This is a temporary workaround for using tracing from many threads in safekeepers code,
# until async safekeepers patch is merged to the main.
sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" }
################# Binary contents sections ################# Binary contents sections

View File

@@ -47,7 +47,8 @@ RUN set -e \
&& mold -run cargo build \ && mold -run cargo build \
--bin pg_sni_router \ --bin pg_sni_router \
--bin pageserver \ --bin pageserver \
--bin pagectl \ --bin pageserver_binutils \
--bin draw_timeline_dir \
--bin safekeeper \ --bin safekeeper \
--bin storage_broker \ --bin storage_broker \
--bin proxy \ --bin proxy \
@@ -72,7 +73,8 @@ RUN set -e \
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin

View File

@@ -2,7 +2,6 @@ ARG PG_VERSION
ARG REPOSITORY=neondatabase ARG REPOSITORY=neondatabase
ARG IMAGE=rust ARG IMAGE=rust
ARG TAG=pinned ARG TAG=pinned
ARG BUILD_TAG
######################################################################################### #########################################################################################
# #
@@ -13,7 +12,7 @@ FROM debian:bullseye-slim AS build-deps
RUN apt update && \ RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd libicu-dev libxslt1-dev liblz4-dev libzstd-dev
######################################################################################### #########################################################################################
# #
@@ -68,7 +67,7 @@ RUN apt update && \
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \ echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \ cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
make clean && cp -R /sfcgal/* / make clean && cp -R /sfcgal/* /
@@ -77,7 +76,6 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \ echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \ mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
./autogen.sh && \ ./autogen.sh && \
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
make -j $(getconf _NPROCESSORS_ONLN) install && \ make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -90,28 +88,17 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
mkdir -p /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \ echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \ mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
mkdir build && cd build && \ mkdir build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \ cd build && \
cmake .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \ make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
######################################################################################### #########################################################################################
# #
@@ -144,20 +131,10 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.ta
FROM build-deps AS h3-pg-build FROM build-deps AS h3-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN case "$(uname -m)" in \ # packaged cmake is too old
"x86_64") \ RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
;; \
"aarch64") \
export CMAKE_CHECKSUM=281b42627c9a1beed03e29706574d04c6c53fae4994472e90985ef018dd29c02 \
;; \
*) \
echo "Unsupported architecture '$(uname -m)'. Supported are x86_64 and aarch64" && exit 1 \
;; \
esac && \
wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-$(uname -m).sh \
-q -O /tmp/cmake-install.sh \ -q -O /tmp/cmake-install.sh \
&& echo "${CMAKE_CHECKSUM} /tmp/cmake-install.sh" | sha256sum --check \ && echo "739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 /tmp/cmake-install.sh" | sha256sum --check \
&& chmod u+x /tmp/cmake-install.sh \ && chmod u+x /tmp/cmake-install.sh \
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
&& rm /tmp/cmake-install.sh && rm /tmp/cmake-install.sh
@@ -211,8 +188,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
FROM build-deps AS vector-pg-build FROM build-deps AS vector-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \
echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \ echo "b76cf84ddad452cc880a6c8c661d137ddd8679c000a16332f4f03ecf6e10bcc8 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -378,7 +355,7 @@ RUN apt-get update && \
wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \ echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \ mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \
cd build && \ cd build && \
make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) && \
make install -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \
@@ -431,153 +408,12 @@ RUN apt-get update && \
wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \ wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \ echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \ mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ mkdir build && \
mkdir build && cd build && \ cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \ cmake .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \ make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -
#########################################################################################
#
# Layer "pg-cron-pg-build"
# compile pg_cron extension
#
#########################################################################################
FROM build-deps AS pg-cron-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O pg_cron.tar.gz && \
echo "6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 pg_cron.tar.gz" | sha256sum --check && \
mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
#########################################################################################
#
# Layer "rdkit-pg-build"
# compile rdkit extension
#
#########################################################################################
FROM build-deps AS rdkit-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt-get update && \
apt-get install -y \
cmake \
libboost-iostreams1.74-dev \
libboost-regex1.74-dev \
libboost-serialization1.74-dev \
libboost-system1.74-dev \
libeigen3-dev \
libfreetype6-dev
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
cmake \
-D RDK_BUILD_CAIRO_SUPPORT=OFF \
-D RDK_BUILD_INCHI_SUPPORT=ON \
-D RDK_BUILD_AVALON_SUPPORT=ON \
-D RDK_BUILD_PYTHON_WRAPPERS=OFF \
-D RDK_BUILD_DESCRIPTORS3D=OFF \
-D RDK_BUILD_FREESASA_SUPPORT=OFF \
-D RDK_BUILD_COORDGEN_SUPPORT=ON \
-D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \
-D RDK_BUILD_YAEHMOP_SUPPORT=OFF \
-D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \
-D RDK_USE_URF=OFF \
-D RDK_BUILD_PGSQL=ON \
-D RDK_PGSQL_STATIC=ON \
-D PostgreSQL_CONFIG=pg_config \
-D PostgreSQL_INCLUDE_DIR=`pg_config --includedir` \
-D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
-D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
-D RDK_INSTALL_INTREE=OFF \
-D CMAKE_BUILD_TYPE=Release \
. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control
#########################################################################################
#
# Layer "pg-uuidv7-pg-build"
# compile pg_uuidv7 extension
#
#########################################################################################
FROM build-deps AS pg-uuidv7-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
#########################################################################################
#
# Layer "pg-roaringbitmap-pg-build"
# compile pg_roaringbitmap extension
#
#########################################################################################
FROM build-deps AS pg-roaringbitmap-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
#########################################################################################
#
# Layer "pg-embedding-pg-build"
# compile pg_embedding extension
#
#########################################################################################
FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.tar.gz -O pg_embedding.tar.gz && \
echo "c4ae84eef36fa8ec5868f6e061f39812f19ee5ba3604d428d40935685c7be512 pg_embedding.tar.gz" | sha256sum --check && \
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control
#########################################################################################
#
# Layer "pg-anon-pg-build"
# compile anon extension
#
#########################################################################################
FROM build-deps AS pg-anon-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
######################################################################################### #########################################################################################
# #
@@ -664,22 +500,6 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405
cargo pgx install --release && \ cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
#########################################################################################
#
# Layer "pg-pgx-ulid-build"
# Compile "pgx_ulid" extension
#
#########################################################################################
FROM rust-extensions-build AS pg-pgx-ulid-build
RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
######################################################################################### #########################################################################################
# #
# Layer "neon-pg-ext-build" # Layer "neon-pg-ext-build"
@@ -687,7 +507,6 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -
# #
######################################################################################### #########################################################################################
FROM build-deps AS neon-pg-ext-build FROM build-deps AS neon-pg-ext-build
# Public extensions
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=postgis-build /sfcgal/* / COPY --from=postgis-build /sfcgal/* /
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -710,12 +529,6 @@ COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/ COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \ RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -725,10 +538,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
make -j $(getconf _NPROCESSORS_ONLN) \ make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_utils \ -C pgxn/neon_utils \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/hnsw \
-s install -s install
######################################################################################### #########################################################################################
@@ -737,9 +546,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
# #
######################################################################################### #########################################################################################
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
USER nonroot USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project # Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . . COPY --chown=nonroot . .
@@ -764,29 +570,6 @@ RUN rm -r /usr/local/pgsql/include
# if they were to be used by other libraries. # if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Extenstion only
#
#########################################################################################
FROM python:3.9-slim-bullseye AS generate-ext-index
ARG PG_VERSION
ARG BUILD_TAG
RUN apt update && apt install -y zstd
# copy the control files here
COPY --from=kq-imcx-pg-build /extensions/ /extensions/
COPY --from=pg-anon-pg-build /extensions/ /extensions/
COPY --from=postgis-build /extensions/ /extensions/
COPY scripts/combine_control_files.py ./combine_control_files.py
RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
FROM scratch AS postgres-extensions
# After the transition this layer will include all extensitons.
# As for now, it's only a couple for testing purposses
COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
COPY --from=generate-ext-index /ext_index.json /ext_index.json
######################################################################################### #########################################################################################
# #
# Final layer # Final layer
@@ -815,19 +598,14 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
# libxml2, libxslt1.1 for xml2 # libxml2, libxslt1.1 for xml2
# libzstd1 for zstd # libzstd1 for zstd
# libboost*, libfreetype6, and zlib1g for rdkit
RUN apt update && \ RUN apt update && \
apt install --no-install-recommends -y \ apt install --no-install-recommends -y \
gdb \ gdb \
locales \
libicu67 \ libicu67 \
liblz4-1 \ liblz4-1 \
libreadline8 \ libreadline8 \
libboost-iostreams1.74.0 \
libboost-regex1.74.0 \
libboost-serialization1.74.0 \
libboost-system1.74.0 \
libossp-uuid16 \ libossp-uuid16 \
libfreetype6 \
libgeos-c1v5 \ libgeos-c1v5 \
libgdal28 \ libgdal28 \
libproj19 \ libproj19 \
@@ -836,10 +614,7 @@ RUN apt update && \
libxml2 \ libxml2 \
libxslt1.1 \ libxslt1.1 \
libzstd1 \ libzstd1 \
libcurl4-openssl-dev \ procps && \
locales \
procps \
zlib1g && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

View File

@@ -3,7 +3,6 @@
ARG REPOSITORY=neondatabase ARG REPOSITORY=neondatabase
ARG IMAGE=rust ARG IMAGE=rust
ARG TAG=pinned ARG TAG=pinned
ARG BUILD_TAG
FROM $REPOSITORY/$IMAGE:$TAG AS rust-build FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
WORKDIR /home/nonroot WORKDIR /home/nonroot
@@ -17,8 +16,6 @@ ENV CACHEPOT_S3_KEY_PREFIX=cachepot
ARG CACHEPOT_BUCKET=neon-github-dev ARG CACHEPOT_BUCKET=neon-github-dev
#ARG AWS_ACCESS_KEY_ID #ARG AWS_ACCESS_KEY_ID
#ARG AWS_SECRET_ACCESS_KEY #ARG AWS_SECRET_ACCESS_KEY
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
COPY . . COPY . .

View File

@@ -0,0 +1,70 @@
# Note: this file *mostly* just builds on Dockerfile.compute-node
ARG SRC_IMAGE
ARG VM_INFORMANT_VERSION=v0.1.14
# on libcgroup update, make sure to check bootstrap.sh for changes
ARG LIBCGROUP_VERSION=v2.0.3
# Pull VM informant, to copy from later
FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant
# Build cgroup-tools
#
# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-informant
# requires cgroup v2, so we'll build cgroup-tools ourselves.
FROM debian:bullseye-slim as libcgroup-builder
ARG LIBCGROUP_VERSION
RUN set -exu \
&& apt update \
&& apt install --no-install-recommends -y \
git \
ca-certificates \
automake \
cmake \
make \
gcc \
byacc \
flex \
libtool \
libpam0g-dev \
&& git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
&& INSTALL_DIR="/libcgroup-install" \
&& mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
&& cd libcgroup \
# extracted from bootstrap.sh, with modified flags:
&& (test -d m4 || mkdir m4) \
&& autoreconf -fi \
&& rm -rf autom4te.cache \
&& CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
# actually build the thing...
&& make install
# Combine, starting from non-VM compute node image.
FROM $SRC_IMAGE as base
# Temporarily set user back to root so we can run adduser, set inittab
USER root
RUN adduser vm-informant --disabled-password --no-create-home
RUN set -e \
&& rm -f /etc/inittab \
&& touch /etc/inittab
RUN set -e \
&& echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
&& CONNSTR="dbname=postgres user=cloud_admin sslmode=disable" \
&& ARGS="--auto-restart --cgroup=neon-postgres --pgconnstr=\"$CONNSTR\"" \
&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant $ARGS'" >> /etc/inittab
USER postgres
ADD vm-cgconfig.conf /etc/cgconfig.conf
COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"]

View File

@@ -108,8 +108,6 @@ postgres-%: postgres-configure-% \
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
+@echo "Compiling pageinspect $*" +@echo "Compiling pageinspect $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+@echo "Compiling amcheck $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
.PHONY: postgres-clean-% .PHONY: postgres-clean-%
postgres-clean-%: postgres-clean-%:
@@ -140,11 +138,6 @@ neon-pg-ext-%: postgres-%
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
+@echo "Compiling hnsw $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install
.PHONY: neon-pg-ext-clean-% .PHONY: neon-pg-ext-clean-%
neon-pg-ext-clean-%: neon-pg-ext-clean-%:
@@ -160,9 +153,6 @@ neon-pg-ext-clean-%:
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean
.PHONY: neon-pg-ext .PHONY: neon-pg-ext
neon-pg-ext: \ neon-pg-ext: \

View File

@@ -17,7 +17,7 @@ The Neon storage engine consists of two major components:
- Pageserver. Scalable storage backend for the compute nodes. - Pageserver. Scalable storage backend for the compute nodes.
- Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage. - Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.
See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information. See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more information.
## Running local installation ## Running local installation
@@ -28,19 +28,18 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
* On Ubuntu or Debian, this set of packages should be sufficient to build the code: * On Ubuntu or Debian, this set of packages should be sufficient to build the code:
```bash ```bash
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
libcurl4-openssl-dev
``` ```
* On Fedora, these packages are needed: * On Fedora, these packages are needed:
```bash ```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \ dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel libcurl-devel protobuf-devel
``` ```
* On Arch based systems, these packages are needed: * On Arch based systems, these packages are needed:
```bash ```bash
pacman -S base-devel readline zlib libseccomp openssl clang \ pacman -S base-devel readline zlib libseccomp openssl clang \
postgresql-libs cmake postgresql protobuf curl postgresql-libs cmake postgresql protobuf
``` ```
Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases). Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
@@ -131,31 +130,32 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
```sh ```sh
# Create repository in .neon with proper paths to binaries and data # Create repository in .neon with proper paths to binaries and data
# Later that would be responsibility of a package install script # Later that would be responsibility of a package install script
> cargo neon init > ./target/debug/neon_local init
Initializing pageserver node 1 at '127.0.0.1:64000' in ".neon" Starting pageserver at '127.0.0.1:64000' in '.neon'.
# start pageserver, safekeeper, and broker for their intercommunication # start pageserver, safekeeper, and broker for their intercommunication
> cargo neon start > ./target/debug/neon_local start
Starting neon broker at 127.0.0.1:50051. Starting neon broker at 127.0.0.1:50051
storage_broker started, pid: 2918372 storage_broker started, pid: 2918372
Starting pageserver node 1 at '127.0.0.1:64000' in ".neon". Starting pageserver at '127.0.0.1:64000' in '.neon'.
pageserver started, pid: 2918386 pageserver started, pid: 2918386
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'. Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
safekeeper 1 started, pid: 2918437 safekeeper 1 started, pid: 2918437
# create initial tenant and use it as a default for every future neon_local invocation # create initial tenant and use it as a default for every future neon_local invocation
> cargo neon tenant create --set-default > ./target/debug/neon_local tenant create --set-default
tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
# start postgres compute node # start postgres compute node
> cargo neon endpoint start main > ./target/debug/neon_local endpoint start main
Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ... Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres' Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'
# check list of running postgres instances # check list of running postgres instances
> cargo neon endpoint list > ./target/debug/neon_local endpoint list
ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running
``` ```
@@ -177,28 +177,29 @@ postgres=# select * from t;
3. And create branches and run postgres on them: 3. And create branches and run postgres on them:
```sh ```sh
# create branch named migration_check # create branch named migration_check
> cargo neon timeline branch --branch-name migration_check > ./target/debug/neon_local timeline branch --branch-name migration_check
Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main' Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main'
# check branches tree # check branches tree
> cargo neon timeline list > ./target/debug/neon_local timeline list
(L) main [de200bd42b49cc1814412c7e592dd6e9] (L) main [de200bd42b49cc1814412c7e592dd6e9]
(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601] (L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]
# start postgres on that branch # start postgres on that branch
> cargo neon endpoint start migration_check --branch-name migration_check > ./target/debug/neon_local endpoint start migration_check --branch-name migration_check
Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ... Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres' Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'
# check the new list of running postgres instances # check the new list of running postgres instances
> cargo neon endpoint list > ./target/debug/neon_local endpoint list
ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running
migration_check 127.0.0.1:55434 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running
# this new postgres instance will have all the data from 'main' postgres, # this new postgres instance will have all the data from 'main' postgres,
# but all modifications would not affect data in original postgres # but all modifications would not affect data in original postgres
> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres > psql -p55433 -h 127.0.0.1 -U cloud_admin postgres
postgres=# select * from t; postgres=# select * from t;
key | value key | value
-----+------- -----+-------
@@ -220,7 +221,7 @@ postgres=# select * from t;
4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances 4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances
you have just started. You can terminate them all with one command: you have just started. You can terminate them all with one command:
```sh ```sh
> cargo neon stop > ./target/debug/neon_local stop
``` ```
## Running tests ## Running tests
@@ -237,9 +238,9 @@ CARGO_BUILD_FLAGS="--features=testing" make
## Documentation ## Documentation
[docs](/docs) Contains a top-level overview of all available markdown documentation. [/docs/](/docs/) Contains a top-level overview of all available markdown documentation.
- [sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout. - [/docs/sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout.
To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open` To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open`
@@ -264,6 +265,6 @@ To get more familiar with this aspect, refer to:
## Join the development ## Join the development
- Read [CONTRIBUTING.md](/CONTRIBUTING.md) to learn about project code style and practices. - Read `CONTRIBUTING.md` to learn about project code style and practices.
- To get familiar with a source tree layout, use [sourcetree.md](/docs/sourcetree.md). - To get familiar with a source tree layout, use [/docs/sourcetree.md](/docs/sourcetree.md).
- To learn more about PostgreSQL internals, check http://www.interdb.jp/pg/index.html - To learn more about PostgreSQL internals, check http://www.interdb.jp/pg/index.html

View File

@@ -6,10 +6,8 @@ license.workspace = true
[dependencies] [dependencies]
anyhow.workspace = true anyhow.workspace = true
async-compression.workspace = true
chrono.workspace = true chrono.workspace = true
clap.workspace = true clap.workspace = true
flate2.workspace = true
futures.workspace = true futures.workspace = true
hyper = { workspace = true, features = ["full"] } hyper = { workspace = true, features = ["full"] }
notify.workspace = true notify.workspace = true

View File

@@ -54,20 +54,11 @@ use compute_tools::monitor::launch_monitor;
use compute_tools::params::*; use compute_tools::params::*;
use compute_tools::spec::*; use compute_tools::spec::*;
const BUILD_TAG_DEFAULT: &str = "local";
fn main() -> Result<()> { fn main() -> Result<()> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
info!("build_tag: {build_tag}");
let matches = cli().get_matches(); let matches = cli().get_matches();
let http_port = *matches
.get_one::<u16>("http-port")
.expect("http-port is required");
let pgdata = matches let pgdata = matches
.get_one::<String>("pgdata") .get_one::<String>("pgdata")
.expect("PGDATA path is required"); .expect("PGDATA path is required");
@@ -187,19 +178,11 @@ fn main() -> Result<()> {
// Launch http service first, so we were able to serve control-plane // Launch http service first, so we were able to serve control-plane
// requests, while configuration is still in progress. // requests, while configuration is still in progress.
let _http_handle = let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
if !spec_set { if !spec_set {
// No spec provided, hang waiting for it. // No spec provided, hang waiting for it.
info!("no compute spec provided, waiting"); info!("no compute spec provided, waiting");
// TODO this can stall startups in the unlikely event that we bind
// this compute node while it's busy prewarming. It's not too
// bad because it's just 100ms and unlikely, but it's an
// avoidable problem.
compute.prewarm_postgres()?;
let mut state = compute.state.lock().unwrap(); let mut state = compute.state.lock().unwrap();
while state.status != ComputeStatus::ConfigurationPending { while state.status != ComputeStatus::ConfigurationPending {
state = compute.state_changed.wait(state).unwrap(); state = compute.state_changed.wait(state).unwrap();
@@ -230,8 +213,9 @@ fn main() -> Result<()> {
drop(state); drop(state);
// Launch remaining service threads // Launch remaining service threads
let _monitor_handle = launch_monitor(&compute); let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
let _configurator_handle = launch_configurator(&compute); let _configurator_handle =
launch_configurator(&compute).expect("cannot launch configurator thread");
// Start Postgres // Start Postgres
let mut delay_exit = false; let mut delay_exit = false;
@@ -262,16 +246,6 @@ fn main() -> Result<()> {
exit_code = ecode.code() exit_code = ecode.code()
} }
// Maybe sync safekeepers again, to speed up next startup
let compute_state = compute.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
info!("syncing safekeepers on shutdown");
let storage_auth_token = pspec.storage_auth_token.clone();
let lsn = compute.sync_safekeepers(storage_auth_token)?;
info!("synced safekeepers at lsn {lsn}");
}
if let Err(err) = compute.check_for_core_dumps() { if let Err(err) = compute.check_for_core_dumps() {
error!("error while checking for core dumps: {err:?}"); error!("error while checking for core dumps: {err:?}");
} }
@@ -312,14 +286,6 @@ fn cli() -> clap::Command {
let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown"); let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
clap::Command::new("compute_ctl") clap::Command::new("compute_ctl")
.version(version) .version(version)
.arg(
Arg::new("http-port")
.long("http-port")
.value_name("HTTP_PORT")
.default_value("3080")
.value_parser(clap::value_parser!(u16))
.required(false),
)
.arg( .arg(
Arg::new("connstr") Arg::new("connstr")
.short('C') .short('C')

View File

@@ -1,5 +1,20 @@
//
// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`,
// but there are several things that makes `PostgresNode` usage inconvenient in the
// cloud:
// - it inherits from `LocalEnv`, which contains **all-all** the information about
// a complete service running
// - it uses `PageServerNode` with information about http endpoint, which we do not
// need in the cloud again
// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud
//
// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required
// attributes (not required for the cloud). Yet, it is still tempting to unify these
// `PostgresNode` and `ComputeNode` and use one in both places.
//
// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`.
//
use std::fs; use std::fs;
use std::io::BufRead;
use std::os::unix::fs::PermissionsExt; use std::os::unix::fs::PermissionsExt;
use std::path::Path; use std::path::Path;
use std::process::{Command, Stdio}; use std::process::{Command, Stdio};
@@ -8,22 +23,18 @@ use std::sync::{Condvar, Mutex};
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use postgres::{Client, NoTls}; use postgres::{Client, NoTls};
use tokio_postgres; use tokio_postgres;
use tracing::{error, info, instrument, warn}; use tracing::{info, instrument, warn};
use utils::id::{TenantId, TimelineId}; use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn; use utils::lsn::Lsn;
use compute_api::responses::{ComputeMetrics, ComputeStatus}; use compute_api::responses::{ComputeMetrics, ComputeStatus};
use compute_api::spec::{ComputeMode, ComputeSpec}; use compute_api::spec::{ComputeMode, ComputeSpec};
use utils::measured_stream::MeasuredReader;
use crate::config; use crate::config;
use crate::pg_helpers::*; use crate::pg_helpers::*;
use crate::spec::*; use crate::spec::*;
use crate::sync_sk::{check_if_synced, ping_safekeeper};
/// Compute node info shared across several `compute_ctl` threads. /// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode { pub struct ComputeNode {
@@ -89,65 +100,36 @@ pub struct ParsedSpec {
pub tenant_id: TenantId, pub tenant_id: TenantId,
pub timeline_id: TimelineId, pub timeline_id: TimelineId,
pub pageserver_connstr: String, pub pageserver_connstr: String,
pub safekeeper_connstrings: Vec<String>,
pub storage_auth_token: Option<String>, pub storage_auth_token: Option<String>,
} }
impl TryFrom<ComputeSpec> for ParsedSpec { impl TryFrom<ComputeSpec> for ParsedSpec {
type Error = String; type Error = String;
fn try_from(spec: ComputeSpec) -> Result<Self, String> { fn try_from(spec: ComputeSpec) -> Result<Self, String> {
// Extract the options from the spec file that are needed to connect to
// the storage system.
//
// For backwards-compatibility, the top-level fields in the spec file
// may be empty. In that case, we need to dig them from the GUCs in the
// cluster.settings field.
let pageserver_connstr = spec let pageserver_connstr = spec
.pageserver_connstring .cluster
.clone() .settings
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring")) .find("neon.pageserver_connstring")
.ok_or("pageserver connstr should be provided")?; .ok_or("pageserver connstr should be provided")?;
let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
if matches!(spec.mode, ComputeMode::Primary) {
spec.cluster
.settings
.find("neon.safekeepers")
.ok_or("safekeeper connstrings should be provided")?
.split(',')
.map(|str| str.to_string())
.collect()
} else {
vec![]
}
} else {
spec.safekeeper_connstrings.clone()
};
let storage_auth_token = spec.storage_auth_token.clone(); let storage_auth_token = spec.storage_auth_token.clone();
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id { let tenant_id: TenantId = spec
tenant_id .cluster
} else { .settings
spec.cluster .find("neon.tenant_id")
.settings .ok_or("tenant id should be provided")
.find("neon.tenant_id") .map(|s| TenantId::from_str(&s))?
.ok_or("tenant id should be provided") .or(Err("invalid tenant id"))?;
.map(|s| TenantId::from_str(&s))? let timeline_id: TimelineId = spec
.or(Err("invalid tenant id"))? .cluster
}; .settings
let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id { .find("neon.timeline_id")
timeline_id .ok_or("timeline id should be provided")
} else { .map(|s| TimelineId::from_str(&s))?
spec.cluster .or(Err("invalid timeline id"))?;
.settings
.find("neon.timeline_id")
.ok_or("timeline id should be provided")
.map(|s| TimelineId::from_str(&s))?
.or(Err("invalid timeline id"))?
};
Ok(ParsedSpec { Ok(ParsedSpec {
spec, spec,
pageserver_connstr, pageserver_connstr,
safekeeper_connstrings,
storage_auth_token, storage_auth_token,
tenant_id, tenant_id,
timeline_id, timeline_id,
@@ -155,84 +137,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
} }
} }
/// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
/// that we give to customers
fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
let roles = spec
.cluster
.roles
.iter()
.map(|r| escape_literal(&r.name))
.collect::<Vec<_>>();
let dbs = spec
.cluster
.databases
.iter()
.map(|db| escape_literal(&db.name))
.collect::<Vec<_>>();
let roles_decl = if roles.is_empty() {
String::from("roles text[] := NULL;")
} else {
format!(
r#"
roles text[] := ARRAY(SELECT rolname
FROM pg_catalog.pg_roles
WHERE rolname IN ({}));"#,
roles.join(", ")
)
};
let database_decl = if dbs.is_empty() {
String::from("dbs text[] := NULL;")
} else {
format!(
r#"
dbs text[] := ARRAY(SELECT datname
FROM pg_catalog.pg_database
WHERE datname IN ({}));"#,
dbs.join(", ")
)
};
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
let query = format!(
r#"
DO $$
DECLARE
r text;
{}
{}
BEGIN
IF NOT EXISTS (
SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
THEN
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
IF array_length(roles, 1) IS NOT NULL THEN
EXECUTE format('GRANT neon_superuser TO %s',
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
FOREACH r IN ARRAY roles LOOP
EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r));
END LOOP;
END IF;
IF array_length(dbs, 1) IS NOT NULL THEN
EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser',
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', '));
END IF;
END IF;
END
$$;"#,
roles_decl, database_decl,
);
info!("Neon superuser created:\n{}", &query);
client
.simple_query(&query)
.map_err(|e| anyhow::anyhow!(e).context(query))?;
Ok(())
}
impl ComputeNode { impl ComputeNode {
pub fn set_status(&self, status: ComputeStatus) { pub fn set_status(&self, status: ComputeStatus) {
let mut state = self.state.lock().unwrap(); let mut state = self.state.lock().unwrap();
@@ -257,7 +161,7 @@ impl ComputeNode {
// Get basebackup from the libpq connection to pageserver using `connstr` and // Get basebackup from the libpq connection to pageserver using `connstr` and
// unarchive it to `pgdata` directory overriding all its previous content. // unarchive it to `pgdata` directory overriding all its previous content.
#[instrument(skip_all, fields(%lsn))] #[instrument(skip(self, compute_state))]
fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let spec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = compute_state.pspec.as_ref().expect("spec must be set");
let start_time = Utc::now(); let start_time = Utc::now();
@@ -275,52 +179,20 @@ impl ComputeNode {
let mut client = config.connect(NoTls)?; let mut client = config.connect(NoTls)?;
let basebackup_cmd = match lsn { let basebackup_cmd = match lsn {
// HACK We don't use compression on first start (Lsn(0)) because there's no API for it Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id), // First start of the compute
Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id), _ => format!("basebackup {} {} {}", spec.tenant_id, spec.timeline_id, lsn),
_ => format!(
"basebackup {} {} {} --gzip",
spec.tenant_id, spec.timeline_id, lsn
),
}; };
let copyreader = client.copy_out(basebackup_cmd.as_str())?; let copyreader = client.copy_out(basebackup_cmd.as_str())?;
let mut measured_reader = MeasuredReader::new(copyreader);
// Check the magic number to see if it's a gzip or not. Even though
// we might explicitly ask for gzip, an old pageserver with no implementation
// of gzip compression might send us uncompressed data. After some time
// passes we can assume all pageservers know how to compress and we can
// delete this check.
//
// If the data is not gzip, it will be tar. It will not be mistakenly
// recognized as gzip because tar starts with an ascii encoding of a filename,
// and 0x1f and 0x8b are unlikely first characters for any filename. Moreover,
// we send the "global" directory first from the pageserver, so it definitely
// won't be recognized as gzip.
let mut bufreader = std::io::BufReader::new(&mut measured_reader);
let gzip = {
let peek = bufreader.fill_buf().unwrap();
peek[0] == 0x1f && peek[1] == 0x8b
};
// Read the archive directly from the `CopyOutReader` // Read the archive directly from the `CopyOutReader`
// //
// Set `ignore_zeros` so that unpack() reads all the Copy data and // Set `ignore_zeros` so that unpack() reads all the Copy data and
// doesn't stop at the end-of-archive marker. Otherwise, if the server // doesn't stop at the end-of-archive marker. Otherwise, if the server
// sends an Error after finishing the tarball, we will not notice it. // sends an Error after finishing the tarball, we will not notice it.
if gzip { let mut ar = tar::Archive::new(copyreader);
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader)); ar.set_ignore_zeros(true);
ar.set_ignore_zeros(true); ar.unpack(&self.pgdata)?;
ar.unpack(&self.pgdata)?;
} else {
let mut ar = tar::Archive::new(&mut bufreader);
ar.set_ignore_zeros(true);
ar.unpack(&self.pgdata)?;
};
// Report metrics
self.state.lock().unwrap().metrics.basebackup_bytes =
measured_reader.get_byte_count() as u64;
self.state.lock().unwrap().metrics.basebackup_ms = Utc::now() self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
.signed_duration_since(start_time) .signed_duration_since(start_time)
.to_std() .to_std()
@@ -329,106 +201,10 @@ impl ComputeNode {
Ok(()) Ok(())
} }
pub async fn check_safekeepers_synced_async(
&self,
compute_state: &ComputeState,
) -> Result<Option<Lsn>> {
// Construct a connection config for each safekeeper
let pspec: ParsedSpec = compute_state
.pspec
.as_ref()
.expect("spec must be set")
.clone();
let sk_connstrs: Vec<String> = pspec.safekeeper_connstrings.clone();
let sk_configs = sk_connstrs.into_iter().map(|connstr| {
// Format connstr
let id = connstr.clone();
let connstr = format!("postgresql://no_user@{}", connstr);
let options = format!(
"-c timeline_id={} tenant_id={}",
pspec.timeline_id, pspec.tenant_id
);
// Construct client
let mut config = tokio_postgres::Config::from_str(&connstr).unwrap();
config.options(&options);
if let Some(storage_auth_token) = pspec.storage_auth_token.clone() {
config.password(storage_auth_token);
}
(id, config)
});
// Create task set to query all safekeepers
let mut tasks = FuturesUnordered::new();
let quorum = sk_configs.len() / 2 + 1;
for (id, config) in sk_configs {
let timeout = tokio::time::Duration::from_millis(100);
let task = tokio::time::timeout(timeout, ping_safekeeper(id, config));
tasks.push(tokio::spawn(task));
}
// Get a quorum of responses or errors
let mut responses = Vec::new();
let mut join_errors = Vec::new();
let mut task_errors = Vec::new();
let mut timeout_errors = Vec::new();
while let Some(response) = tasks.next().await {
match response {
Ok(Ok(Ok(r))) => responses.push(r),
Ok(Ok(Err(e))) => task_errors.push(e),
Ok(Err(e)) => timeout_errors.push(e),
Err(e) => join_errors.push(e),
};
if responses.len() >= quorum {
break;
}
if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum {
break;
}
}
// In case of error, log and fail the check, but don't crash.
// We're playing it safe because these errors could be transient
// and we don't yet retry. Also being careful here allows us to
// be backwards compatible with safekeepers that don't have the
// TIMELINE_STATUS API yet.
if responses.len() < quorum {
error!(
"failed sync safekeepers check {:?} {:?} {:?}",
join_errors, task_errors, timeout_errors
);
return Ok(None);
}
Ok(check_if_synced(responses))
}
// Fast path for sync_safekeepers. If they're already synced we get the lsn
// in one roundtrip. If not, we should do a full sync_safekeepers.
pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
let start_time = Utc::now();
// Run actual work with new tokio runtime
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.expect("failed to create rt");
let result = rt.block_on(self.check_safekeepers_synced_async(compute_state));
// Record runtime
self.state.lock().unwrap().metrics.sync_sk_check_ms = Utc::now()
.signed_duration_since(start_time)
.to_std()
.unwrap()
.as_millis() as u64;
result
}
// Run `postgres` in a special mode with `--sync-safekeepers` argument // Run `postgres` in a special mode with `--sync-safekeepers` argument
// and return the reported LSN back to the caller. // and return the reported LSN back to the caller.
#[instrument(skip_all)] #[instrument(skip(self, storage_auth_token))]
pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> { fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
let start_time = Utc::now(); let start_time = Utc::now();
let sync_handle = Command::new(&self.pgbin) let sync_handle = Command::new(&self.pgbin)
@@ -472,7 +248,7 @@ impl ComputeNode {
/// Do all the preparations like PGDATA directory creation, configuration, /// Do all the preparations like PGDATA directory creation, configuration,
/// safekeepers sync, basebackup, etc. /// safekeepers sync, basebackup, etc.
#[instrument(skip_all)] #[instrument(skip(self, compute_state))]
pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let pspec = compute_state.pspec.as_ref().expect("spec must be set");
let spec = &pspec.spec; let spec = &pspec.spec;
@@ -487,14 +263,10 @@ impl ComputeNode {
// cannot sync safekeepers. // cannot sync safekeepers.
let lsn = match spec.mode { let lsn = match spec.mode {
ComputeMode::Primary => { ComputeMode::Primary => {
info!("checking if safekeepers are synced"); info!("starting safekeepers syncing");
let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) { let lsn = self
lsn .sync_safekeepers(pspec.storage_auth_token.clone())
} else { .with_context(|| "failed to sync safekeepers")?;
info!("starting safekeepers syncing");
self.sync_safekeepers(pspec.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?
};
info!("safekeepers synced at LSN {}", lsn); info!("safekeepers synced at LSN {}", lsn);
lsn lsn
} }
@@ -523,8 +295,8 @@ impl ComputeNode {
update_pg_hba(pgdata_path)?; update_pg_hba(pgdata_path)?;
match spec.mode { match spec.mode {
ComputeMode::Primary => {} ComputeMode::Primary | ComputeMode::Static(..) => {}
ComputeMode::Replica | ComputeMode::Static(..) => { ComputeMode::Replica => {
add_standby_signal(pgdata_path)?; add_standby_signal(pgdata_path)?;
} }
} }
@@ -532,53 +304,9 @@ impl ComputeNode {
Ok(()) Ok(())
} }
/// Start and stop a postgres process to warm up the VM for startup.
pub fn prewarm_postgres(&self) -> Result<()> {
info!("prewarming");
// Create pgdata
let pgdata = &format!("{}.warmup", self.pgdata);
create_pgdata(pgdata)?;
// Run initdb to completion
info!("running initdb");
let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
Command::new(initdb_bin)
.args(["-D", pgdata])
.output()
.expect("cannot start initdb process");
// Write conf
use std::io::Write;
let conf_path = Path::new(pgdata).join("postgresql.conf");
let mut file = std::fs::File::create(conf_path)?;
writeln!(file, "shared_buffers=65536")?;
writeln!(file, "port=51055")?; // Nobody should be connecting
writeln!(file, "shared_preload_libraries = 'neon'")?;
// Start postgres
info!("starting postgres");
let mut pg = Command::new(&self.pgbin)
.args(["-D", pgdata])
.spawn()
.expect("cannot start postgres process");
// Stop it when it's ready
info!("waiting for postgres");
wait_for_postgres(&mut pg, Path::new(pgdata))?;
pg.kill()?;
info!("sent kill signal");
pg.wait()?;
info!("done prewarming");
// clean up
let _ok = fs::remove_dir_all(pgdata);
Ok(())
}
/// Start Postgres as a child process and manage DBs/roles. /// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit. /// After that this will hang waiting on the postmaster process to exit.
#[instrument(skip_all)] #[instrument(skip(self))]
pub fn start_postgres( pub fn start_postgres(
&self, &self,
storage_auth_token: Option<String>, storage_auth_token: Option<String>,
@@ -602,7 +330,7 @@ impl ComputeNode {
} }
/// Do initial configuration of the already started Postgres. /// Do initial configuration of the already started Postgres.
#[instrument(skip_all)] #[instrument(skip(self, compute_state))]
pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
// If connection fails, // If connection fails,
// it may be the old node with `zenith_admin` superuser. // it may be the old node with `zenith_admin` superuser.
@@ -623,8 +351,6 @@ impl ComputeNode {
.map_err(|_| anyhow::anyhow!("invalid connstr"))?; .map_err(|_| anyhow::anyhow!("invalid connstr"))?;
let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?; let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
// Disable forwarding so that users don't get a cloud_admin role
client.simple_query("SET neon.forward_ddl = false")?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?; client.simple_query("GRANT zenith_admin TO cloud_admin")?;
drop(client); drop(client);
@@ -635,28 +361,29 @@ impl ComputeNode {
Ok(client) => client, Ok(client) => client,
}; };
// Disable DDL forwarding because control plane already knows about these roles/databases.
client.simple_query("SET neon.forward_ddl = false")?;
// Proceed with post-startup configuration. Note, that order of operations is important. // Proceed with post-startup configuration. Note, that order of operations is important.
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
create_neon_superuser(spec, &mut client)?;
handle_roles(spec, &mut client)?; handle_roles(spec, &mut client)?;
handle_databases(spec, &mut client)?; handle_databases(spec, &mut client)?;
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?; handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
handle_grants(spec, self.connstr.as_str())?; handle_grants(spec, self.connstr.as_str(), &mut client)?;
handle_extensions(spec, &mut client)?; handle_extensions(spec, &mut client)?;
// 'Close' connection // 'Close' connection
drop(client); drop(client);
info!(
"finished configuration of compute for project {}",
spec.cluster.cluster_id
);
Ok(()) Ok(())
} }
// We could've wrapped this around `pg_ctl reload`, but right now we don't use // We could've wrapped this around `pg_ctl reload`, but right now we don't use
// `pg_ctl` for start / stop, so this just seems much easier to do as we already // `pg_ctl` for start / stop, so this just seems much easier to do as we already
// have opened connection to Postgres and superuser access. // have opened connection to Postgres and superuser access.
#[instrument(skip_all)] #[instrument(skip(self, client))]
fn pg_reload_conf(&self, client: &mut Client) -> Result<()> { fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
client.simple_query("SELECT pg_reload_conf()")?; client.simple_query("SELECT pg_reload_conf()")?;
Ok(()) Ok(())
@@ -664,7 +391,7 @@ impl ComputeNode {
/// Similar to `apply_config()`, but does a bit different sequence of operations, /// Similar to `apply_config()`, but does a bit different sequence of operations,
/// as it's used to reconfigure a previously started and configured Postgres node. /// as it's used to reconfigure a previously started and configured Postgres node.
#[instrument(skip_all)] #[instrument(skip(self))]
pub fn reconfigure(&self) -> Result<()> { pub fn reconfigure(&self) -> Result<()> {
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec; let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
@@ -676,13 +403,11 @@ impl ComputeNode {
self.pg_reload_conf(&mut client)?; self.pg_reload_conf(&mut client)?;
// Proceed with post-startup configuration. Note, that order of operations is important. // Proceed with post-startup configuration. Note, that order of operations is important.
// Disable DDL forwarding because control plane already knows about these roles/databases.
if spec.mode == ComputeMode::Primary { if spec.mode == ComputeMode::Primary {
client.simple_query("SET neon.forward_ddl = false")?;
handle_roles(&spec, &mut client)?; handle_roles(&spec, &mut client)?;
handle_databases(&spec, &mut client)?; handle_databases(&spec, &mut client)?;
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
handle_grants(&spec, self.connstr.as_str())?; handle_grants(&spec, self.connstr.as_str(), &mut client)?;
handle_extensions(&spec, &mut client)?; handle_extensions(&spec, &mut client)?;
} }
@@ -699,38 +424,33 @@ impl ComputeNode {
Ok(()) Ok(())
} }
#[instrument(skip_all)] #[instrument(skip(self))]
pub fn start_compute(&self) -> Result<std::process::Child> { pub fn start_compute(&self) -> Result<std::process::Child> {
let compute_state = self.state.lock().unwrap().clone(); let compute_state = self.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = compute_state.pspec.as_ref().expect("spec must be set");
info!( info!(
"starting compute for project {}, operation {}, tenant {}, timeline {}", "starting compute for project {}, operation {}, tenant {}, timeline {}",
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), spec.spec.cluster.cluster_id,
pspec.spec.operation_uuid.as_deref().unwrap_or("None"), spec.spec.operation_uuid.as_deref().unwrap_or("None"),
pspec.tenant_id, spec.tenant_id,
pspec.timeline_id, spec.timeline_id,
); );
self.prepare_pgdata(&compute_state)?; self.prepare_pgdata(&compute_state)?;
let start_time = Utc::now(); let start_time = Utc::now();
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
let config_time = Utc::now(); let pg = self.start_postgres(spec.storage_auth_token.clone())?;
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
if spec.spec.mode == ComputeMode::Primary {
self.apply_config(&compute_state)?; self.apply_config(&compute_state)?;
} }
let startup_end_time = Utc::now(); let startup_end_time = Utc::now();
{ {
let mut state = self.state.lock().unwrap(); let mut state = self.state.lock().unwrap();
state.metrics.start_postgres_ms = config_time
.signed_duration_since(start_time)
.to_std()
.unwrap()
.as_millis() as u64;
state.metrics.config_ms = startup_end_time state.metrics.config_ms = startup_end_time
.signed_duration_since(config_time) .signed_duration_since(start_time)
.to_std() .to_std()
.unwrap() .unwrap()
.as_millis() as u64; .as_millis() as u64;
@@ -742,18 +462,6 @@ impl ComputeNode {
} }
self.set_status(ComputeStatus::Running); self.set_status(ComputeStatus::Running);
info!(
"finished configuration of compute for project {}",
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
);
// Log metrics so that we can search for slow operations in logs
let metrics = {
let state = self.state.lock().unwrap();
state.metrics.clone()
};
info!(?metrics, "compute start finished");
Ok(pg) Ok(pg)
} }

View File

@@ -5,7 +5,6 @@ use std::path::Path;
use anyhow::Result; use anyhow::Result;
use crate::pg_helpers::escape_conf_value;
use crate::pg_helpers::PgOptionsSerialize; use crate::pg_helpers::PgOptionsSerialize;
use compute_api::spec::{ComputeMode, ComputeSpec}; use compute_api::spec::{ComputeMode, ComputeSpec};
@@ -37,36 +36,10 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
// File::create() destroys the file content if it exists. // File::create() destroys the file content if it exists.
let mut file = File::create(path)?; let mut file = File::create(path)?;
// Write the postgresql.conf content from the spec file as is. writeln!(file, "# Managed by compute_ctl: begin")?;
if let Some(conf) = &spec.cluster.postgresql_conf {
writeln!(file, "{}", conf)?;
}
write!(file, "{}", &spec.cluster.settings.as_pg_settings())?; write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;
// Add options for connecting to storage
writeln!(file, "# Neon storage settings")?;
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
}
if !spec.safekeeper_connstrings.is_empty() {
writeln!(
file,
"neon.safekeepers={}",
escape_conf_value(&spec.safekeeper_connstrings.join(","))
)?;
}
if let Some(s) = &spec.tenant_id {
writeln!(file, "neon.tenant_id={}", escape_conf_value(&s.to_string()))?;
}
if let Some(s) = &spec.timeline_id {
writeln!(
file,
"neon.timeline_id={}",
escape_conf_value(&s.to_string())
)?;
}
match spec.mode { match spec.mode {
ComputeMode::Primary => {} ComputeMode::Primary => {}
ComputeMode::Static(lsn) => { ComputeMode::Static(lsn) => {
@@ -80,12 +53,7 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
} }
} }
// If there are any extra options in the 'settings' field, append those writeln!(file, "# Managed by compute_ctl: end")?;
if spec.cluster.settings.is_some() {
writeln!(file, "# Managed by compute_ctl: begin")?;
write!(file, "{}", spec.cluster.settings.as_pg_settings())?;
writeln!(file, "# Managed by compute_ctl: end")?;
}
Ok(()) Ok(())
} }

View File

@@ -1,13 +1,14 @@
use std::sync::Arc; use std::sync::Arc;
use std::thread; use std::thread;
use anyhow::Result;
use tracing::{error, info, instrument}; use tracing::{error, info, instrument};
use compute_api::responses::ComputeStatus; use compute_api::responses::ComputeStatus;
use crate::compute::ComputeNode; use crate::compute::ComputeNode;
#[instrument(skip_all)] #[instrument(skip(compute))]
fn configurator_main_loop(compute: &Arc<ComputeNode>) { fn configurator_main_loop(compute: &Arc<ComputeNode>) {
info!("waiting for reconfiguration requests"); info!("waiting for reconfiguration requests");
loop { loop {
@@ -41,14 +42,13 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
} }
} }
pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> { pub fn launch_configurator(compute: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
let compute = Arc::clone(compute); let compute = Arc::clone(compute);
thread::Builder::new() Ok(thread::Builder::new()
.name("compute-configurator".into()) .name("compute-configurator".into())
.spawn(move || { .spawn(move || {
configurator_main_loop(&compute); configurator_main_loop(&compute);
info!("configurator thread is exited"); info!("configurator thread is exited");
}) })?)
.expect("cannot launch configurator thread")
} }

View File

@@ -220,8 +220,8 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
// Main Hyper HTTP server function that runs it and blocks waiting on it forever. // Main Hyper HTTP server function that runs it and blocks waiting on it forever.
#[tokio::main] #[tokio::main]
async fn serve(port: u16, state: Arc<ComputeNode>) { async fn serve(state: Arc<ComputeNode>) {
let addr = SocketAddr::from(([0, 0, 0, 0], port)); let addr = SocketAddr::from(([0, 0, 0, 0], 3080));
let make_service = make_service_fn(move |_conn| { let make_service = make_service_fn(move |_conn| {
let state = state.clone(); let state = state.clone();
@@ -256,10 +256,10 @@ async fn serve(port: u16, state: Arc<ComputeNode>) {
} }
/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`. /// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> { pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
let state = Arc::clone(state); let state = Arc::clone(state);
Ok(thread::Builder::new() Ok(thread::Builder::new()
.name("http-endpoint".into()) .name("http-endpoint".into())
.spawn(move || serve(port, state))?) .spawn(move || serve(state))?)
} }

View File

@@ -13,4 +13,3 @@ pub mod monitor;
pub mod params; pub mod params;
pub mod pg_helpers; pub mod pg_helpers;
pub mod spec; pub mod spec;
pub mod sync_sk;

View File

@@ -18,7 +18,6 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level)); .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level));
let fmt_layer = tracing_subscriber::fmt::layer() let fmt_layer = tracing_subscriber::fmt::layer()
.with_ansi(false)
.with_target(false) .with_target(false)
.with_writer(std::io::stderr); .with_writer(std::io::stderr);
@@ -34,7 +33,5 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
.init(); .init();
tracing::info!("logging and tracing started"); tracing::info!("logging and tracing started");
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
Ok(()) Ok(())
} }

View File

@@ -1,6 +1,7 @@
use std::sync::Arc; use std::sync::Arc;
use std::{thread, time}; use std::{thread, time};
use anyhow::Result;
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use postgres::{Client, NoTls}; use postgres::{Client, NoTls};
use tracing::{debug, info}; use tracing::{debug, info};
@@ -104,11 +105,10 @@ fn watch_compute_activity(compute: &ComputeNode) {
} }
/// Launch a separate compute monitor thread and return its `JoinHandle`. /// Launch a separate compute monitor thread and return its `JoinHandle`.
pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> { pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
let state = Arc::clone(state); let state = Arc::clone(state);
thread::Builder::new() Ok(thread::Builder::new()
.name("compute-monitor".into()) .name("compute-monitor".into())
.spawn(move || watch_compute_activity(&state)) .spawn(move || watch_compute_activity(&state))?)
.expect("cannot launch compute monitor thread")
} }

View File

@@ -16,26 +16,15 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
/// Escape a string for including it in a SQL literal. Wrapping the result /// Escape a string for including it in a SQL literal
/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use fn escape_literal(s: &str) -> String {
/// SQL string literal, e.g. `'db'''` or `E'db\\'`. s.replace('\'', "''").replace('\\', "\\\\")
/// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
/// for the original implementation.
pub fn escape_literal(s: &str) -> String {
let res = s.replace('\'', "''").replace('\\', "\\\\");
if res.contains('\\') {
format!("E'{}'", res)
} else {
format!("'{}'", res)
}
} }
/// Escape a string so that it can be used in postgresql.conf. Wrapping the result /// Escape a string so that it can be used in postgresql.conf.
/// with `'{}'` is not required, as it returns a ready-to-use config string. /// Same as escape_literal, currently.
pub fn escape_conf_value(s: &str) -> String { fn escape_conf_value(s: &str) -> String {
let res = s.replace('\'', "''").replace('\\', "\\\\"); s.replace('\'', "''").replace('\\', "\\\\")
format!("'{}'", res)
} }
trait GenericOptionExt { trait GenericOptionExt {
@@ -48,7 +37,7 @@ impl GenericOptionExt for GenericOption {
fn to_pg_option(&self) -> String { fn to_pg_option(&self) -> String {
if let Some(val) = &self.value { if let Some(val) = &self.value {
match self.vartype.as_ref() { match self.vartype.as_ref() {
"string" => format!("{} {}", self.name, escape_literal(val)), "string" => format!("{} '{}'", self.name, escape_literal(val)),
_ => format!("{} {}", self.name, val), _ => format!("{} {}", self.name, val),
} }
} else { } else {
@@ -60,7 +49,7 @@ impl GenericOptionExt for GenericOption {
fn to_pg_setting(&self) -> String { fn to_pg_setting(&self) -> String {
if let Some(val) = &self.value { if let Some(val) = &self.value {
match self.vartype.as_ref() { match self.vartype.as_ref() {
"string" => format!("{} = {}", self.name, escape_conf_value(val)), "string" => format!("{} = '{}'", self.name, escape_conf_value(val)),
_ => format!("{} = {}", self.name, val), _ => format!("{} = {}", self.name, val),
} }
} else { } else {
@@ -132,8 +121,9 @@ impl RoleExt for Role {
/// string of arguments. /// string of arguments.
fn to_pg_options(&self) -> String { fn to_pg_options(&self) -> String {
// XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane. // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane.
let mut params: String = self.options.as_pg_options(); // For now, we do not use generic `options` for roles. Once used, add
params.push_str(" LOGIN"); // `self.options.as_pg_options()` somewhere here.
let mut params: String = "LOGIN".to_string();
if let Some(pass) = &self.encrypted_password { if let Some(pass) = &self.encrypted_password {
// Some time ago we supported only md5 and treated all encrypted_password as md5. // Some time ago we supported only md5 and treated all encrypted_password as md5.
@@ -226,7 +216,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
/// Wait for Postgres to become ready to accept connections. It's ready to /// Wait for Postgres to become ready to accept connections. It's ready to
/// accept connections when the state-field in `pgdata/postmaster.pid` says /// accept connections when the state-field in `pgdata/postmaster.pid` says
/// 'ready'. /// 'ready'.
#[instrument(skip_all, fields(pgdata = %pgdata.display()))] #[instrument(skip(pg))]
pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
let pid_path = pgdata.join("postmaster.pid"); let pid_path = pgdata.join("postmaster.pid");

View File

@@ -62,7 +62,7 @@ fn do_control_plane_request(
} }
} }
/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN` /// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT`
/// env variable is set, it will be used for authorization. /// env variable is set, it will be used for authorization.
pub fn get_spec_from_control_plane( pub fn get_spec_from_control_plane(
base_uri: &str, base_uri: &str,
@@ -269,13 +269,17 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
xact.execute(query.as_str(), &[])?; xact.execute(query.as_str(), &[])?;
} }
RoleAction::Create => { RoleAction::Create => {
let mut query: String = format!( let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
"CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
name.pg_quote()
);
info!("role create query: '{}'", &query); info!("role create query: '{}'", &query);
query.push_str(&role.to_pg_options()); query.push_str(&role.to_pg_options());
xact.execute(query.as_str(), &[])?; xact.execute(query.as_str(), &[])?;
let grant_query = format!(
"GRANT pg_read_all_data, pg_write_all_data TO {}",
name.pg_quote()
);
xact.execute(grant_query.as_str(), &[])?;
info!("role grant query: '{}'", &grant_query);
} }
} }
@@ -397,44 +401,10 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
// We do not check either DB exists or not, // We do not check either DB exists or not,
// Postgres will take care of it for us // Postgres will take care of it for us
"delete_db" => { "delete_db" => {
// In Postgres we can't drop a database if it is a template. let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.pg_quote());
// So we need to unset the template flag first, but it could
// be a retry, so we could've already dropped the database.
// Check that database exists first to make it idempotent.
let unset_template_query: String = format!(
"
DO $$
BEGIN
IF EXISTS(
SELECT 1
FROM pg_catalog.pg_database
WHERE datname = {}
)
THEN
ALTER DATABASE {} is_template false;
END IF;
END
$$;",
escape_literal(&op.name),
&op.name.pg_quote()
);
// Use FORCE to drop database even if there are active connections.
// We run this from `cloud_admin`, so it should have enough privileges.
// NB: there could be other db states, which prevent us from dropping
// the database. For example, if db is used by any active subscription
// or replication slot.
// TODO: deal with it once we allow logical replication. Proper fix should
// involve returning an error code to the control plane, so it could
// figure out that this is a non-retryable error, return it to the user
// and fail operation permanently.
let drop_db_query: String = format!(
"DROP DATABASE IF EXISTS {} WITH (FORCE)",
&op.name.pg_quote()
);
warn!("deleting database '{}'", &op.name); warn!("deleting database '{}'", &op.name);
client.execute(unset_template_query.as_str(), &[])?; client.execute(query.as_str(), &[])?;
client.execute(drop_db_query.as_str(), &[])?;
} }
"rename_db" => { "rename_db" => {
let new_name = op.new_name.as_ref().unwrap(); let new_name = op.new_name.as_ref().unwrap();
@@ -506,11 +476,6 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
query.push_str(&db.to_pg_options()); query.push_str(&db.to_pg_options());
let _guard = info_span!("executing", query).entered(); let _guard = info_span!("executing", query).entered();
client.execute(query.as_str(), &[])?; client.execute(query.as_str(), &[])?;
let grant_query: String = format!(
"GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
name.pg_quote()
);
client.execute(grant_query.as_str(), &[])?;
} }
}; };
@@ -530,9 +495,35 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
/// to allow users creating trusted extensions and re-creating `public` schema, for example. /// to allow users creating trusted extensions and re-creating `public` schema, for example.
#[instrument(skip_all)] #[instrument(skip_all)]
pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> { pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
info!("cluster spec grants:"); info!("cluster spec grants:");
// We now have a separate `web_access` role to connect to the database
// via the web interface and proxy link auth. And also we grant a
// read / write all data privilege to every role. So also grant
// create to everyone.
// XXX: later we should stop messing with Postgres ACL in such horrible
// ways.
let roles = spec
.cluster
.roles
.iter()
.map(|r| r.name.pg_quote())
.collect::<Vec<_>>();
for db in &spec.cluster.databases {
let dbname = &db.name;
let query: String = format!(
"GRANT CREATE ON DATABASE {} TO {}",
dbname.pg_quote(),
roles.join(", ")
);
info!("grant query {}", &query);
client.execute(query.as_str(), &[])?;
}
// Do some per-database access adjustments. We'd better do this at db creation time, // Do some per-database access adjustments. We'd better do this at db creation time,
// but CREATE DATABASE isn't transactional. So we cannot create db + do some grants // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
// atomically. // atomically.

View File

@@ -1,98 +0,0 @@
// Utils for running sync_safekeepers
use anyhow::Result;
use tracing::info;
use utils::lsn::Lsn;
#[derive(Copy, Clone, Debug)]
pub enum TimelineStatusResponse {
NotFound,
Ok(TimelineStatusOkResponse),
}
#[derive(Copy, Clone, Debug)]
pub struct TimelineStatusOkResponse {
flush_lsn: Lsn,
commit_lsn: Lsn,
}
/// Get a safekeeper's metadata for our timeline. The id is only used for logging
pub async fn ping_safekeeper(
id: String,
config: tokio_postgres::Config,
) -> Result<TimelineStatusResponse> {
// TODO add retries
// Connect
info!("connecting to {}", id);
let (client, conn) = config.connect(tokio_postgres::NoTls).await?;
tokio::spawn(async move {
if let Err(e) = conn.await {
eprintln!("connection error: {}", e);
}
});
// Query
info!("querying {}", id);
let result = client.simple_query("TIMELINE_STATUS").await?;
// Parse result
info!("done with {}", id);
if let postgres::SimpleQueryMessage::Row(row) = &result[0] {
use std::str::FromStr;
let response = TimelineStatusResponse::Ok(TimelineStatusOkResponse {
flush_lsn: Lsn::from_str(row.get("flush_lsn").unwrap())?,
commit_lsn: Lsn::from_str(row.get("commit_lsn").unwrap())?,
});
Ok(response)
} else {
// Timeline doesn't exist
Ok(TimelineStatusResponse::NotFound)
}
}
/// Given a quorum of responses, check if safekeepers are synced at some Lsn
pub fn check_if_synced(responses: Vec<TimelineStatusResponse>) -> Option<Lsn> {
// Check if all responses are ok
let ok_responses: Vec<TimelineStatusOkResponse> = responses
.iter()
.filter_map(|r| match r {
TimelineStatusResponse::Ok(ok_response) => Some(ok_response),
_ => None,
})
.cloned()
.collect();
if ok_responses.len() < responses.len() {
info!(
"not synced. Only {} out of {} know about this timeline",
ok_responses.len(),
responses.len()
);
return None;
}
// Get the min and the max of everything
let commit: Vec<Lsn> = ok_responses.iter().map(|r| r.commit_lsn).collect();
let flush: Vec<Lsn> = ok_responses.iter().map(|r| r.flush_lsn).collect();
let commit_max = commit.iter().max().unwrap();
let commit_min = commit.iter().min().unwrap();
let flush_max = flush.iter().max().unwrap();
let flush_min = flush.iter().min().unwrap();
// Check that all values are equal
if commit_min != commit_max {
info!("not synced. {:?} {:?}", commit_min, commit_max);
return None;
}
if flush_min != flush_max {
info!("not synced. {:?} {:?}", flush_min, flush_max);
return None;
}
// Check that commit == flush
if commit_max != flush_max {
info!("not synced. {:?} {:?}", commit_max, flush_max);
return None;
}
Some(*commit_max)
}

View File

@@ -16,7 +16,7 @@ mod pg_helpers_tests {
); );
assert_eq!( assert_eq!(
spec.cluster.roles.first().unwrap().to_pg_options(), spec.cluster.roles.first().unwrap().to_pg_options(),
" LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'" "LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'"
); );
} }
@@ -89,12 +89,4 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
assert_eq!(none_generic_options.find("missed_value"), None); assert_eq!(none_generic_options.find("missed_value"), None);
assert_eq!(none_generic_options.find("invalid_value"), None); assert_eq!(none_generic_options.find("invalid_value"), None);
} }
#[test]
fn test_escape_literal() {
assert_eq!(escape_literal("test"), "'test'");
assert_eq!(escape_literal("test'"), "'test'''");
assert_eq!(escape_literal("test\\'"), "E'test\\\\'''");
assert_eq!(escape_literal("test\\'\\'"), "E'test\\\\''\\\\'''");
}
} }

View File

@@ -10,7 +10,7 @@
//! (non-Neon binaries don't necessarily follow our pidfile conventions). //! (non-Neon binaries don't necessarily follow our pidfile conventions).
//! The pid stored in the file is later used to stop the service. //! The pid stored in the file is later used to stop the service.
//! //!
//! See the [`lock_file`](utils::lock_file) module for more info. //! See [`lock_file`] module for more info.
use std::ffi::OsStr; use std::ffi::OsStr;
use std::io::Write; use std::io::Write;
@@ -30,12 +30,12 @@ use utils::pid_file::{self, PidFileRead};
// These constants control the loop used to poll for process start / stop. // These constants control the loop used to poll for process start / stop.
// //
// The loop waits for at most 10 seconds, polling every 100 ms. // The loop waits for at most 20 seconds, polling every 100 ms.
// Once a second, it prints a dot ("."), to give the user an indication that // Once a second, it prints a dot ("."), to give the user an indication that
// it's waiting. If the process hasn't started/stopped after 5 seconds, // it's waiting. If the process hasn't started/stopped after 5 seconds,
// it prints a notice that it's taking long, but keeps waiting. // it prints a notice that it's taking long, but keeps waiting.
// //
const RETRY_UNTIL_SECS: u64 = 10; const RETRY_UNTIL_SECS: u64 = 20;
const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS; const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
const RETRY_INTERVAL_MILLIS: u64 = 100; const RETRY_INTERVAL_MILLIS: u64 = 100;
const DOT_EVERY_RETRIES: u64 = 10; const DOT_EVERY_RETRIES: u64 = 10;
@@ -180,11 +180,6 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
} }
// Wait until process is gone // Wait until process is gone
wait_until_stopped(process_name, pid)?;
Ok(())
}
pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
for retries in 0..RETRIES { for retries in 0..RETRIES {
match process_has_stopped(pid) { match process_has_stopped(pid) {
Ok(true) => { Ok(true) => {

View File

@@ -41,7 +41,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
const DEFAULT_BRANCH_NAME: &str = "main"; const DEFAULT_BRANCH_NAME: &str = "main";
project_git_version!(GIT_VERSION); project_git_version!(GIT_VERSION);
const DEFAULT_PG_VERSION: &str = "15"; const DEFAULT_PG_VERSION: &str = "14";
fn default_conf() -> String { fn default_conf() -> String {
format!( format!(
@@ -308,8 +308,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
let mut env = let mut env =
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
let force = init_match.get_flag("force"); env.init(pg_version)
env.init(pg_version, force)
.context("Failed to initialize neon repository")?; .context("Failed to initialize neon repository")?;
// Initialize pageserver, create initial tenant and timeline. // Initialize pageserver, create initial tenant and timeline.
@@ -477,11 +476,10 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
println!("Creating endpoint for imported timeline ..."); println!("Creating endpoint for imported timeline ...");
cplane.new_endpoint( cplane.new_endpoint(
name,
tenant_id, tenant_id,
name,
timeline_id, timeline_id,
None, None,
None,
pg_version, pg_version,
ComputeMode::Primary, ComputeMode::Primary,
)?; )?;
@@ -593,7 +591,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
table.add_row([ table.add_row([
endpoint_id.as_str(), endpoint_id.as_str(),
&endpoint.pg_address.to_string(), &endpoint.address.to_string(),
&endpoint.timeline_id.to_string(), &endpoint.timeline_id.to_string(),
branch_name, branch_name,
lsn_str.as_str(), lsn_str.as_str(),
@@ -622,8 +620,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
.get_branch_timeline_id(branch_name, tenant_id) .get_branch_timeline_id(branch_name, tenant_id)
.ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?; .ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?;
let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied(); let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
let pg_version = sub_args let pg_version = sub_args
.get_one::<u32>("pg-version") .get_one::<u32>("pg-version")
.copied() .copied()
@@ -641,38 +639,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
(Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
}; };
cplane.new_endpoint( cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?;
&endpoint_id,
tenant_id,
timeline_id,
pg_port,
http_port,
pg_version,
mode,
)?;
} }
"start" => { "start" => {
let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied(); let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
let endpoint_id = sub_args let endpoint_id = sub_args
.get_one::<String>("endpoint_id") .get_one::<String>("endpoint_id")
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?; .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
// If --safekeepers argument is given, use only the listed safekeeper nodes.
let safekeepers =
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
let mut safekeepers: Vec<NodeId> = Vec::new();
for sk_id in safekeepers_str.split(',').map(str::trim) {
let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| {
anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list")
})?);
safekeepers.push(sk_id);
}
safekeepers
} else {
env.safekeepers.iter().map(|sk| sk.id).collect()
};
let endpoint = cplane.endpoints.get(endpoint_id.as_str()); let endpoint = cplane.endpoints.get(endpoint_id.as_str());
let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) { let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
@@ -699,7 +673,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
_ => {} _ => {}
} }
println!("Starting existing endpoint {endpoint_id}..."); println!("Starting existing endpoint {endpoint_id}...");
endpoint.start(&auth_token, safekeepers)?; endpoint.start(&auth_token)?;
} else { } else {
let branch_name = sub_args let branch_name = sub_args
.get_one::<String>("branch-name") .get_one::<String>("branch-name")
@@ -735,15 +709,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ..."); println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
let ep = cplane.new_endpoint( let ep = cplane.new_endpoint(
endpoint_id,
tenant_id, tenant_id,
endpoint_id,
timeline_id, timeline_id,
pg_port, port,
http_port,
pg_version, pg_version,
mode, mode,
)?; )?;
ep.start(&auth_token, safekeepers)?; ep.start(&auth_token)?;
} }
} }
"stop" => { "stop" => {
@@ -971,22 +944,11 @@ fn cli() -> Command {
.value_parser(value_parser!(u32)) .value_parser(value_parser!(u32))
.default_value(DEFAULT_PG_VERSION); .default_value(DEFAULT_PG_VERSION);
let pg_port_arg = Arg::new("pg-port") let port_arg = Arg::new("port")
.long("pg-port") .long("port")
.required(false) .required(false)
.value_parser(value_parser!(u16)) .value_parser(value_parser!(u16))
.value_name("pg-port"); .value_name("port");
let http_port_arg = Arg::new("http-port")
.long("http-port")
.required(false)
.value_parser(value_parser!(u16))
.value_name("http-port");
let safekeepers_arg = Arg::new("safekeepers")
.long("safekeepers")
.required(false)
.value_name("safekeepers");
let stop_mode_arg = Arg::new("stop-mode") let stop_mode_arg = Arg::new("stop-mode")
.short('m') .short('m')
@@ -1014,13 +976,6 @@ fn cli() -> Command {
.help("If set, the node will be a hot replica on the specified timeline") .help("If set, the node will be a hot replica on the specified timeline")
.required(false); .required(false);
let force_arg = Arg::new("force")
.value_parser(value_parser!(bool))
.long("force")
.action(ArgAction::SetTrue)
.help("Force initialization even if the repository is not empty")
.required(false);
Command::new("Neon CLI") Command::new("Neon CLI")
.arg_required_else_help(true) .arg_required_else_help(true)
.version(GIT_VERSION) .version(GIT_VERSION)
@@ -1036,7 +991,6 @@ fn cli() -> Command {
.value_name("config"), .value_name("config"),
) )
.arg(pg_version_arg.clone()) .arg(pg_version_arg.clone())
.arg(force_arg)
) )
.subcommand( .subcommand(
Command::new("timeline") Command::new("timeline")
@@ -1139,8 +1093,7 @@ fn cli() -> Command {
.arg(branch_name_arg.clone()) .arg(branch_name_arg.clone())
.arg(tenant_id_arg.clone()) .arg(tenant_id_arg.clone())
.arg(lsn_arg.clone()) .arg(lsn_arg.clone())
.arg(pg_port_arg.clone()) .arg(port_arg.clone())
.arg(http_port_arg.clone())
.arg( .arg(
Arg::new("config-only") Arg::new("config-only")
.help("Don't do basebackup, create endpoint directory with only config files") .help("Don't do basebackup, create endpoint directory with only config files")
@@ -1156,11 +1109,9 @@ fn cli() -> Command {
.arg(branch_name_arg) .arg(branch_name_arg)
.arg(timeline_id_arg) .arg(timeline_id_arg)
.arg(lsn_arg) .arg(lsn_arg)
.arg(pg_port_arg) .arg(port_arg)
.arg(http_port_arg)
.arg(pg_version_arg) .arg(pg_version_arg)
.arg(hot_standby_arg) .arg(hot_standby_arg)
.arg(safekeepers_arg)
) )
.subcommand( .subcommand(
Command::new("stop") Command::new("stop")

View File

@@ -1,10 +1,3 @@
//! Code to manage the storage broker
//!
//! In the local test environment, the data for each safekeeper is stored in
//!
//! ```text
//! .neon/safekeepers/<safekeeper id>
//! ```
use anyhow::Context; use anyhow::Context;
use std::path::PathBuf; use std::path::PathBuf;

View File

@@ -1,75 +1,41 @@
//! Code to manage compute endpoints
//!
//! In the local test environment, the data for each endpoint is stored in
//!
//! ```text
//! .neon/endpoints/<endpoint id>
//! ```
//!
//! Some basic information about the endpoint, like the tenant and timeline IDs,
//! are stored in the `endpoint.json` file. The `endpoint.json` file is created
//! when the endpoint is created, and doesn't change afterwards.
//!
//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
//! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
//! the basebackup from the pageserver to initialize the the data directory, and
//! finally launches the PostgreSQL process. It watches the PostgreSQL process
//! until it exits.
//!
//! When an endpoint is created, a `postgresql.conf` file is also created in
//! the endpoint's directory. The file can be modified before starting PostgreSQL.
//! However, the `postgresql.conf` file in the endpoint directory is not used directly
//! by PostgreSQL. It is passed to `compute_ctl`, and `compute_ctl` writes another
//! copy of it in the data directory.
//!
//! Directory contents:
//!
//! ```text
//! .neon/endpoints/main/
//! compute.log - log output of `compute_ctl` and `postgres`
//! endpoint.json - serialized `EndpointConf` struct
//! postgresql.conf - postgresql settings
//! spec.json - passed to `compute_ctl`
//! pgdata/
//! postgresql.conf - copy of postgresql.conf created by `compute_ctl`
//! zenith.signal
//! <other PostgreSQL files>
//! ```
//!
use std::collections::BTreeMap; use std::collections::BTreeMap;
use std::fs::{self, File};
use std::io::Write;
use std::net::SocketAddr; use std::net::SocketAddr;
use std::net::TcpStream; use std::net::TcpStream;
use std::os::unix::fs::PermissionsExt;
use std::path::PathBuf; use std::path::PathBuf;
use std::process::Command; use std::process::{Command, Stdio};
use std::str::FromStr;
use std::sync::Arc; use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result}; use anyhow::{Context, Result};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr}; use serde_with::{serde_as, DisplayFromStr};
use utils::id::{NodeId, TenantId, TimelineId}; use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::local_env::LocalEnv; use crate::local_env::LocalEnv;
use crate::pageserver::PageServerNode; use crate::pageserver::PageServerNode;
use crate::postgresql_conf::PostgresConf; use crate::postgresql_conf::PostgresConf;
use compute_api::responses::{ComputeState, ComputeStatus}; use compute_api::spec::ComputeMode;
use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
// contents of a endpoint.json file // contents of a endpoint.json file
#[serde_as] #[serde_as]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct EndpointConf { pub struct EndpointConf {
endpoint_id: String, name: String,
#[serde_as(as = "DisplayFromStr")] #[serde_as(as = "DisplayFromStr")]
tenant_id: TenantId, tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")] #[serde_as(as = "DisplayFromStr")]
timeline_id: TimelineId, timeline_id: TimelineId,
mode: ComputeMode, mode: ComputeMode,
pg_port: u16, port: u16,
http_port: u16,
pg_version: u32, pg_version: u32,
skip_pg_catalog_updates: bool,
} }
// //
@@ -91,11 +57,11 @@ impl ComputeControlPlane {
let pageserver = Arc::new(PageServerNode::from_env(&env)); let pageserver = Arc::new(PageServerNode::from_env(&env));
let mut endpoints = BTreeMap::default(); let mut endpoints = BTreeMap::default();
for endpoint_dir in std::fs::read_dir(env.endpoints_path()) for endpoint_dir in fs::read_dir(env.endpoints_path())
.with_context(|| format!("failed to list {}", env.endpoints_path().display()))? .with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
{ {
let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?; let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep)); endpoints.insert(ep.name.clone(), Arc::new(ep));
} }
Ok(ComputeControlPlane { Ok(ComputeControlPlane {
@@ -110,58 +76,47 @@ impl ComputeControlPlane {
1 + self 1 + self
.endpoints .endpoints
.values() .values()
.map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port())) .map(|ep| ep.address.port())
.max() .max()
.unwrap_or(self.base_port) .unwrap_or(self.base_port)
} }
#[allow(clippy::too_many_arguments)]
pub fn new_endpoint( pub fn new_endpoint(
&mut self, &mut self,
endpoint_id: &str,
tenant_id: TenantId, tenant_id: TenantId,
name: &str,
timeline_id: TimelineId, timeline_id: TimelineId,
pg_port: Option<u16>, port: Option<u16>,
http_port: Option<u16>,
pg_version: u32, pg_version: u32,
mode: ComputeMode, mode: ComputeMode,
) -> Result<Arc<Endpoint>> { ) -> Result<Arc<Endpoint>> {
let pg_port = pg_port.unwrap_or_else(|| self.get_port()); let port = port.unwrap_or_else(|| self.get_port());
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
let ep = Arc::new(Endpoint { let ep = Arc::new(Endpoint {
endpoint_id: endpoint_id.to_owned(), name: name.to_owned(),
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port), address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
env: self.env.clone(), env: self.env.clone(),
pageserver: Arc::clone(&self.pageserver), pageserver: Arc::clone(&self.pageserver),
timeline_id, timeline_id,
mode, mode,
tenant_id, tenant_id,
pg_version, pg_version,
skip_pg_catalog_updates: false,
}); });
ep.create_pgdata()?;
ep.create_endpoint_dir()?;
std::fs::write( std::fs::write(
ep.endpoint_path().join("endpoint.json"), ep.endpoint_path().join("endpoint.json"),
serde_json::to_string_pretty(&EndpointConf { serde_json::to_string_pretty(&EndpointConf {
endpoint_id: endpoint_id.to_string(), name: name.to_string(),
tenant_id, tenant_id,
timeline_id, timeline_id,
mode, mode,
http_port, port,
pg_port,
pg_version, pg_version,
skip_pg_catalog_updates: false,
})?, })?,
)?; )?;
std::fs::write( ep.setup_pg_conf()?;
ep.endpoint_path().join("postgresql.conf"),
ep.setup_pg_conf()?.to_string(),
)?;
self.endpoints self.endpoints.insert(ep.name.clone(), Arc::clone(&ep));
.insert(ep.endpoint_id.clone(), Arc::clone(&ep));
Ok(ep) Ok(ep)
} }
@@ -172,25 +127,19 @@ impl ComputeControlPlane {
#[derive(Debug)] #[derive(Debug)]
pub struct Endpoint { pub struct Endpoint {
/// used as the directory name /// used as the directory name
endpoint_id: String, name: String,
pub tenant_id: TenantId, pub tenant_id: TenantId,
pub timeline_id: TimelineId, pub timeline_id: TimelineId,
pub mode: ComputeMode, pub mode: ComputeMode,
// port and address of the Postgres server and `compute_ctl`'s HTTP API // port and address of the Postgres server
pub pg_address: SocketAddr, pub address: SocketAddr,
pub http_address: SocketAddr,
// postgres major version in the format: 14, 15, etc.
pg_version: u32, pg_version: u32,
// These are not part of the endpoint as such, but the environment // These are not part of the endpoint as such, but the environment
// the endpoint runs in. // the endpoint runs in.
pub env: LocalEnv, pub env: LocalEnv,
pageserver: Arc<PageServerNode>, pageserver: Arc<PageServerNode>,
// Optimizations
skip_pg_catalog_updates: bool,
} }
impl Endpoint { impl Endpoint {
@@ -208,37 +157,123 @@ impl Endpoint {
// parse data directory name // parse data directory name
let fname = entry.file_name(); let fname = entry.file_name();
let endpoint_id = fname.to_str().unwrap().to_string(); let name = fname.to_str().unwrap().to_string();
// Read the endpoint.json file // Read the endpoint.json file
let conf: EndpointConf = let conf: EndpointConf =
serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
// ok now
Ok(Endpoint { Ok(Endpoint {
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port), address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port), name,
endpoint_id,
env: env.clone(), env: env.clone(),
pageserver: Arc::clone(pageserver), pageserver: Arc::clone(pageserver),
timeline_id: conf.timeline_id, timeline_id: conf.timeline_id,
mode: conf.mode, mode: conf.mode,
tenant_id: conf.tenant_id, tenant_id: conf.tenant_id,
pg_version: conf.pg_version, pg_version: conf.pg_version,
skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
}) })
} }
fn create_endpoint_dir(&self) -> Result<()> { fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
std::fs::create_dir_all(self.endpoint_path()).with_context(|| { let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
format!( let mut cmd = Command::new(pg_path);
"could not create endpoint directory {}",
self.endpoint_path().display() cmd.arg("--sync-safekeepers")
.env_clear()
.env(
"LD_LIBRARY_PATH",
self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
) )
}) .env(
"DYLD_LIBRARY_PATH",
self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
)
.env("PGDATA", self.pgdata().to_str().unwrap())
.stdout(Stdio::piped())
// Comment this to avoid capturing stderr (useful if command hangs)
.stderr(Stdio::piped());
if let Some(token) = auth_token {
cmd.env("NEON_AUTH_TOKEN", token);
}
let sync_handle = cmd
.spawn()
.expect("postgres --sync-safekeepers failed to start");
let sync_output = sync_handle
.wait_with_output()
.expect("postgres --sync-safekeepers failed");
if !sync_output.status.success() {
anyhow::bail!(
"sync-safekeepers failed: '{}'",
String::from_utf8_lossy(&sync_output.stderr)
);
}
let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
println!("Safekeepers synced on {}", lsn);
Ok(lsn)
} }
// Generate postgresql.conf with default configuration /// Get basebackup from the pageserver as a tar archive and extract it
fn setup_pg_conf(&self) -> Result<PostgresConf> { /// to the `self.pgdata()` directory.
fn do_basebackup(&self, lsn: Option<Lsn>) -> Result<()> {
println!(
"Extracting base backup to create postgres instance: path={} port={}",
self.pgdata().display(),
self.address.port()
);
let sql = if let Some(lsn) = lsn {
format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn)
} else {
format!("basebackup {} {}", self.tenant_id, self.timeline_id)
};
let mut client = self
.pageserver
.page_server_psql_client()
.context("connecting to page server failed")?;
let copyreader = client
.copy_out(sql.as_str())
.context("page server 'basebackup' command failed")?;
// Read the archive directly from the `CopyOutReader`
//
// Set `ignore_zeros` so that unpack() reads all the Copy data and
// doesn't stop at the end-of-archive marker. Otherwise, if the server
// sends an Error after finishing the tarball, we will not notice it.
let mut ar = tar::Archive::new(copyreader);
ar.set_ignore_zeros(true);
ar.unpack(&self.pgdata())
.context("extracting base backup failed")?;
Ok(())
}
fn create_pgdata(&self) -> Result<()> {
fs::create_dir_all(self.pgdata()).with_context(|| {
format!(
"could not create data directory {}",
self.pgdata().display()
)
})?;
fs::set_permissions(self.pgdata().as_path(), fs::Permissions::from_mode(0o700))
.with_context(|| {
format!(
"could not set permissions in data directory {}",
self.pgdata().display()
)
})
}
// Write postgresql.conf with default configuration
// and PG_VERSION file to the data directory of a new endpoint.
fn setup_pg_conf(&self) -> Result<()> {
let mut conf = PostgresConf::new(); let mut conf = PostgresConf::new();
conf.append("max_wal_senders", "10"); conf.append("max_wal_senders", "10");
conf.append("wal_log_hints", "off"); conf.append("wal_log_hints", "off");
@@ -251,14 +286,25 @@ impl Endpoint {
// wal_sender_timeout is the maximum time to wait for WAL replication. // wal_sender_timeout is the maximum time to wait for WAL replication.
// It also defines how often the walreciever will send a feedback message to the wal sender. // It also defines how often the walreciever will send a feedback message to the wal sender.
conf.append("wal_sender_timeout", "5s"); conf.append("wal_sender_timeout", "5s");
conf.append("listen_addresses", &self.pg_address.ip().to_string()); conf.append("listen_addresses", &self.address.ip().to_string());
conf.append("port", &self.pg_address.port().to_string()); conf.append("port", &self.address.port().to_string());
conf.append("wal_keep_size", "0"); conf.append("wal_keep_size", "0");
// walproposer panics when basebackup is invalid, it is pointless to restart in this case. // walproposer panics when basebackup is invalid, it is pointless to restart in this case.
conf.append("restart_after_crash", "off"); conf.append("restart_after_crash", "off");
// Load the 'neon' extension // Configure the Neon Postgres extension to fetch pages from pageserver
let pageserver_connstr = {
let config = &self.pageserver.pg_connection_config;
let (host, port) = (config.host(), config.port());
// NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
format!("postgresql://no_user@{host}:{port}")
};
conf.append("shared_preload_libraries", "neon"); conf.append("shared_preload_libraries", "neon");
conf.append_line("");
conf.append("neon.pageserver_connstring", &pageserver_connstr);
conf.append("neon.tenant_id", &self.tenant_id.to_string());
conf.append("neon.timeline_id", &self.timeline_id.to_string());
conf.append_line(""); conf.append_line("");
// Replication-related configurations, such as WAL sending // Replication-related configurations, such as WAL sending
@@ -289,7 +335,7 @@ impl Endpoint {
.env .env
.safekeepers .safekeepers
.iter() .iter()
.map(|sk| format!("localhost:{}", sk.get_compute_port())) .map(|sk| format!("localhost:{}", sk.pg_port))
.collect::<Vec<String>>() .collect::<Vec<String>>()
.join(","); .join(",");
conf.append("neon.safekeepers", &safekeepers); conf.append("neon.safekeepers", &safekeepers);
@@ -318,7 +364,7 @@ impl Endpoint {
.env .env
.safekeepers .safekeepers
.iter() .iter()
.map(|x| x.get_compute_port().to_string()) .map(|x| x.pg_port.to_string())
.collect::<Vec<_>>() .collect::<Vec<_>>()
.join(","); .join(",");
let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(","); let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");
@@ -335,19 +381,49 @@ impl Endpoint {
conf.append("primary_conninfo", connstr.as_str()); conf.append("primary_conninfo", connstr.as_str());
conf.append("primary_slot_name", slot_name.as_str()); conf.append("primary_slot_name", slot_name.as_str());
conf.append("hot_standby", "on"); conf.append("hot_standby", "on");
// prefetching of blocks referenced in WAL doesn't make sense for us
// Neon hot standby ignores pages that are not in the shared_buffers
if self.pg_version >= 15 {
conf.append("recovery_prefetch", "off");
}
} }
} }
Ok(conf) let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
file.write_all(conf.to_string().as_bytes())?;
let mut file = File::create(self.pgdata().join("PG_VERSION"))?;
file.write_all(self.pg_version.to_string().as_bytes())?;
Ok(())
}
fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
let backup_lsn = match &self.mode {
ComputeMode::Primary => {
if !self.env.safekeepers.is_empty() {
// LSN 0 means that it is bootstrap and we need to download just
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
// procedure evolves quite actively right now, so let's think about it again
// when things would be more stable (TODO).
let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
if lsn == Lsn(0) {
None
} else {
Some(lsn)
}
} else {
None
}
}
ComputeMode::Static(lsn) => Some(*lsn),
ComputeMode::Replica => {
None // Take the latest snapshot available to start with
}
};
self.do_basebackup(backup_lsn)?;
Ok(())
} }
pub fn endpoint_path(&self) -> PathBuf { pub fn endpoint_path(&self) -> PathBuf {
self.env.endpoints_path().join(&self.endpoint_id) self.env.endpoints_path().join(&self.name)
} }
pub fn pgdata(&self) -> PathBuf { pub fn pgdata(&self) -> PathBuf {
@@ -357,7 +433,7 @@ impl Endpoint {
pub fn status(&self) -> &str { pub fn status(&self) -> &str {
let timeout = Duration::from_millis(300); let timeout = Duration::from_millis(300);
let has_pidfile = self.pgdata().join("postmaster.pid").exists(); let has_pidfile = self.pgdata().join("postmaster.pid").exists();
let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok(); let can_connect = TcpStream::connect_timeout(&self.address, timeout).is_ok();
match (has_pidfile, can_connect) { match (has_pidfile, can_connect) {
(true, true) => "running", (true, true) => "running",
@@ -375,6 +451,8 @@ impl Endpoint {
&[ &[
"-D", "-D",
self.pgdata().to_str().unwrap(), self.pgdata().to_str().unwrap(),
"-l",
self.pgdata().join("pg.log").to_str().unwrap(),
"-w", //wait till pg_ctl actually does what was asked "-w", //wait till pg_ctl actually does what was asked
], ],
args, args,
@@ -407,201 +485,39 @@ impl Endpoint {
String::from_utf8_lossy(&pg_ctl.stderr), String::from_utf8_lossy(&pg_ctl.stderr),
); );
} }
// Also wait for the compute_ctl process to die. It might have some cleanup
// work to do after postgres stops, like syncing safekeepers, etc.
//
// TODO use background_process::stop_process instead
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
let pid = nix::unistd::Pid::from_raw(pid as i32);
crate::background_process::wait_until_stopped("compute_ctl", pid)?;
Ok(()) Ok(())
} }
pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> { pub fn start(&self, auth_token: &Option<String>) -> Result<()> {
if self.status() == "running" { if self.status() == "running" {
anyhow::bail!("The endpoint is already running"); anyhow::bail!("The endpoint is already running");
} }
// Slurp the endpoints/<endpoint id>/postgresql.conf file into // 1. We always start Postgres from scratch, so
// memory. We will include it in the spec file that we pass to // if old dir exists, preserve 'postgresql.conf' and drop the directory
// `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf let postgresql_conf_path = self.pgdata().join("postgresql.conf");
// in the data directory. let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| {
let postgresql_conf_path = self.endpoint_path().join("postgresql.conf"); format!(
let postgresql_conf = match std::fs::read(&postgresql_conf_path) { "failed to read config file in {}",
Ok(content) => String::from_utf8(content)?, postgresql_conf_path.to_str().unwrap()
Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
Err(e) => {
return Err(anyhow::Error::new(e).context(format!(
"failed to read config file in {}",
postgresql_conf_path.to_str().unwrap()
)))
}
};
// We always start the compute node from scratch, so if the Postgres
// data dir exists from a previous launch, remove it first.
if self.pgdata().exists() {
std::fs::remove_dir_all(self.pgdata())?;
}
let pageserver_connstring = {
let config = &self.pageserver.pg_connection_config;
let (host, port) = (config.host(), config.port());
// NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
format!("postgresql://no_user@{host}:{port}")
};
let mut safekeeper_connstrings = Vec::new();
if self.mode == ComputeMode::Primary {
for sk_id in safekeepers {
let sk = self
.env
.safekeepers
.iter()
.find(|node| node.id == sk_id)
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
}
}
// Create spec file
let spec = ComputeSpec {
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
format_version: 1.0,
operation_uuid: None,
cluster: Cluster {
cluster_id: None, // project ID: not used
name: None, // project name: not used
state: None,
roles: vec![],
databases: vec![],
settings: None,
postgresql_conf: Some(postgresql_conf),
},
delta_operations: None,
tenant_id: Some(self.tenant_id),
timeline_id: Some(self.timeline_id),
mode: self.mode,
pageserver_connstring: Some(pageserver_connstring),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
// Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
let logfile = std::fs::OpenOptions::new()
.create(true)
.append(true)
.open(self.endpoint_path().join("compute.log"))?;
// Launch compute_ctl
println!("Starting postgres node at '{}'", self.connstr());
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
cmd.args(["--http-port", &self.http_address.port().to_string()])
.args(["--pgdata", self.pgdata().to_str().unwrap()])
.args(["--connstr", &self.connstr()])
.args([
"--spec-path",
self.endpoint_path().join("spec.json").to_str().unwrap(),
])
.args([
"--pgbin",
self.env
.pg_bin_dir(self.pg_version)?
.join("postgres")
.to_str()
.unwrap(),
])
.stdin(std::process::Stdio::null())
.stderr(logfile.try_clone()?)
.stdout(logfile);
let child = cmd.spawn()?;
// Write down the pid so we can wait for it when we want to stop
// TODO use background_process::start_process instead
let pid = child.id();
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
std::fs::write(pidfile_path, pid.to_string())?;
// Wait for it to start
let mut attempt = 0;
const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
loop {
attempt += 1;
match self.get_status() {
Ok(state) => {
match state.status {
ComputeStatus::Init => {
if attempt == MAX_ATTEMPTS {
bail!("compute startup timed out; still in Init state");
}
// keep retrying
}
ComputeStatus::Running => {
// All good!
break;
}
ComputeStatus::Failed => {
bail!(
"compute startup failed: {}",
state
.error
.as_deref()
.unwrap_or("<no error from compute_ctl>")
);
}
ComputeStatus::Empty
| ComputeStatus::ConfigurationPending
| ComputeStatus::Configuration => {
bail!("unexpected compute status: {:?}", state.status)
}
}
}
Err(e) => {
if attempt == MAX_ATTEMPTS {
return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
}
}
}
std::thread::sleep(ATTEMPT_INTERVAL);
}
Ok(())
}
// Call the /status HTTP API
pub fn get_status(&self) -> Result<ComputeState> {
let client = reqwest::blocking::Client::new();
let response = client
.request(
reqwest::Method::GET,
format!(
"http://{}:{}/status",
self.http_address.ip(),
self.http_address.port()
),
) )
.send()?; })?;
fs::remove_dir_all(self.pgdata())?;
self.create_pgdata()?;
// Interpret the response // 2. Bring back config files
let status = response.status(); fs::write(&postgresql_conf_path, postgresql_conf)?;
if !(status.is_client_error() || status.is_server_error()) {
Ok(response.json()?) // 3. Load basebackup
} else { self.load_basebackup(auth_token)?;
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
let url = response.url().to_owned(); if self.mode != ComputeMode::Primary {
let msg = match response.text() { File::create(self.pgdata().join("standby.signal"))?;
Ok(err_body) => format!("Error: {}", err_body),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
};
Err(anyhow::anyhow!(msg))
} }
// 4. Finally start postgres
println!("Starting postgres at '{}'", self.connstr());
self.pg_ctl(&["start"], auth_token)
} }
pub fn stop(&self, destroy: bool) -> Result<()> { pub fn stop(&self, destroy: bool) -> Result<()> {
@@ -618,7 +534,7 @@ impl Endpoint {
"Destroying postgres data directory '{}'", "Destroying postgres data directory '{}'",
self.pgdata().to_str().unwrap() self.pgdata().to_str().unwrap()
); );
std::fs::remove_dir_all(self.endpoint_path())?; fs::remove_dir_all(self.endpoint_path())?;
} else { } else {
self.pg_ctl(&["stop"], &None)?; self.pg_ctl(&["stop"], &None)?;
} }
@@ -627,10 +543,10 @@ impl Endpoint {
pub fn connstr(&self) -> String { pub fn connstr(&self) -> String {
format!( format!(
"postgresql://{}@{}:{}/{}", "host={} port={} user={} dbname={}",
self.address.ip(),
self.address.port(),
"cloud_admin", "cloud_admin",
self.pg_address.ip(),
self.pg_address.port(),
"postgres" "postgres"
) )
} }

View File

@@ -24,7 +24,7 @@ use utils::{
use crate::safekeeper::SafekeeperNode; use crate::safekeeper::SafekeeperNode;
pub const DEFAULT_PG_VERSION: u32 = 15; pub const DEFAULT_PG_VERSION: u32 = 14;
// //
// This data structures represents neon_local CLI config // This data structures represents neon_local CLI config
@@ -37,7 +37,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct LocalEnv { pub struct LocalEnv {
// Base directory for all the nodes (the pageserver, safekeepers and // Base directory for all the nodes (the pageserver, safekeepers and
// compute endpoints). // compute nodes).
// //
// This is not stored in the config file. Rather, this is the path where the // This is not stored in the config file. Rather, this is the path where the
// config file itself is. It is read from the NEON_REPO_DIR env variable or // config file itself is. It is read from the NEON_REPO_DIR env variable or
@@ -137,7 +137,6 @@ impl Default for PageServerConf {
pub struct SafekeeperConf { pub struct SafekeeperConf {
pub id: NodeId, pub id: NodeId,
pub pg_port: u16, pub pg_port: u16,
pub pg_tenant_only_port: Option<u16>,
pub http_port: u16, pub http_port: u16,
pub sync: bool, pub sync: bool,
pub remote_storage: Option<String>, pub remote_storage: Option<String>,
@@ -150,7 +149,6 @@ impl Default for SafekeeperConf {
Self { Self {
id: NodeId(0), id: NodeId(0),
pg_port: 0, pg_port: 0,
pg_tenant_only_port: None,
http_port: 0, http_port: 0,
sync: true, sync: true,
remote_storage: None, remote_storage: None,
@@ -160,14 +158,6 @@ impl Default for SafekeeperConf {
} }
} }
impl SafekeeperConf {
/// Compute is served by port on which only tenant scoped tokens allowed, if
/// it is configured.
pub fn get_compute_port(&self) -> u16 {
self.pg_tenant_only_port.unwrap_or(self.pg_port)
}
}
impl LocalEnv { impl LocalEnv {
pub fn pg_distrib_dir_raw(&self) -> PathBuf { pub fn pg_distrib_dir_raw(&self) -> PathBuf {
self.pg_distrib_dir.clone() self.pg_distrib_dir.clone()
@@ -374,7 +364,7 @@ impl LocalEnv {
// //
// Initialize a new Neon repository // Initialize a new Neon repository
// //
pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> { pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> {
// check if config already exists // check if config already exists
let base_path = &self.base_data_dir; let base_path = &self.base_data_dir;
ensure!( ensure!(
@@ -382,29 +372,11 @@ impl LocalEnv {
"repository base path is missing" "repository base path is missing"
); );
if base_path.exists() { ensure!(
if force { !base_path.exists(),
println!("removing all contents of '{}'", base_path.display()); "directory '{}' already exists. Perhaps already initialized?",
// instead of directly calling `remove_dir_all`, we keep the original dir but removing base_path.display()
// all contents inside. This helps if the developer symbol links another directory (i.e., );
// S3 local SSD) to the `.neon` base directory.
for entry in std::fs::read_dir(base_path)? {
let entry = entry?;
let path = entry.path();
if path.is_dir() {
fs::remove_dir_all(&path)?;
} else {
fs::remove_file(&path)?;
}
}
} else {
bail!(
"directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
base_path.display()
);
}
}
if !self.pg_bin_dir(pg_version)?.join("postgres").exists() { if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
bail!( bail!(
"Can't find postgres binary at {}", "Can't find postgres binary at {}",
@@ -420,9 +392,7 @@ impl LocalEnv {
} }
} }
if !base_path.exists() { fs::create_dir(base_path)?;
fs::create_dir(base_path)?;
}
// Generate keypair for JWT. // Generate keypair for JWT.
// //

View File

@@ -1,9 +1,3 @@
//! Code to manage pageservers
//!
//! In the local test environment, the pageserver stores its data directly in
//!
//! .neon/
//!
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::HashMap; use std::collections::HashMap;
use std::fs::File; use std::fs::File;
@@ -14,7 +8,9 @@ use std::process::{Child, Command};
use std::{io, result}; use std::{io, result};
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use pageserver_api::models::{self, TenantInfo, TimelineInfo}; use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
};
use postgres_backend::AuthType; use postgres_backend::AuthType;
use postgres_connection::{parse_host_port, PgConnectionConfig}; use postgres_connection::{parse_host_port, PgConnectionConfig};
use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::blocking::{Client, RequestBuilder, Response};
@@ -320,8 +316,8 @@ impl PageServerNode {
settings: HashMap<&str, &str>, settings: HashMap<&str, &str>,
) -> anyhow::Result<TenantId> { ) -> anyhow::Result<TenantId> {
let mut settings = settings.clone(); let mut settings = settings.clone();
let request = TenantCreateRequest {
let config = models::TenantConfig { new_tenant_id,
checkpoint_distance: settings checkpoint_distance: settings
.remove("checkpoint_distance") .remove("checkpoint_distance")
.map(|x| x.parse::<u64>()) .map(|x| x.parse::<u64>())
@@ -375,19 +371,6 @@ impl PageServerNode {
evictions_low_residence_duration_metric_threshold: settings evictions_low_residence_duration_metric_threshold: settings
.remove("evictions_low_residence_duration_metric_threshold") .remove("evictions_low_residence_duration_metric_threshold")
.map(|x| x.to_string()), .map(|x| x.to_string()),
gc_feedback: settings
.remove("gc_feedback")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
};
// If tenant ID was not specified, generate one
let new_tenant_id = new_tenant_id.unwrap_or(TenantId::generate());
let request = models::TenantCreateRequest {
new_tenant_id,
config,
}; };
if !settings.is_empty() { if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}") bail!("Unrecognized tenant settings: {settings:?}")
@@ -408,86 +391,67 @@ impl PageServerNode {
}) })
} }
pub fn tenant_config( pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> {
&self, self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
tenant_id: TenantId, .json(&TenantConfigRequest {
mut settings: HashMap<&str, &str>, tenant_id,
) -> anyhow::Result<()> {
let config = {
// Braces to make the diff easier to read
models::TenantConfig {
checkpoint_distance: settings checkpoint_distance: settings
.remove("checkpoint_distance") .get("checkpoint_distance")
.map(|x| x.parse::<u64>()) .map(|x| x.parse::<u64>())
.transpose() .transpose()
.context("Failed to parse 'checkpoint_distance' as an integer")?, .context("Failed to parse 'checkpoint_distance' as an integer")?,
checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()), checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()),
compaction_target_size: settings compaction_target_size: settings
.remove("compaction_target_size") .get("compaction_target_size")
.map(|x| x.parse::<u64>()) .map(|x| x.parse::<u64>())
.transpose() .transpose()
.context("Failed to parse 'compaction_target_size' as an integer")?, .context("Failed to parse 'compaction_target_size' as an integer")?,
compaction_period: settings.remove("compaction_period").map(|x| x.to_string()), compaction_period: settings.get("compaction_period").map(|x| x.to_string()),
compaction_threshold: settings compaction_threshold: settings
.remove("compaction_threshold") .get("compaction_threshold")
.map(|x| x.parse::<usize>()) .map(|x| x.parse::<usize>())
.transpose() .transpose()
.context("Failed to parse 'compaction_threshold' as an integer")?, .context("Failed to parse 'compaction_threshold' as an integer")?,
gc_horizon: settings gc_horizon: settings
.remove("gc_horizon") .get("gc_horizon")
.map(|x| x.parse::<u64>()) .map(|x| x.parse::<u64>())
.transpose() .transpose()
.context("Failed to parse 'gc_horizon' as an integer")?, .context("Failed to parse 'gc_horizon' as an integer")?,
gc_period: settings.remove("gc_period").map(|x| x.to_string()), gc_period: settings.get("gc_period").map(|x| x.to_string()),
image_creation_threshold: settings image_creation_threshold: settings
.remove("image_creation_threshold") .get("image_creation_threshold")
.map(|x| x.parse::<usize>()) .map(|x| x.parse::<usize>())
.transpose() .transpose()
.context("Failed to parse 'image_creation_threshold' as non zero integer")?, .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout") .get("walreceiver_connect_timeout")
.map(|x| x.to_string()),
lagging_wal_timeout: settings
.remove("lagging_wal_timeout")
.map(|x| x.to_string()), .map(|x| x.to_string()),
lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()),
max_lsn_wal_lag: settings max_lsn_wal_lag: settings
.remove("max_lsn_wal_lag") .get("max_lsn_wal_lag")
.map(|x| x.parse::<NonZeroU64>()) .map(|x| x.parse::<NonZeroU64>())
.transpose() .transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
trace_read_requests: settings trace_read_requests: settings
.remove("trace_read_requests") .get("trace_read_requests")
.map(|x| x.parse::<bool>()) .map(|x| x.parse::<bool>())
.transpose() .transpose()
.context("Failed to parse 'trace_read_requests' as bool")?, .context("Failed to parse 'trace_read_requests' as bool")?,
eviction_policy: settings eviction_policy: settings
.remove("eviction_policy") .get("eviction_policy")
.map(serde_json::from_str) .map(|x| serde_json::from_str(x))
.transpose() .transpose()
.context("Failed to parse 'eviction_policy' json")?, .context("Failed to parse 'eviction_policy' json")?,
min_resident_size_override: settings min_resident_size_override: settings
.remove("min_resident_size_override") .get("min_resident_size_override")
.map(|x| x.parse::<u64>()) .map(|x| x.parse::<u64>())
.transpose() .transpose()
.context("Failed to parse 'min_resident_size_override' as an integer")?, .context("Failed to parse 'min_resident_size_override' as an integer")?,
evictions_low_residence_duration_metric_threshold: settings evictions_low_residence_duration_metric_threshold: settings
.remove("evictions_low_residence_duration_metric_threshold") .get("evictions_low_residence_duration_metric_threshold")
.map(|x| x.to_string()), .map(|x| x.to_string()),
gc_feedback: settings })
.remove("gc_feedback")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
}
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
}
self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
.json(&models::TenantConfigRequest { tenant_id, config })
.send()? .send()?
.error_from_body()?; .error_from_body()?;
@@ -515,14 +479,11 @@ impl PageServerNode {
ancestor_timeline_id: Option<TimelineId>, ancestor_timeline_id: Option<TimelineId>,
pg_version: Option<u32>, pg_version: Option<u32>,
) -> anyhow::Result<TimelineInfo> { ) -> anyhow::Result<TimelineInfo> {
// If timeline ID was not specified, generate one
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
self.http_request( self.http_request(
Method::POST, Method::POST,
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
)? )?
.json(&models::TimelineCreateRequest { .json(&TimelineCreateRequest {
new_timeline_id, new_timeline_id,
ancestor_start_lsn, ancestor_start_lsn,
ancestor_timeline_id, ancestor_timeline_id,

View File

@@ -1,10 +1,3 @@
//! Code to manage safekeepers
//!
//! In the local test environment, the data for each safekeeper is stored in
//!
//! ```text
//! .neon/safekeepers/<safekeeper id>
//! ```
use std::io::Write; use std::io::Write;
use std::path::PathBuf; use std::path::PathBuf;
use std::process::Child; use std::process::Child;
@@ -120,55 +113,45 @@ impl SafekeeperNode {
let availability_zone = format!("sk-{}", id_string); let availability_zone = format!("sk-{}", id_string);
let mut args = vec![ let mut args = vec![
"-D".to_owned(), "-D",
datadir datadir.to_str().with_context(|| {
.to_str() format!("Datadir path {datadir:?} cannot be represented as a unicode string")
.with_context(|| { })?,
format!("Datadir path {datadir:?} cannot be represented as a unicode string") "--id",
})? &id_string,
.to_owned(), "--listen-pg",
"--id".to_owned(), &listen_pg,
id_string, "--listen-http",
"--listen-pg".to_owned(), &listen_http,
listen_pg, "--availability-zone",
"--listen-http".to_owned(), &availability_zone,
listen_http,
"--availability-zone".to_owned(),
availability_zone,
]; ];
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
}
if !self.conf.sync { if !self.conf.sync {
args.push("--no-sync".to_owned()); args.push("--no-sync");
} }
let broker_endpoint = format!("{}", self.env.broker.client_url()); let broker_endpoint = format!("{}", self.env.broker.client_url());
args.extend(["--broker-endpoint".to_owned(), broker_endpoint]); args.extend(["--broker-endpoint", &broker_endpoint]);
let mut backup_threads = String::new(); let mut backup_threads = String::new();
if let Some(threads) = self.conf.backup_threads { if let Some(threads) = self.conf.backup_threads {
backup_threads = threads.to_string(); backup_threads = threads.to_string();
args.extend(["--backup-threads".to_owned(), backup_threads]); args.extend(["--backup-threads", &backup_threads]);
} else { } else {
drop(backup_threads); drop(backup_threads);
} }
if let Some(ref remote_storage) = self.conf.remote_storage { if let Some(ref remote_storage) = self.conf.remote_storage {
args.extend(["--remote-storage".to_owned(), remote_storage.clone()]); args.extend(["--remote-storage", remote_storage]);
} }
let key_path = self.env.base_data_dir.join("auth_public_key.pem"); let key_path = self.env.base_data_dir.join("auth_public_key.pem");
if self.conf.auth_enabled { if self.conf.auth_enabled {
args.extend([ args.extend([
"--auth-validation-public-key-path".to_owned(), "--auth-validation-public-key-path",
key_path key_path.to_str().with_context(|| {
.to_str() format!("Key path {key_path:?} cannot be represented as a unicode string")
.with_context(|| { })?,
format!("Key path {key_path:?} cannot be represented as a unicode string")
})?
.to_owned(),
]); ]);
} }

View File

@@ -1,14 +1,6 @@
#!/bin/bash #!/bin/bash
set -eux set -eux
# Generate a random tenant or timeline ID
#
# Takes a variable name as argument. The result is stored in that variable.
generate_id() {
local -n resvar=$1
printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
}
PG_VERSION=${PG_VERSION:-14} PG_VERSION=${PG_VERSION:-14}
SPEC_FILE_ORG=/var/db/postgres/specs/spec.json SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
@@ -21,29 +13,29 @@ done
echo "Page server is ready." echo "Page server is ready."
echo "Create a tenant and timeline" echo "Create a tenant and timeline"
generate_id tenant_id
PARAMS=( PARAMS=(
-sb -sb
-X POST -X POST
-H "Content-Type: application/json" -H "Content-Type: application/json"
-d "{\"new_tenant_id\": \"${tenant_id}\"}" -d "{}"
http://pageserver:9898/v1/tenant/ http://pageserver:9898/v1/tenant/
) )
result=$(curl "${PARAMS[@]}") tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g')
echo $result | jq .
generate_id timeline_id
PARAMS=( PARAMS=(
-sb -sb
-X POST -X POST
-H "Content-Type: application/json" -H "Content-Type: application/json"
-d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" -d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}"
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
) )
result=$(curl "${PARAMS[@]}") result=$(curl "${PARAMS[@]}")
echo $result | jq . echo $result | jq .
echo "Overwrite tenant id and timeline id in spec file" echo "Overwrite tenant id and timeline id in spec file"
tenant_id=$(echo ${result} | jq -r .tenant_id)
timeline_id=$(echo ${result} | jq -r .timeline_id)
sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE} sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE}
sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}

View File

@@ -189,7 +189,7 @@ services:
- "/bin/bash" - "/bin/bash"
- "-c" - "-c"
command: command:
- "until pg_isready -h compute -p 55433 -U cloud_admin ; do - "until pg_isready -h compute -p 55433 ; do
echo 'Waiting to start compute...' && sleep 1; echo 'Waiting to start compute...' && sleep 1;
done" done"
depends_on: depends_on:

View File

@@ -48,7 +48,6 @@ Creating docker-compose_storage_broker_1 ... done
2. connect compute node 2. connect compute node
``` ```
$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass $ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
$ chmod 600 ~/.pgpass
$ psql -h localhost -p 55433 -U cloud_admin $ psql -h localhost -p 55433 -U cloud_admin
postgres=# CREATE TABLE t(key int primary key, value text); postgres=# CREATE TABLE t(key int primary key, value text);
CREATE TABLE CREATE TABLE

View File

@@ -4,11 +4,6 @@ The pageserver uses Tokio for handling concurrency. Everything runs in
Tokio tasks, although some parts are written in blocking style and use Tokio tasks, although some parts are written in blocking style and use
spawn_blocking(). spawn_blocking().
We currently use std blocking functions for disk I/O, however. The
current model is that we consider disk I/Os to be short enough that we
perform them while running in a Tokio task. Changing all the disk I/O
calls to async is a TODO.
Each Tokio task is tracked by the `task_mgr` module. It maintains a Each Tokio task is tracked by the `task_mgr` module. It maintains a
registry of tasks, and which tenant or timeline they are operating registry of tasks, and which tenant or timeline they are operating
on. on.
@@ -26,84 +21,19 @@ also a `shudown_watcher()` Future that can be used with `tokio::select!`
or similar, to wake up on shutdown. or similar, to wake up on shutdown.
### Async cancellation safety ### Sync vs async
In async Rust, futures can be "cancelled" at any await point, by We use async to wait for incoming data on network connections, and to
dropping the Future. For example, `tokio::select!` returns as soon as perform other long-running operations. For example, each WAL receiver
one of the Futures returns, and drops the others. `tokio::time::timeout` connection is handled by a tokio Task. Once a piece of WAL has been
is another example. In the Rust ecosystem, some functions are received from the network, the task calls the blocking functions in
cancellation-safe, meaning they can be safely dropped without the Repository to process the WAL.
side-effects, while others are not. See documentation of
`tokio::select!` for examples.
In the pageserver and safekeeper, async code is *not* The core storage code in `layered_repository/` is synchronous, with
cancellation-safe by default. Unless otherwise marked, any async blocking locks and I/O calls. The current model is that we consider
function that you call cannot be assumed to be async disk I/Os to be short enough that we perform them while running in a
cancellation-safe, and must be polled to completion. Tokio task. If that becomes a problem, we should use `spawn_blocking`
before entering the synchronous parts of the code, or switch to using
tokio I/O functions.
The downside of non-cancellation safe code is that you have to be very Be very careful when mixing sync and async code!
careful when using `tokio::select!`, `tokio::time::timeout`, and other
such functions that can cause a Future to be dropped. They can only be
used with functions that are explicitly documented to be cancellation-safe,
or you need to spawn a separate task to shield from the cancellation.
At the entry points to the code, we also take care to poll futures to
completion, or shield the rest of the code from surprise cancellations
by spawning a separate task. The code that handles incoming HTTP
requests, for example, spawns a separate task for each request,
because Hyper will drop the request-handling Future if the HTTP
connection is lost.
#### How to cancel, then?
If our code is not cancellation-safe, how do you cancel long-running
tasks? Use CancellationTokens.
TODO: More details on that. And we have an ongoing discussion on what
to do if cancellations might come from multiple sources.
#### Exceptions
Some library functions are cancellation-safe, and are explicitly marked
as such. For example, `utils::seqwait`.
#### Rationale
The alternative would be to make all async code cancellation-safe,
unless otherwise marked. That way, you could use `tokio::select!` more
liberally. The reasons we didn't choose that are explained in this
section.
Writing code in a cancellation-safe manner is tedious, as you need to
scrutinize every `.await` and ensure that if the `.await` call never
returns, the system is in a safe, consistent state. In some ways, you
need to do that with `?` and early `returns`, too, but `.await`s are
easier to miss. It is also easier to perform cleanup tasks when a
function returns an `Err` than when an `.await` simply never
returns. You can use `scopeguard` and Drop guards to perform cleanup
tasks, but it is more tedious. An `.await` that never returns is more
similar to a panic.
Note that even if you only use building blocks that themselves are
cancellation-safe, it doesn't mean that the code as whole is
cancellation-safe. For example, consider the following code:
```
while let Some(i) = work_inbox.recv().await {
if let Err(_) = results_outbox.send(i).await {
println!("receiver dropped");
return;
}
}
}
```
It reads messages from one channel, sends them to another channel. If
this code is cancelled at the `results_outbox.send(i).await`, the
message read from the receiver is lost. That may or may not be OK,
depending on the context.
Another reason to not require cancellation-safety is historical: we
already had a lot of async code that was not scrutinized for
cancellation-safety when this issue was raised. Scrutinizing all
existing code is no fun.

View File

@@ -1,232 +0,0 @@
# The state of pageserver tenant relocation
Created on 17.03.23
## Motivation
There were previous write-ups on the subject. The design of tenant relocation was planned at a time when we had a quite different landscape. For example, there was no on-demand download/eviction. They were on the horizon, but we still planned for cases when they were not available. Some other things have changed since then: safekeepers now offload WAL to S3, so we're not risking overflowing their disks. Given all of the above, it makes sense to recap and take a look at the options we have now, which adjustments we'd like to make to the original process, etc.
Related (in chronological order):
- Tracking issue with initial discussion: [#886](https://github.com/neondatabase/neon/issues/886)
- [015. Storage Messaging](015-storage-messaging.md)
- [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md)
## Summary
The RFC consists of a walkthrough of prior art on tenant relocation and corresponding problems. It describes 3 approaches.
1. Simplistic approach that uses ignore and is the fastest to implement. The main downside is a requirement of short downtime.
2. More complicated approach that avoids even short downtime.
3. Even more complicated approach that will allow multiple pageservers to operate concurrently on the same tenant possibly allowing for HA cluster topologies and horizontal scaling of reads (i e compute talks to multiple pageservers).
The order in which solutions are described is a bit different. We start from 2, then move to possible compromises (aka simplistic approach) and then move to discussing directions for solving HA/Pageserver replica case with 3.
## Components
pageserver, control-plane, safekeepers (a bit)
## Requirements
Relocation procedure should move tenant from one pageserver to another without downtime introduced by storage side. For now restarting compute for applying new configuration is fine.
- component restarts
- component outage
- pageserver loss
## The original proposed implementation
The starting point is this sequence:
```mermaid
sequenceDiagram
autonumber
participant CP as Control Plane
participant PS1 as Pageserver 1
participant PS2 as Pageserver 2
participant S3
CP->>PS2: Attach tenant X
PS2->>S3: Fetch timelines, indexes for them
PS2->>CP: Accepted
CP->>CP: Change pageserver id in project
CP->>PS1: Detach
```
Which problems do we have with naive approach?
### Concurrent GC and Compaction
The problem is that they can run on both, PS1 and PS2. Consider this example from [Pageserver S3 Coordination RFC](020-pageserver-s3-coordination.md)
```mermaid
sequenceDiagram
autonumber
participant PS1
participant S3
participant PS2
PS1->>S3: Uploads L1, L2 <br/> Index contains L1 L2
PS2->>S3: Attach called, sees L1, L2
PS1->>S3: Compaction comes <br/> Removes L1, adds L3
note over S3: Index now L2, L3
PS2->>S3: Uploads new layer L4 <br/> (added to previous view of the index)
note over S3: Index now L1, L2, L4
```
At this point it is not possible to restore the state from the index: it contains L2, which
is no longer available in S3, and doesn't contain L3, which was added by compaction on the
first pageserver. So if either of the pageservers restarts, initial sync will fail
(or, in the on-demand world, it will fail a bit later, during a page request for the
missing layer).
The problem lies in shared index_part.json. Having intersecting layers from append only edits is expected to work, though this is an uncharted territory without tests.
#### Options
There are several options on how to restrict concurrent access to index file.
First and the simplest one is external orchestration. Control plane which runs migration can use special api call on pageserver to stop background processes (gc, compaction), and even possibly all uploads.
So the sequence becomes:
```mermaid
sequenceDiagram
autonumber
participant CP as Control Plane
participant PS1 as Pageserver 1
participant PS2 as Pageserver 2
participant S3
CP->>PS1: Pause background jobs, pause uploading new layers.
CP->>PS2: Attach tenant X.
PS2->>S3: Fetch timelines, index, start background operations
PS2->>CP: Accepted
CP->>CP: Monitor PS2 last record lsn, ensure OK lag
CP->>CP: Change pageserver id in project
CP->>PS1: Detach
```
The downside of this sequence is the potential rollback process. What if something goes wrong on new pageserver? Can we safely roll back to source pageserver?
There are two questions:
#### How can we detect that something went wrong?
We can run usual availability check (consists of compute startup and an update of one row).
Note that we can't run a separate compute for that before touching the compute that the client runs the actual workload on, because we can't have two simultaneous computes running in read-write mode on the same timeline (this is enforced by the safekeepers' consensus algorithm). So we can run some readonly check first (basebackup), then change the pageserver id and run the availability check. If it failed, we can roll back to the old pageserver.
#### What can go wrong? And how we can safely roll-back?
In the sequence above during attach we start background processes/uploads. They change state in remote storage so it is possible that after rollback remote state will be different from one that was observed by source pageserver. So if target pageserver goes wild then source pageserver may fail to start with changed remote state.
Proposed option would be to implement a barrier (read-only) mode when pageserver does not update remote state.
So the sequence for happy path becomes this one:
```mermaid
sequenceDiagram
autonumber
participant CP as Control Plane
participant PS1 as Pageserver 1
participant PS2 as Pageserver 2
participant S3
CP->>PS1: Pause background jobs, pause uploading new layers.
CP->>PS2: Attach tenant X in remote readonly mode.
PS2->>S3: Fetch timelines, index
PS2->>CP: Accepted
CP->>CP: Monitor PS2 last record lsn, ensure OK lag
CP->>CP: Change pageserver id in project
CP->>CP: Run successful availability check
CP->>PS2: Start uploads, background tasks
CP->>PS1: Detach
```
With this sequence we restrict any changes to remote storage to one pageserver. So there is no concurrent access at all, not only for index_part.json, but for everything else too. This approach makes it possible to roll back after failure on new pageserver.
The sequence with roll back process:
```mermaid
sequenceDiagram
autonumber
participant CP as Control Plane
participant PS1 as Pageserver 1
participant PS2 as Pageserver 2
participant S3
CP->>PS1: Pause background jobs, pause uploading new layers.
CP->>PS2: Attach tenant X in remote readonly mode.
PS2->>S3: Fetch timelines, index
PS2->>CP: Accepted
CP->>CP: Monitor PS2 last record lsn, ensure OK lag
CP->>CP: Change pageserver id in project
CP->>CP: Availability check Failed
CP->>CP: Change pageserver id back
CP->>PS1: Resume remote operations
CP->>PS2: Ignore (instead of detach for investigation purposes)
```
## Concurrent branch creation
Another problem is a possibility of concurrent branch creation calls.
I.e., during migration, create_branch can be called on the old pageserver, and the newly created branch won't be seen on the new pageserver. Prior art includes prototyping an approach that tries to mirror such branches, but it has since lost its importance: attach is now fast because we don't need to download all the data, and additionally, to the best of my knowledge of control plane internals (cc @ololobus to confirm), operations on one project are executed sequentially, so such a case is not possible. So a branch create operation will only be executed once relocation is completed. As a safety measure we can forbid branch creation for tenants that are in the readonly remote state.
## Simplistic approach
The difference of simplistic approach from one described above is that it calls ignore on source tenant first and then calls attach on target pageserver. Approach above does it in opposite order thus opening a possibility for race conditions we strive to avoid.
The approach largely follows this guide: <https://github.com/neondatabase/cloud/wiki/Cloud:-Ad-hoc-tenant-relocation>
The happy path sequence:
```mermaid
sequenceDiagram
autonumber
participant CP as Control Plane
participant PS1 as Pageserver 1
participant PS2 as Pageserver 2
participant SK as Safekeeper
participant S3
CP->>CP: Enable maintenance mode
CP->>PS1: Ignore
CP->>PS2: Attach
PS2->>CP: Accepted
loop Delete layers for each timeline
CP->>PS2: Get last record lsn
CP->>SK: Get commit lsn
CP->>CP: OK? Timed out?
end
CP->>CP: Change pageserver id in project
CP->>CP: Run successful availability check
CP->>CP: Disable maintenance mode
CP->>PS1: Detach ignored
```
The sequence contains exactly the same rollback problems as in previous approach described above. They can be resolved the same way.
Most probably we'd like to move forward without this safety measure and implement it on top of this approach to make progress towards the downtime-less one.
## Lease based approach
In order to allow for concurrent operation on the same data on remote storage for multiple pageservers we need to go further than external orchestration.
NOTE: [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) discusses one more approach that relies on duplication of index_part.json for each pageserver operating on the timeline. This approach still requires external coordination which makes certain things easier but requires additional bookkeeping to account for multiple index_part.json files. Discussion/comparison with proposed lease based approach
The problems are outlined in [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) and suggested solution includes [Coordination based approach](020-pageserver-s3-coordination.md#coordination-based-approach). This way it will allow to do basic leader election for pageservers so they can decide which node will be responsible for running GC and compaction. The process is based on extensive communication via storage broker and consists of a lease that is taken by one of the pageservers that extends it to continue serving a leader role.
There are two options for ingesting new data into pageserver in follower role. One option is to avoid WAL ingestion at all and rely on notifications from leader to discover new layers on s3. Main downside of this approach is that follower will always lag behind the primary node because it wont have the last layer until it is uploaded to remote storage. In case of a primary failure follower will be required to reingest last segment (up to 256Mb of WAL currently) which slows down recovery. Additionally if compute is connected to follower pageserver it will observe latest data with a delay. Queries from compute will likely experience bigger delays when recent lsn is required.
The second option is to consume WAL stream on both pageservers. In this case the only problem is non deterministic layer generation. Additional bookkeeping will be required to deduplicate layers from primary with local ones. Some process needs to somehow merge them to remove duplicated data. Additionally we need to have good testing coverage to ensure that our implementation of `get_page@lsn` properly handles intersecting layers.
There is another tradeoff. Approaches may be different in amount of traffic between system components. With first approach there can be increased traffic between follower and remote storage. But only in case follower has some activity that actually requests pages (!). With other approach traffic increase will be permanent and will be caused by two WAL streams instead of one.
## Conclusion
Proposed implementation strategy:
Go with the simplest approach for now. Then work on tech debt, increase test coverage. Then gradually move forward to second approach by implementing safety measures first, finishing with switch of order between ignore and attach operation.
And only then go to lease based approach to solve HA/Pageserver replica use cases.

View File

@@ -1,84 +0,0 @@
# Postgres user and database management
(This supersedes the previous proposal that looked too complicated and desynchronization-prone)
We've accumulated a bunch of problems with our approach to role and database management, namely:
1. we don't allow role and database creation from Postgres, and users are complaining about that
2. fine-grained role management is not possible both from Postgres and console
Right now, we do store users and databases both in console and Postgres, and there are two main reasons for
that:
* we want to be able to authenticate users in proxy against the console without Postgres' involvement. Otherwise,
malicious brute force attempts will wake up Postgres (expensive) and may exhaust the Postgres connections limit (deny of service).
* it is handy when we can render console UI without waking up compute (e.g., show database list)
This RFC doesn't talk about giving root access to the database, which is blocked by a secure runtime setup.
## Overview
* Add a Postgres extension that sends an HTTP request each time a transaction that modifies users/databases is about to commit.
* Add a user management API to the internal console API. Also, the console should put a JWT token into the compute so that it can access the management API.
## Postgres behavior
The default user role (@username) should have `CREATE ROLE`, `CREATE DB`, and `BYPASSRLS` privileges. We expose the Postgres port
to the open internet, so we need to check password strength. Now console generates strong passwords, so there is no risk of having dumb passwords. With user-provided passwords, such risks exist.
Since we store passwords in the console we should also send unencrypted password when role is created/changed. Hence communication with the console must be encrypted. Postgres also supports creating roles using hashes, in that case, we will not be able to get a raw password. So I can see the following options here:
* roles created via SQL will *not* have raw passwords in the console
* roles created via SQL will have raw passwords in the console, except ones that were created using hashes
I'm leaning towards the second option here as it is a bit more consistent one -- if raw password storage is enabled then we store passwords in all cases where we can store them.
To send data about roles and databases from Postgres to the console we can create the following Postgres extension:
* Intercept role/database changes in `ProcessUtility_hook`. Here we have access to the query statement with the raw password. The hook handler itself should not dial the console immediately and rather stash info in some hashmap for later use.
* When the transaction is about to commit we execute collected role modifications (all as one -- console should either accept all or reject all, and hence API shouldn't be REST-like). If the console request fails we can roll back the transaction. This way if the transaction is committed we know for sure that console has this information. We can use `XACT_EVENT_PRE_COMMIT` and `XACT_EVENT_PARALLEL_PRE_COMMIT` for that.
* Extension should be mindful of the fact that it is possible to create and delete roles within the transaction.
* We also need to track who is database owner, some coding around may be needed to get the current user when the database is created.
## Console user management API
The current public API has REST API for role management. We need to have some analog for the internal API (called mgmt API in the console code). But unlike public API here we want to have an atomic way to create several roles/databases (in cases when several roles were created in the same transaction). So something like that may work:
```
curl -X PATCH /api/v1/roles_and_databases -d '
[
{"op":"create", "type":"role", "name": "kurt", "password":"lYgT3BlbkFJ2vBZrqv"},
{"op":"drop", "type":"role", "name": "trout"},
{"op":"alter", "type":"role", "name": "kilgore", "password":"3BlbkFJ2vB"},
{"op":"create", "type":"database", "name": "db2", "owner": "eliot"},
]
'
```
Makes sense not to error out on duplicated create/delete operations (see failure modes)
## Managing users from the console
Currently, the console puts a spec file with the list of databases/roles and delta operations in all the compute pods. `compute_ctl` then picks up that file, stubbornly executes the deltas, and checks that the data in the spec file is the same as in Postgres. This way, if the user creates a role in the UI, we restart compute with a new spec file, and the databases/roles are created during startup. So if Postgres sends an HTTP call each time a role is created, we need to break the recursion in that case. We can do that based on application_name, some GUC, or the user (local == no HTTP hook).
Generally, we have several options when we are creating users via console:
1. restart compute with a new spec file, execute local SQL command; cut recursion in the extension
2. "push" spec files into running compute, execute local SQL command; cut recursion in the extension
3. "push" spec files into running compute, execute local SQL command; let extension create those roles in the console
4. avoid managing roles via spec files, send SQL commands to compute; let extension create those roles in the console
The last option is the most straightforward one, but with the raw password storage opt-out, we will not have the password to establish an SQL connection. Also, we need a spec for provisioning purposes and to address potential desync (but that is quite unlikely). So I think the easiest approach would be:
1. keep role management like it is now and cut the recursion in the extension when SQL is executed by compute_ctl
2. add "push" endpoint to the compute_ctl to avoid compute restart during the `apply_config` operation -- that can be done as a follow up to avoid increasing scope too much
## Failure modes
* during role creation via SQL role was created in the console but the connection was dropped before Postgres got acknowledgment or some error happened after acknowledgment (out of disk space, deadlock, etc):
in that case, Postgres won't have a role that exists in the console. Compute restart will heal it (due to the spec file). Also if the console allows repeated creation/deletion user can repeat the transaction.
# Scalability
On my laptop, I can create 4200 roles per second. That corresponds to 363 million roles per day. Since each role creation ends up in the console database we can add some limit to the number of roles (could be reasonably big to not run into it often -- like 1k or 10k).

View File

@@ -1,22 +0,0 @@
# Useful development tools
This readme contains some hints on how to set up some optional development tools.
## ccls
[ccls](https://github.com/MaskRay/ccls) is a c/c++ language server. It requires some setup
to work well. There are different ways to do it but here's what works for me:
1. Make a common parent directory for all your common neon projects. (for example, `~/src/neondatabase/`)
2. Go to `vendor/postgres-v15`
3. Run `make clean && ./configure`
4. Install [bear](https://github.com/rizsotto/Bear), and run `bear -- make -j4`
5. Copy the generated `compile_commands.json` to `~/src/neondatabase` (or equivalent)
6. Run `touch ~/src/neondatabase/.ccls-root` this will make the `compile_commands.json` file discoverable in all subdirectories
With this setup you will get decent lsp mileage inside the postgres repo, and also any postgres extensions that you put in `~/src/neondatabase/`, like `pg_embedding`, or inside `~/src/neondatabase/neon/pgxn` as well.
Some additional tips for various IDEs:
### Emacs
To improve performance: `(setq lsp-lens-enable nil)`

View File

@@ -5,13 +5,13 @@ use serde::{Deserialize, Serialize, Serializer};
use crate::spec::ComputeSpec; use crate::spec::ComputeSpec;
#[derive(Serialize, Debug, Deserialize)] #[derive(Serialize, Debug)]
pub struct GenericAPIError { pub struct GenericAPIError {
pub error: String, pub error: String,
} }
/// Response of the /status API /// Response of the /status API
#[derive(Serialize, Debug, Deserialize)] #[derive(Serialize, Debug)]
#[serde(rename_all = "snake_case")] #[serde(rename_all = "snake_case")]
pub struct ComputeStatusResponse { pub struct ComputeStatusResponse {
pub start_time: DateTime<Utc>, pub start_time: DateTime<Utc>,
@@ -23,7 +23,7 @@ pub struct ComputeStatusResponse {
pub error: Option<String>, pub error: Option<String>,
} }
#[derive(Deserialize, Serialize)] #[derive(Serialize)]
#[serde(rename_all = "snake_case")] #[serde(rename_all = "snake_case")]
pub struct ComputeState { pub struct ComputeState {
pub status: ComputeStatus, pub status: ComputeStatus,
@@ -33,7 +33,7 @@ pub struct ComputeState {
pub error: Option<String>, pub error: Option<String>,
} }
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)] #[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)]
#[serde(rename_all = "snake_case")] #[serde(rename_all = "snake_case")]
pub enum ComputeStatus { pub enum ComputeStatus {
// Spec wasn't provided at start, waiting for it to be // Spec wasn't provided at start, waiting for it to be
@@ -70,10 +70,7 @@ where
pub struct ComputeMetrics { pub struct ComputeMetrics {
pub wait_for_spec_ms: u64, pub wait_for_spec_ms: u64,
pub sync_safekeepers_ms: u64, pub sync_safekeepers_ms: u64,
pub sync_sk_check_ms: u64,
pub basebackup_ms: u64, pub basebackup_ms: u64,
pub basebackup_bytes: u64,
pub start_postgres_ms: u64,
pub config_ms: u64, pub config_ms: u64,
pub total_startup_ms: u64, pub total_startup_ms: u64,
} }

View File

@@ -5,7 +5,6 @@
//! and connect it to the storage nodes. //! and connect it to the storage nodes.
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr}; use serde_with::{serde_as, DisplayFromStr};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn; use utils::lsn::Lsn;
/// String type alias representing Postgres identifier and /// String type alias representing Postgres identifier and
@@ -15,7 +14,7 @@ pub type PgIdent = String;
/// Cluster spec or configuration represented as an optional number of /// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description. /// delta operations + final cluster state description.
#[serde_as] #[serde_as]
#[derive(Clone, Debug, Default, Deserialize, Serialize)] #[derive(Clone, Debug, Default, Deserialize)]
pub struct ComputeSpec { pub struct ComputeSpec {
pub format_version: f32, pub format_version: f32,
@@ -27,38 +26,9 @@ pub struct ComputeSpec {
pub cluster: Cluster, pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>, pub delta_operations: Option<Vec<DeltaOp>>,
/// An optinal hint that can be passed to speed up startup time if we know
/// that no pg catalog mutations (like role creation, database creation,
/// extension creation) need to be done on the actual database to start.
#[serde(default)] // Default false
pub skip_pg_catalog_updates: bool,
// Information needed to connect to the storage layer.
//
// `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed.
//
// Depending on `mode`, this can be a primary read-write node, a read-only
// replica, or a read-only node pinned at an older LSN.
// `safekeeper_connstrings` must be set for a primary.
//
// For backwards compatibility, the control plane may leave out all of
// these, and instead set the "neon.tenant_id", "neon.timeline_id",
// etc. GUCs in cluster.settings. TODO: Once the control plane has been
// updated to fill these fields, we can make these non optional.
#[serde_as(as = "Option<DisplayFromStr>")]
pub tenant_id: Option<TenantId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub timeline_id: Option<TimelineId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub pageserver_connstring: Option<String>,
#[serde(default)]
pub safekeeper_connstrings: Vec<String>,
#[serde(default)] #[serde(default)]
pub mode: ComputeMode, pub mode: ComputeMode,
/// If set, 'storage_auth_token' is used as the password to authenticate to
/// the pageserver and safekeepers.
pub storage_auth_token: Option<String>, pub storage_auth_token: Option<String>,
} }
@@ -77,19 +47,13 @@ pub enum ComputeMode {
Replica, Replica,
} }
#[derive(Clone, Debug, Default, Deserialize, Serialize)] #[derive(Clone, Debug, Default, Deserialize)]
pub struct Cluster { pub struct Cluster {
pub cluster_id: Option<String>, pub cluster_id: String,
pub name: Option<String>, pub name: String,
pub state: Option<String>, pub state: Option<String>,
pub roles: Vec<Role>, pub roles: Vec<Role>,
pub databases: Vec<Database>, pub databases: Vec<Database>,
/// Desired contents of 'postgresql.conf' file. (The 'compute_ctl'
/// tool may add additional settings to the final file.)
pub postgresql_conf: Option<String>,
/// Additional settings that will be appended to the 'postgresql.conf' file.
pub settings: GenericOptions, pub settings: GenericOptions,
} }
@@ -99,7 +63,7 @@ pub struct Cluster {
/// - DROP ROLE /// - DROP ROLE
/// - ALTER ROLE name RENAME TO new_name /// - ALTER ROLE name RENAME TO new_name
/// - ALTER DATABASE name RENAME TO new_name /// - ALTER DATABASE name RENAME TO new_name
#[derive(Clone, Debug, Deserialize, Serialize)] #[derive(Clone, Debug, Deserialize)]
pub struct DeltaOp { pub struct DeltaOp {
pub action: String, pub action: String,
pub name: PgIdent, pub name: PgIdent,
@@ -108,7 +72,7 @@ pub struct DeltaOp {
/// Rust representation of Postgres role info with only those fields /// Rust representation of Postgres role info with only those fields
/// that matter for us. /// that matter for us.
#[derive(Clone, Debug, Deserialize, Serialize)] #[derive(Clone, Debug, Deserialize)]
pub struct Role { pub struct Role {
pub name: PgIdent, pub name: PgIdent,
pub encrypted_password: Option<String>, pub encrypted_password: Option<String>,
@@ -117,7 +81,7 @@ pub struct Role {
/// Rust representation of Postgres database info with only those fields /// Rust representation of Postgres database info with only those fields
/// that matter for us. /// that matter for us.
#[derive(Clone, Debug, Deserialize, Serialize)] #[derive(Clone, Debug, Deserialize)]
pub struct Database { pub struct Database {
pub name: PgIdent, pub name: PgIdent,
pub owner: PgIdent, pub owner: PgIdent,
@@ -127,7 +91,7 @@ pub struct Database {
/// Common type representing both SQL statement params with or without value, /// Common type representing both SQL statement params with or without value,
/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
/// options like `wal_level = logical`. /// options like `wal_level = logical`.
#[derive(Clone, Debug, Deserialize, Serialize)] #[derive(Clone, Debug, Deserialize)]
pub struct GenericOption { pub struct GenericOption {
pub name: String, pub name: String,
pub value: Option<String>, pub value: Option<String>,
@@ -148,14 +112,4 @@ mod tests {
let file = File::open("tests/cluster_spec.json").unwrap(); let file = File::open("tests/cluster_spec.json").unwrap();
let _spec: ComputeSpec = serde_json::from_reader(file).unwrap(); let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
} }
#[test]
fn parse_unknown_fields() {
// Forward compatibility test
let file = File::open("tests/cluster_spec.json").unwrap();
let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
let ob = json.as_object_mut().unwrap();
ob.insert("unknown_field_123123123".into(), "hello".into());
let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
}
} }

View File

@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
use rand::Rng; use rand::Rng;
use serde::Serialize; use serde::Serialize;
#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)] #[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
#[serde(tag = "type")] #[serde(tag = "type")]
pub enum EventType { pub enum EventType {
#[serde(rename = "absolute")] #[serde(rename = "absolute")]
@@ -17,32 +17,6 @@ pub enum EventType {
}, },
} }
impl EventType {
pub fn absolute_time(&self) -> Option<&DateTime<Utc>> {
use EventType::*;
match self {
Absolute { time } => Some(time),
_ => None,
}
}
pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
// these can most likely be thought of as Range or RangeFull
use EventType::*;
match self {
Incremental {
start_time,
stop_time,
} => Some(start_time..stop_time),
_ => None,
}
}
pub fn is_incremental(&self) -> bool {
matches!(self, EventType::Incremental { .. })
}
}
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] #[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct Event<Extra> { pub struct Event<Extra> {
#[serde(flatten)] #[serde(flatten)]

View File

@@ -6,7 +6,6 @@ use once_cell::sync::Lazy;
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec}; use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
pub use prometheus::opts; pub use prometheus::opts;
pub use prometheus::register; pub use prometheus::register;
pub use prometheus::Error;
pub use prometheus::{core, default_registry, proto}; pub use prometheus::{core, default_registry, proto};
pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{exponential_buckets, linear_buckets};
pub use prometheus::{register_counter_vec, Counter, CounterVec}; pub use prometheus::{register_counter_vec, Counter, CounterVec};
@@ -24,7 +23,6 @@ use prometheus::{Registry, Result};
pub mod launch_timestamp; pub mod launch_timestamp;
mod wrappers; mod wrappers;
pub use wrappers::{CountedReader, CountedWriter}; pub use wrappers::{CountedReader, CountedWriter};
pub mod metric_vec_duration;
pub type UIntGauge = GenericGauge<AtomicU64>; pub type UIntGauge = GenericGauge<AtomicU64>;
pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>; pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;

View File

@@ -1,23 +0,0 @@
//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
use std::{future::Future, time::Instant};
pub trait DurationResultObserver {
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
}
pub async fn observe_async_block_duration_by_result<
T,
E,
F: Future<Output = Result<T, E>>,
O: DurationResultObserver,
>(
observer: &O,
block: F,
) -> Result<T, E> {
let start = Instant::now();
let result = block.await;
let duration = start.elapsed();
observer.observe_result(&result, duration);
result
}

View File

@@ -9,7 +9,6 @@ use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr}; use serde_with::{serde_as, DisplayFromStr};
use strum_macros; use strum_macros;
use utils::{ use utils::{
completion,
history_buffer::HistoryBufferWithDropCounter, history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId}, id::{NodeId, TenantId, TimelineId},
lsn::Lsn, lsn::Lsn,
@@ -19,29 +18,7 @@ use crate::reltag::RelTag;
use anyhow::bail; use anyhow::bail;
use bytes::{BufMut, Bytes, BytesMut}; use bytes::{BufMut, Bytes, BytesMut};
/// The state of a tenant in this pageserver. /// A state of a tenant in pageserver's memory.
///
/// ```mermaid
/// stateDiagram-v2
///
/// [*] --> Loading: spawn_load()
/// [*] --> Attaching: spawn_attach()
///
/// Loading --> Activating: activate()
/// Attaching --> Activating: activate()
/// Activating --> Active: infallible
///
/// Loading --> Broken: load() failure
/// Attaching --> Broken: attach() failure
///
/// Active --> Stopping: set_stopping(), part of shutdown & detach
/// Stopping --> Broken: late error in remove_tenant_from_memory
///
/// Broken --> [*]: ignore / detach / shutdown
/// Stopping --> [*]: remove_from_memory complete
///
/// Active --> Broken: cfg(testing)-only tenant break point
/// ```
#[derive( #[derive(
Clone, Clone,
PartialEq, PartialEq,
@@ -49,82 +26,55 @@ use bytes::{BufMut, Bytes, BytesMut};
serde::Serialize, serde::Serialize,
serde::Deserialize, serde::Deserialize,
strum_macros::Display, strum_macros::Display,
strum_macros::EnumString,
strum_macros::EnumVariantNames, strum_macros::EnumVariantNames,
strum_macros::AsRefStr, strum_macros::AsRefStr,
strum_macros::IntoStaticStr, strum_macros::IntoStaticStr,
)] )]
#[serde(tag = "slug", content = "data")] #[serde(tag = "slug", content = "data")]
pub enum TenantState { pub enum TenantState {
/// This tenant is being loaded from local disk. /// This tenant is being loaded from local disk
///
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
Loading, Loading,
/// This tenant is being attached to the pageserver. /// This tenant is being downloaded from cloud storage.
///
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
Attaching, Attaching,
/// The tenant is transitioning from Loading/Attaching to Active. /// Tenant is fully operational
///
/// While in this state, the individual timelines are being activated.
///
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
Activating(ActivatingFrom),
/// The tenant has finished activating and is open for business.
///
/// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
Active, Active,
/// The tenant is recognized by pageserver, but it is being detached or the /// A tenant is recognized by pageserver, but it is being detached or the
/// system is being shut down. /// system is being shut down.
/// Stopping,
/// Transitions out of this state are possible through `set_broken()`. /// A tenant is recognized by the pageserver, but can no longer be used for
Stopping { /// any operations, because it failed to be activated.
// Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
// otherwise it will not be skipped during deserialization
#[serde(skip)]
progress: completion::Barrier,
},
/// The tenant is recognized by the pageserver, but can no longer be used for
/// any operations.
///
/// If the tenant fails to load or attach, it will transition to this state
/// and it is guaranteed that no background tasks are running in its name.
///
/// The other way to transition into this state is from `Stopping` state
/// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
/// if the cleanup future executed by `remove_tenant_from_memory()` fails.
Broken { reason: String, backtrace: String }, Broken { reason: String, backtrace: String },
} }
impl TenantState { impl TenantState {
pub fn attachment_status(&self) -> TenantAttachmentStatus { pub fn attachment_status(&self) -> TenantAttachmentStatus {
use TenantAttachmentStatus::*; use TenantAttachmentStatus::*;
// Below TenantState::Activating is used as "transient" or "transparent" state for
// attachment_status determining.
match self { match self {
// The attach procedure writes the marker file before adding the Attaching tenant to the tenants map. // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
// So, technically, we can return Attached here. // So, technically, we can return Attached here.
// However, as soon as Console observes Attached, it will proceed with the Postgres-level health check. // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
// But, our attach task might still be fetching the remote timelines, etc. // But, our attach task might still be fetching the remote timelines, etc.
// So, return `Maybe` while Attaching, making Console wait for the attach task to finish. // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe, Self::Attaching => Maybe,
// tenant mgr startup distinguishes attaching from loading via marker file. // tenant mgr startup distinguishes attaching from loading via marker file.
// If it's loading, there is no attach marker file, i.e., attach had finished in the past. // If it's loading, there is no attach marker file, i.e., attach had finished in the past.
Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached, Self::Loading => Attached,
// We only reach Active after successful load / attach. // We only reach Active after successful load / attach.
// So, call atttachment status Attached. // So, call atttachment status Attached.
Self::Active => Attached, Self::Active => Attached,
// If the (initial or resumed) attach procedure fails, the tenant becomes Broken. // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
// However, it also becomes Broken if the regular load fails. // However, it also becomes Broken if the regular load fails.
// From Console's perspective there's no practical difference // We would need a separate TenantState variant to distinguish these cases.
// because attachment_status is polled by console only during attach operation execution. // However, there's no practical difference from Console's perspective.
Self::Broken { reason, .. } => Failed { // It will run a Postgres-level health check as soon as it observes Attached.
reason: reason.to_owned(), // That will fail on Broken tenants.
}, // Console can then rollback the attach, or, wait for operator to fix the Broken tenant.
Self::Broken { .. } => Attached,
// Why is Stopping a Maybe case? Because, during pageserver shutdown, // Why is Stopping a Maybe case? Because, during pageserver shutdown,
// we set the Stopping state irrespective of whether the tenant // we set the Stopping state irrespective of whether the tenant
// has finished attaching or not. // has finished attaching or not.
Self::Stopping { .. } => Maybe, Self::Stopping => Maybe,
} }
} }
@@ -148,17 +98,8 @@ impl std::fmt::Debug for TenantState {
} }
} }
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum ActivatingFrom {
/// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`]
Loading,
/// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
Attaching,
}
/// A state of a timeline in pageserver's memory. /// A state of a timeline in pageserver's memory.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TimelineState { pub enum TimelineState {
/// The timeline is recognized by the pageserver but is not yet operational. /// The timeline is recognized by the pageserver but is not yet operational.
/// In particular, the walreceiver connection loop is not running for this timeline. /// In particular, the walreceiver connection loop is not running for this timeline.
@@ -171,14 +112,15 @@ pub enum TimelineState {
/// It cannot transition back into any other state. /// It cannot transition back into any other state.
Stopping, Stopping,
/// The timeline is broken and not operational (previous states: Loading or Active). /// The timeline is broken and not operational (previous states: Loading or Active).
Broken { reason: String, backtrace: String }, Broken,
} }
#[serde_as] #[serde_as]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest { pub struct TimelineCreateRequest {
#[serde_as(as = "DisplayFromStr")] #[serde(default)]
pub new_timeline_id: TimelineId, #[serde_as(as = "Option<DisplayFromStr>")]
pub new_timeline_id: Option<TimelineId>,
#[serde(default)] #[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")] #[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_timeline_id: Option<TimelineId>, pub ancestor_timeline_id: Option<TimelineId>,
@@ -189,25 +131,11 @@ pub struct TimelineCreateRequest {
} }
#[serde_as] #[serde_as]
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize, Default)]
#[serde(deny_unknown_fields)]
pub struct TenantCreateRequest { pub struct TenantCreateRequest {
#[serde_as(as = "DisplayFromStr")] #[serde(default)]
pub new_tenant_id: TenantId, #[serde_as(as = "Option<DisplayFromStr>")]
#[serde(flatten)] pub new_tenant_id: Option<TenantId>,
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
}
impl std::ops::Deref for TenantCreateRequest {
type Target = TenantConfig;
fn deref(&self) -> &Self::Target {
&self.config
}
}
#[derive(Serialize, Deserialize, Debug, Default)]
pub struct TenantConfig {
pub checkpoint_distance: Option<u64>, pub checkpoint_distance: Option<u64>,
pub checkpoint_timeout: Option<String>, pub checkpoint_timeout: Option<String>,
pub compaction_target_size: Option<u64>, pub compaction_target_size: Option<u64>,
@@ -228,7 +156,6 @@ pub struct TenantConfig {
pub eviction_policy: Option<serde_json::Value>, pub eviction_policy: Option<serde_json::Value>,
pub min_resident_size_override: Option<u64>, pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>, pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
} }
#[serde_as] #[serde_as]
@@ -242,35 +169,46 @@ pub struct StatusResponse {
} }
impl TenantCreateRequest { impl TenantCreateRequest {
pub fn new(new_tenant_id: TenantId) -> TenantCreateRequest { pub fn new(new_tenant_id: Option<TenantId>) -> TenantCreateRequest {
TenantCreateRequest { TenantCreateRequest {
new_tenant_id, new_tenant_id,
config: TenantConfig::default(), ..Default::default()
} }
} }
} }
#[serde_as] #[serde_as]
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct TenantConfigRequest { pub struct TenantConfigRequest {
#[serde_as(as = "DisplayFromStr")] #[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId, pub tenant_id: TenantId,
#[serde(flatten)] #[serde(default)]
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it pub checkpoint_distance: Option<u64>,
} pub checkpoint_timeout: Option<String>,
pub compaction_target_size: Option<u64>,
impl std::ops::Deref for TenantConfigRequest { pub compaction_period: Option<String>,
type Target = TenantConfig; pub compaction_threshold: Option<usize>,
pub gc_horizon: Option<u64>,
fn deref(&self) -> &Self::Target { pub gc_period: Option<String>,
&self.config pub image_creation_threshold: Option<usize>,
} pub pitr_interval: Option<String>,
pub walreceiver_connect_timeout: Option<String>,
pub lagging_wal_timeout: Option<String>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
// We defer the parsing of the eviction_policy field to the request handler.
// Otherwise we'd have to move the types for eviction policy into this package.
// We might do that once the eviction feature has stabilizied.
// For now, this field is not even documented in the openapi_spec.yml.
pub eviction_policy: Option<serde_json::Value>,
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
} }
impl TenantConfigRequest { impl TenantConfigRequest {
pub fn new(tenant_id: TenantId) -> TenantConfigRequest { pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
let config = TenantConfig { TenantConfigRequest {
tenant_id,
checkpoint_distance: None, checkpoint_distance: None,
checkpoint_timeout: None, checkpoint_timeout: None,
compaction_target_size: None, compaction_target_size: None,
@@ -287,41 +225,16 @@ impl TenantConfigRequest {
eviction_policy: None, eviction_policy: None,
min_resident_size_override: None, min_resident_size_override: None,
evictions_low_residence_duration_metric_threshold: None, evictions_low_residence_duration_metric_threshold: None,
gc_feedback: None, }
};
TenantConfigRequest { tenant_id, config }
}
}
#[derive(Debug, Serialize, Deserialize)]
pub struct TenantAttachRequest {
pub config: TenantAttachConfig,
}
/// Newtype to enforce deny_unknown_fields on TenantConfig for
/// its usage inside `TenantAttachRequest`.
#[derive(Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct TenantAttachConfig {
#[serde(flatten)]
allowing_unknown_fields: TenantConfig,
}
impl std::ops::Deref for TenantAttachConfig {
type Target = TenantConfig;
fn deref(&self) -> &Self::Target {
&self.allowing_unknown_fields
} }
} }
/// See [`TenantState::attachment_status`] and the OpenAPI docs for context. /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
#[derive(Serialize, Deserialize, Clone)] #[derive(Serialize, Deserialize, Clone)]
#[serde(tag = "slug", content = "data", rename_all = "snake_case")] #[serde(rename_all = "snake_case")]
pub enum TenantAttachmentStatus { pub enum TenantAttachmentStatus {
Maybe, Maybe,
Attached, Attached,
Failed { reason: String },
} }
#[serde_as] #[serde_as]
@@ -417,16 +330,12 @@ pub struct LayerResidenceEvent {
pub reason: LayerResidenceEventReason, pub reason: LayerResidenceEventReason,
} }
/// The reason for recording a given [`LayerResidenceEvent`]. /// The reason for recording a given [`ResidenceEvent`].
#[derive(Debug, Clone, Copy, Serialize, Deserialize)] #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum LayerResidenceEventReason { pub enum LayerResidenceEventReason {
/// The layer map is being populated, e.g. during timeline load or attach. /// The layer map is being populated, e.g. during timeline load or attach.
/// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`]. /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
/// We need to record such events because there is no persistent storage for the events. /// We need to record such events because there is no persistent storage for the events.
///
// https://github.com/rust-lang/rust/issues/74481
/// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
/// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
LayerLoad, LayerLoad,
/// We just created the layer (e.g., freeze_and_flush or compaction). /// We just created the layer (e.g., freeze_and_flush or compaction).
/// Such layers are always [`LayerResidenceStatus::Resident`]. /// Such layers are always [`LayerResidenceStatus::Resident`].
@@ -819,9 +728,7 @@ mod tests {
"slug": "Active", "slug": "Active",
}, },
"current_physical_size": 42, "current_physical_size": 42,
"attachment_status": { "attachment_status": "attached",
"slug":"attached",
}
}); });
let original_broken = TenantInfo { let original_broken = TenantInfo {
@@ -843,9 +750,7 @@ mod tests {
} }
}, },
"current_physical_size": 42, "current_physical_size": 42,
"attachment_status": { "attachment_status": "attached",
"slug":"attached",
}
}); });
assert_eq!( assert_eq!(
@@ -860,100 +765,4 @@ mod tests {
assert!(format!("{:?}", &original_broken.state).contains("reason")); assert!(format!("{:?}", &original_broken.state).contains("reason"));
assert!(format!("{:?}", &original_broken.state).contains("backtrace info")); assert!(format!("{:?}", &original_broken.state).contains("backtrace info"));
} }
#[test]
fn test_reject_unknown_field() {
let id = TenantId::generate();
let create_request = json!({
"new_tenant_id": id.to_string(),
"unknown_field": "unknown_value".to_string(),
});
let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
assert!(
err.to_string().contains("unknown field `unknown_field`"),
"expect unknown field `unknown_field` error, got: {}",
err
);
let id = TenantId::generate();
let config_request = json!({
"tenant_id": id.to_string(),
"unknown_field": "unknown_value".to_string(),
});
let err = serde_json::from_value::<TenantConfigRequest>(config_request).unwrap_err();
assert!(
err.to_string().contains("unknown field `unknown_field`"),
"expect unknown field `unknown_field` error, got: {}",
err
);
let attach_request = json!({
"config": {
"unknown_field": "unknown_value".to_string(),
},
});
let err = serde_json::from_value::<TenantAttachRequest>(attach_request).unwrap_err();
assert!(
err.to_string().contains("unknown field `unknown_field`"),
"expect unknown field `unknown_field` error, got: {}",
err
);
}
#[test]
fn tenantstatus_activating_serde() {
let states = [
TenantState::Activating(ActivatingFrom::Loading),
TenantState::Activating(ActivatingFrom::Attaching),
];
let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
let actual = serde_json::to_string(&states).unwrap();
assert_eq!(actual, expected);
let parsed = serde_json::from_str::<Vec<TenantState>>(&actual).unwrap();
assert_eq!(states.as_slice(), &parsed);
}
#[test]
fn tenantstatus_activating_strum() {
// tests added, because we use these for metrics
let examples = [
(line!(), TenantState::Loading, "Loading"),
(line!(), TenantState::Attaching, "Attaching"),
(
line!(),
TenantState::Activating(ActivatingFrom::Loading),
"Activating",
),
(
line!(),
TenantState::Activating(ActivatingFrom::Attaching),
"Activating",
),
(line!(), TenantState::Active, "Active"),
(
line!(),
TenantState::Stopping {
progress: utils::completion::Barrier::default(),
},
"Stopping",
),
(
line!(),
TenantState::Broken {
reason: "Example".into(),
backtrace: "Looooong backtrace".into(),
},
"Broken",
),
];
for (line, rendered, expected) in examples {
let actual: &'static str = rendered.into();
assert_eq!(actual, expected, "example on {line}");
}
}
} }

View File

@@ -60,9 +60,8 @@ impl Ord for RelTag {
/// Display RelTag in the same format that's used in most PostgreSQL debug messages: /// Display RelTag in the same format that's used in most PostgreSQL debug messages:
/// ///
/// ```text
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init] /// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
/// ``` ///
impl fmt::Display for RelTag { impl fmt::Display for RelTag {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if let Some(forkname) = forknumber_to_name(self.forknum) { if let Some(forkname) = forknumber_to_name(self.forknum) {

View File

@@ -24,6 +24,7 @@ workspace_hack.workspace = true
[dev-dependencies] [dev-dependencies]
env_logger.workspace = true env_logger.workspace = true
postgres.workspace = true postgres.workspace = true
wal_craft = { path = "wal_craft" }
[build-dependencies] [build-dependencies]
anyhow.workspace = true anyhow.workspace = true

View File

@@ -33,7 +33,6 @@ macro_rules! postgres_ffi {
} }
pub mod controlfile_utils; pub mod controlfile_utils;
pub mod nonrelfile_utils; pub mod nonrelfile_utils;
pub mod wal_craft_test_export;
pub mod waldecoder_handler; pub mod waldecoder_handler;
pub mod xlog_utils; pub mod xlog_utils;
@@ -46,15 +45,8 @@ macro_rules! postgres_ffi {
}; };
} }
#[macro_export] postgres_ffi!(v14);
macro_rules! for_all_postgres_versions { postgres_ffi!(v15);
($macro:tt) => {
$macro!(v14);
$macro!(v15);
};
}
for_all_postgres_versions! { postgres_ffi }
pub mod pg_constants; pub mod pg_constants;
pub mod relfile_utils; pub mod relfile_utils;

View File

@@ -57,9 +57,9 @@ pub fn slru_may_delete_clogsegment(segpage: u32, cutoff_page: u32) -> bool {
// Multixact utils // Multixact utils
pub fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize { pub fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
% pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32 % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE
* pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) as usize * pg_constants::MULTIXACT_MEMBERGROUP_SIZE) as usize
} }
pub fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 { pub fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
@@ -81,41 +81,3 @@ fn mx_offset_to_member_page(xid: u32) -> u32 {
pub fn mx_offset_to_member_segment(xid: u32) -> i32 { pub fn mx_offset_to_member_segment(xid: u32) -> i32 {
(mx_offset_to_member_page(xid) / pg_constants::SLRU_PAGES_PER_SEGMENT) as i32 (mx_offset_to_member_page(xid) / pg_constants::SLRU_PAGES_PER_SEGMENT) as i32
} }
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_multixid_calc() {
// Check that the mx_offset_* functions produce the same values as the
// corresponding PostgreSQL C macros (MXOffsetTo*). These test values
// were generated by calling the PostgreSQL macros with a little C
// program.
assert_eq!(mx_offset_to_member_segment(0), 0);
assert_eq!(mx_offset_to_member_page(0), 0);
assert_eq!(mx_offset_to_flags_offset(0), 0);
assert_eq!(mx_offset_to_flags_bitshift(0), 0);
assert_eq!(mx_offset_to_member_offset(0), 4);
assert_eq!(mx_offset_to_member_segment(1), 0);
assert_eq!(mx_offset_to_member_page(1), 0);
assert_eq!(mx_offset_to_flags_offset(1), 0);
assert_eq!(mx_offset_to_flags_bitshift(1), 8);
assert_eq!(mx_offset_to_member_offset(1), 8);
assert_eq!(mx_offset_to_member_segment(123456789), 2358);
assert_eq!(mx_offset_to_member_page(123456789), 75462);
assert_eq!(mx_offset_to_flags_offset(123456789), 4780);
assert_eq!(mx_offset_to_flags_bitshift(123456789), 8);
assert_eq!(mx_offset_to_member_offset(123456789), 4788);
assert_eq!(mx_offset_to_member_segment(u32::MAX - 1), 82040);
assert_eq!(mx_offset_to_member_page(u32::MAX - 1), 2625285);
assert_eq!(mx_offset_to_flags_offset(u32::MAX - 1), 5160);
assert_eq!(mx_offset_to_flags_bitshift(u32::MAX - 1), 16);
assert_eq!(mx_offset_to_member_offset(u32::MAX - 1), 5172);
assert_eq!(mx_offset_to_member_segment(u32::MAX), 82040);
assert_eq!(mx_offset_to_member_page(u32::MAX), 2625285);
assert_eq!(mx_offset_to_flags_offset(u32::MAX), 5160);
assert_eq!(mx_offset_to_flags_bitshift(u32::MAX), 24);
assert_eq!(mx_offset_to_member_offset(u32::MAX), 5176);
}
}

View File

@@ -49,16 +49,14 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
} }
} }
///
/// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple. /// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
/// ///
/// Formats: /// Formats:
///
/// ```text
/// <oid> /// <oid>
/// <oid>_<fork name> /// <oid>_<fork name>
/// <oid>.<segment number> /// <oid>.<segment number>
/// <oid>_<fork name>.<segment number> /// <oid>_<fork name>.<segment number>
/// ```
/// ///
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources. /// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
/// ///

View File

@@ -1,6 +0,0 @@
//! This module is for WAL craft to test with postgres_ffi. Should not import any thing in normal usage.
pub use super::PG_MAJORVERSION;
pub use super::xlog_utils::*;
pub use super::bindings::*;
pub use crate::WAL_SEGMENT_SIZE;

View File

@@ -481,4 +481,220 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
wal wal
} }
// If you need to craft WAL and write tests for this module, put it at wal_craft crate. #[cfg(test)]
mod tests {
use super::super::PG_MAJORVERSION;
use super::*;
use regex::Regex;
use std::cmp::min;
use std::fs;
use std::{env, str::FromStr};
use utils::const_assert;
/// Install an env_logger for the tests, defaulting to trace-level output
/// for this crate's xlog_utils module. Safe to call from every test: a
/// second installation attempt simply fails and the error is discarded.
fn init_logging() {
    let default_filter =
        format!("wal_craft=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace");
    let env = env_logger::Env::default().default_filter_or(default_filter);
    let _ = env_logger::Builder::from_env(env).is_test(true).try_init();
}
/// End-to-end check of `find_end_of_wal` against WAL produced by a real
/// Postgres server, where the WAL shape is produced by the given
/// `wal_craft::Crafter` implementation.
fn test_end_of_wal<C: wal_craft::Crafter>(test_name: &str) {
    use wal_craft::*;

    // PG_MAJORVERSION is sliced at [1..3] to get the numeric major version
    // (assumes a "v" prefix followed by two digits — e.g. "v15").
    let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();

    // Craft some WAL
    // Each test gets its own datadir under the repo's test_output directory,
    // keyed by test name and Postgres major version.
    let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("..")
        .join("..");
    let cfg = Conf {
        pg_version,
        pg_distrib_dir: top_path.join("pg_install"),
        datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
    };
    // Start from a clean data directory on every run.
    if cfg.datadir.exists() {
        fs::remove_dir_all(&cfg.datadir).unwrap();
    }
    cfg.initdb().unwrap();
    let srv = cfg.start_server().unwrap();
    // The crafter reports the intermediate LSNs it produced plus the
    // expected end-of-WAL position.
    let (intermediate_lsns, expected_end_of_wal_partial) =
        C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
    let intermediate_lsns: Vec<Lsn> = intermediate_lsns
        .iter()
        .map(|&lsn| u64::from(lsn).into())
        .collect();
    let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
    srv.kill();

    // Check find_end_of_wal on the initial WAL
    // The latest segment is the lexicographically greatest XLOG file name.
    let last_segment = cfg
        .wal_dir()
        .read_dir()
        .unwrap()
        .map(|f| f.unwrap().file_name().into_string().unwrap())
        .filter(|fname| IsXLogFileName(fname))
        .max()
        .unwrap();
    check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);

    for start_lsn in intermediate_lsns
        .iter()
        .chain(std::iter::once(&expected_end_of_wal))
    {
        // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
        // We assume that `start_lsn` is non-decreasing.
        info!(
            "Checking with start_lsn={}, erasing WAL before it",
            start_lsn
        );
        for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
            let fname = file.file_name().into_string().unwrap();
            if !IsXLogFileName(&fname) {
                continue;
            }
            // Zero out the portion of every segment that lies before
            // start_lsn; segments that begin after start_lsn are untouched.
            let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
            let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
            if seg_start_lsn > u64::from(*start_lsn) {
                continue;
            }
            let mut f = File::options().write(true).open(file.path()).unwrap();
            const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
            f.write_all(
                &ZEROS[0..min(
                    WAL_SEGMENT_SIZE,
                    (u64::from(*start_lsn) - seg_start_lsn) as usize,
                )],
            )
            .unwrap();
        }
        check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
    }
}
/// Cross-check the expected end of WAL with `pg_waldump`: run it over the
/// crafted segments and parse the LSN out of its
/// "invalid record length at <lsn>" stderr message, which marks where
/// valid WAL ends.
fn check_pg_waldump_end_of_wal(
    cfg: &wal_craft::Conf,
    last_segment: &str,
    expected_end_of_wal: Lsn,
) {
    // Get the actual end of WAL by pg_waldump
    let waldump_output = cfg
        .pg_waldump("000000010000000000000001", last_segment)
        .unwrap()
        .stderr;
    let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
    let caps = match Regex::new(r"invalid record length at (.+):")
        .unwrap()
        .captures(waldump_output)
    {
        Some(caps) => caps,
        None => {
            // Dump the full stderr before panicking so mismatches are
            // debuggable from the test log.
            error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
            panic!();
        }
    };
    let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
    info!(
        "waldump erred on {}, expected wal end at {}",
        waldump_wal_end, expected_end_of_wal
    );
    assert_eq!(waldump_wal_end, expected_end_of_wal);
}
/// Assert that `find_end_of_wal`, scanning from `start_lsn`, reports
/// `expected_end_of_wal`. The last segment is temporarily renamed to
/// `<name>.partial` so the scan treats it as incomplete, then renamed back
/// so subsequent iterations see the original layout.
fn check_end_of_wal(
    cfg: &wal_craft::Conf,
    last_segment: &str,
    start_lsn: Lsn,
    expected_end_of_wal: Lsn,
) {
    // Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
    // let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
    // info!(
    //     "find_end_of_wal returned wal_end={} with non-partial WAL segment",
    //     wal_end
    // );
    // assert_eq!(wal_end, expected_end_of_wal_non_partial);

    // Rename file to partial to actually find last valid lsn, then rename it back.
    fs::rename(
        cfg.wal_dir().join(last_segment),
        cfg.wal_dir().join(format!("{}.partial", last_segment)),
    )
    .unwrap();
    let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
    info!(
        "find_end_of_wal returned wal_end={} with partial WAL segment",
        wal_end
    );
    assert_eq!(wal_end, expected_end_of_wal);
    fs::rename(
        cfg.wal_dir().join(format!("{}.partial", last_segment)),
        cfg.wal_dir().join(last_segment),
    )
    .unwrap();
}
// Compile-time check that these tests run with the standard 16 MiB WAL
// segment size (the zero-fill buffer in test_end_of_wal is sized from it).
const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);

// One test per crafter: each produces a differently-shaped WAL tail.
#[test]
pub fn test_find_end_of_wal_simple() {
    init_logging();
    test_end_of_wal::<wal_craft::Simple>("test_find_end_of_wal_simple");
}

#[test]
pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
    init_logging();
    test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(
        "test_find_end_of_wal_crossing_segment_followed_by_small_one",
    );
}

#[test]
pub fn test_find_end_of_wal_last_crossing_segment() {
    init_logging();
    test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(
        "test_find_end_of_wal_last_crossing_segment",
    );
}
/// Check the math in update_next_xid
///
/// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
/// currently 1024.
#[test]
pub fn test_update_next_xid() {
    // Decode an all-zeroes buffer to obtain a blank CheckPoint, then plant
    // a known starting nextXid.
    let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
    let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
    checkpoint.nextXid = FullTransactionId { value: 10 };
    assert_eq!(checkpoint.nextXid.value, 10);

    // Feed observed XIDs in order and check the resulting nextXid after
    // each step: inputs round up to the next XID_CHECKPOINT_INTERVAL
    // boundary, and landing exactly on the boundary advances to the
    // following interval (update_next_xid yields the *next* XID).
    for (seen_xid, expected_next) in [
        (100, 1024),  // rounded up to the interval boundary
        (500, 1024),  // no change
        (1023, 1024), // still below the boundary
        (1024, 2048), // exactly on the boundary -> next interval
    ] {
        checkpoint.update_next_xid(seen_xid);
        assert_eq!(checkpoint.nextXid.value, expected_next);
    }
}
/// Golden test: encoding the ("prefix", "message") logical message must
/// reproduce this exact byte sequence.
#[test]
pub fn test_encode_logical_message() {
    let expected: [u8; 64] = [
        64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
        38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
        101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
    ];
    let actual = encode_logical_message("prefix", "message");
    assert_eq!(&actual[..], &expected[..]);
}
}

View File

@@ -15,7 +15,3 @@ postgres_ffi.workspace = true
tempfile.workspace = true tempfile.workspace = true
workspace_hack.workspace = true workspace_hack.workspace = true
[dev-dependencies]
regex.workspace = true
utils.workspace = true

View File

@@ -10,20 +10,6 @@ use std::process::Command;
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
use tempfile::{tempdir, TempDir}; use tempfile::{tempdir, TempDir};
// Instantiate the shared xlog_utils test source once per supported
// Postgres major version: each expansion creates a module named after the
// version, re-exports that version's wal_craft test symbols, and includes
// the sibling `xlog_utils_test` module from the same directory.
macro_rules! xlog_utils_test {
    ($version:ident) => {
        #[path = "."]
        mod $version {
            pub use postgres_ffi::$version::wal_craft_test_export::*;
            #[allow(clippy::duplicate_mod)]
            #[cfg(test)]
            mod xlog_utils_test;
        }
    };
}
postgres_ffi::for_all_postgres_versions! { xlog_utils_test }
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq, Eq)]
pub struct Conf { pub struct Conf {
pub pg_version: u32, pub pg_version: u32,

View File

@@ -1,219 +0,0 @@
//! Tests for postgres_ffi xlog_utils module. Put it here to break cyclic dependency.

use super::*;
use crate::{error, info};
use regex::Regex;
use std::cmp::min;
use std::fs::{self, File};
use std::io::Write;
use std::{env, str::FromStr};
use utils::const_assert;
use utils::lsn::Lsn;

/// Install an env_logger defaulting to trace output for this crate's
/// xlog_utils module; a repeated installation attempt is silently ignored.
fn init_logging() {
    let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
        format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
    ))
    .is_test(true)
    .try_init();
}
/// End-to-end check of `find_end_of_wal` against WAL produced by a real
/// Postgres server, where the WAL shape is produced by the given
/// `Crafter` implementation.
fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    use crate::*;

    // PG_MAJORVERSION is sliced at [1..3] to get the numeric major version
    // (assumes a "v" prefix followed by two digits — e.g. "v15").
    let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();

    // Craft some WAL
    // Each test gets its own datadir under the repo's test_output directory,
    // keyed by test name and Postgres major version.
    let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("..")
        .join("..")
        .join("..");
    let cfg = Conf {
        pg_version,
        pg_distrib_dir: top_path.join("pg_install"),
        datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
    };
    // Start from a clean data directory on every run.
    if cfg.datadir.exists() {
        fs::remove_dir_all(&cfg.datadir).unwrap();
    }
    cfg.initdb().unwrap();
    let srv = cfg.start_server().unwrap();
    // The crafter reports the intermediate LSNs it produced plus the
    // expected end-of-WAL position.
    let (intermediate_lsns, expected_end_of_wal_partial) =
        C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
    let intermediate_lsns: Vec<Lsn> = intermediate_lsns
        .iter()
        .map(|&lsn| u64::from(lsn).into())
        .collect();
    let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
    srv.kill();

    // Check find_end_of_wal on the initial WAL
    // The latest segment is the lexicographically greatest XLOG file name.
    let last_segment = cfg
        .wal_dir()
        .read_dir()
        .unwrap()
        .map(|f| f.unwrap().file_name().into_string().unwrap())
        .filter(|fname| IsXLogFileName(fname))
        .max()
        .unwrap();
    check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);

    for start_lsn in intermediate_lsns
        .iter()
        .chain(std::iter::once(&expected_end_of_wal))
    {
        // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
        // We assume that `start_lsn` is non-decreasing.
        info!(
            "Checking with start_lsn={}, erasing WAL before it",
            start_lsn
        );
        for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
            let fname = file.file_name().into_string().unwrap();
            if !IsXLogFileName(&fname) {
                continue;
            }
            // Zero out the portion of every segment that lies before
            // start_lsn; segments that begin after start_lsn are untouched.
            let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
            let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
            if seg_start_lsn > u64::from(*start_lsn) {
                continue;
            }
            let mut f = File::options().write(true).open(file.path()).unwrap();
            const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
            f.write_all(
                &ZEROS[0..min(
                    WAL_SEGMENT_SIZE,
                    (u64::from(*start_lsn) - seg_start_lsn) as usize,
                )],
            )
            .unwrap();
        }
        check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
    }
}
/// Cross-check the expected end of WAL with `pg_waldump`: run it over the
/// crafted segments and parse the LSN out of its
/// "invalid record length at <lsn>" stderr message, which marks where
/// valid WAL ends.
fn check_pg_waldump_end_of_wal(
    cfg: &crate::Conf,
    last_segment: &str,
    expected_end_of_wal: Lsn,
) {
    // Get the actual end of WAL by pg_waldump
    let waldump_output = cfg
        .pg_waldump("000000010000000000000001", last_segment)
        .unwrap()
        .stderr;
    let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
    let caps = match Regex::new(r"invalid record length at (.+):")
        .unwrap()
        .captures(waldump_output)
    {
        Some(caps) => caps,
        None => {
            // Dump the full stderr before panicking so mismatches are
            // debuggable from the test log.
            error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
            panic!();
        }
    };
    let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
    info!(
        "waldump erred on {}, expected wal end at {}",
        waldump_wal_end, expected_end_of_wal
    );
    assert_eq!(waldump_wal_end, expected_end_of_wal);
}
/// Assert that `find_end_of_wal`, scanning from `start_lsn`, reports
/// `expected_end_of_wal`. The last segment is temporarily renamed to
/// `<name>.partial` so the scan treats it as incomplete, then renamed back
/// so subsequent iterations see the original layout.
fn check_end_of_wal(
    cfg: &crate::Conf,
    last_segment: &str,
    start_lsn: Lsn,
    expected_end_of_wal: Lsn,
) {
    // Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
    // let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
    // info!(
    //     "find_end_of_wal returned wal_end={} with non-partial WAL segment",
    //     wal_end
    // );
    // assert_eq!(wal_end, expected_end_of_wal_non_partial);

    // Rename file to partial to actually find last valid lsn, then rename it back.
    fs::rename(
        cfg.wal_dir().join(last_segment),
        cfg.wal_dir().join(format!("{}.partial", last_segment)),
    )
    .unwrap();
    let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
    info!(
        "find_end_of_wal returned wal_end={} with partial WAL segment",
        wal_end
    );
    assert_eq!(wal_end, expected_end_of_wal);
    fs::rename(
        cfg.wal_dir().join(format!("{}.partial", last_segment)),
        cfg.wal_dir().join(last_segment),
    )
    .unwrap();
}
// Compile-time check that these tests run with the standard 16 MiB WAL
// segment size (the zero-fill buffer in test_end_of_wal is sized from it).
const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);

// One test per crafter: each produces a differently-shaped WAL tail.
#[test]
pub fn test_find_end_of_wal_simple() {
    init_logging();
    test_end_of_wal::<crate::Simple>("test_find_end_of_wal_simple");
}

#[test]
pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
    init_logging();
    test_end_of_wal::<crate::WalRecordCrossingSegmentFollowedBySmallOne>(
        "test_find_end_of_wal_crossing_segment_followed_by_small_one",
    );
}

#[test]
pub fn test_find_end_of_wal_last_crossing_segment() {
    init_logging();
    test_end_of_wal::<crate::LastWalRecordCrossingSegment>(
        "test_find_end_of_wal_last_crossing_segment",
    );
}
/// Check the math in update_next_xid
///
/// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
/// currently 1024.
#[test]
pub fn test_update_next_xid() {
    // Decode an all-zeroes buffer to obtain a blank CheckPoint, then plant
    // a known starting nextXid.
    let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
    let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
    checkpoint.nextXid = FullTransactionId { value: 10 };
    assert_eq!(checkpoint.nextXid.value, 10);
    // The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
    // boundary
    checkpoint.update_next_xid(100);
    assert_eq!(checkpoint.nextXid.value, 1024);
    // No change
    checkpoint.update_next_xid(500);
    assert_eq!(checkpoint.nextXid.value, 1024);
    checkpoint.update_next_xid(1023);
    assert_eq!(checkpoint.nextXid.value, 1024);
    // The function returns the *next* XID, given the highest XID seen so
    // far. So when we pass 1024, the nextXid gets bumped up to the next
    // XID_CHECKPOINT_INTERVAL boundary.
    checkpoint.update_next_xid(1024);
    assert_eq!(checkpoint.nextXid.value, 2048);
}
/// Golden test: encoding the ("prefix", "message") logical message must
/// reproduce this exact byte sequence.
#[test]
pub fn test_encode_logical_message() {
    let expected = [
        64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
        38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
        101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
    ];
    let actual = encode_logical_message("prefix", "message");
    assert_eq!(expected, actual[..]);
}

View File

@@ -5,11 +5,11 @@
//! It is similar to what tokio_util::codec::Framed with appropriate codec //! It is similar to what tokio_util::codec::Framed with appropriate codec
//! provides, but `FramedReader` and `FramedWriter` read/write parts can be used //! provides, but `FramedReader` and `FramedWriter` read/write parts can be used
//! separately without using split from futures::stream::StreamExt (which //! separately without using split from futures::stream::StreamExt (which
//! allocates a [Box] in polling internally). tokio::io::split is used for splitting //! allocates box[1] in polling internally). tokio::io::split is used for splitting
//! instead. Plus we customize error messages more than a single type for all io //! instead. Plus we customize error messages more than a single type for all io
//! calls. //! calls.
//! //!
//! [Box]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107 //! [1] https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
use bytes::{Buf, BytesMut}; use bytes::{Buf, BytesMut};
use std::{ use std::{
future::Future, future::Future,
@@ -117,7 +117,7 @@ impl<S: AsyncWrite + Unpin> Framed<S> {
impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> { impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
/// Split into owned read and write parts. Beware of potential issues with /// Split into owned read and write parts. Beware of potential issues with
/// using halves in different tasks on TLS stream: /// using halves in different tasks on TLS stream:
/// <https://github.com/tokio-rs/tls/issues/40> /// https://github.com/tokio-rs/tls/issues/40
pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) { pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) {
let (read_half, write_half) = tokio::io::split(self.stream); let (read_half, write_half) = tokio::io::split(self.stream);
let reader = FramedReader { let reader = FramedReader {

View File

@@ -179,7 +179,7 @@ pub struct FeExecuteMessage {
#[derive(Debug)] #[derive(Debug)]
pub struct FeCloseMessage; pub struct FeCloseMessage;
/// An error occurred while parsing or serializing raw stream into Postgres /// An error occured while parsing or serializing raw stream into Postgres
/// messages. /// messages.
#[derive(thiserror::Error, Debug)] #[derive(thiserror::Error, Debug)]
pub enum ProtocolError { pub enum ProtocolError {
@@ -934,15 +934,6 @@ impl<'a> BeMessage<'a> {
} }
} }
/// Return the 5-byte `code` followed by a single zero byte
/// (NOTE(review): callers appear to send this as a C-style string on the
/// wire — confirm against the message-serialization call sites).
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
    let mut terminated = [0; 6];
    for (i, &elem) in code.iter().enumerate() {
        terminated[i] = elem;
    }
    terminated
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@@ -974,3 +965,12 @@ mod tests {
assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]); assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
} }
} }
/// Return the 5-byte `code` followed by a single zero byte
/// (NOTE(review): callers appear to send this as a C-style string on the
/// wire — confirm against the message-serialization call sites).
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
    let mut terminated = [0u8; 6];
    terminated[..5].copy_from_slice(code);
    terminated
}

View File

@@ -12,7 +12,6 @@ aws-smithy-http.workspace = true
aws-types.workspace = true aws-types.workspace = true
aws-config.workspace = true aws-config.workspace = true
aws-sdk-s3.workspace = true aws-sdk-s3.workspace = true
aws-credential-types.workspace = true
hyper = { workspace = true, features = ["stream"] } hyper = { workspace = true, features = ["stream"] }
serde.workspace = true serde.workspace = true
serde_json.workspace = true serde_json.workspace = true

View File

@@ -34,12 +34,12 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
/// Currently, sync happens with AWS S3, that has two limits on requests per second: /// Currently, sync happens with AWS S3, that has two limits on requests per second:
/// ~200 RPS for IAM services /// ~200 RPS for IAM services
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html> /// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/> /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
/// No limits on the client side, which currenltly means 1000 for AWS S3. /// No limits on the client side, which currenltly means 1000 for AWS S3.
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax> /// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None; pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
@@ -50,12 +50,6 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct RemotePath(PathBuf); pub struct RemotePath(PathBuf);
impl std::fmt::Display for RemotePath {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0.display())
}
}
impl RemotePath { impl RemotePath {
pub fn new(relative_path: &Path) -> anyhow::Result<Self> { pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
anyhow::ensure!( anyhow::ensure!(
@@ -76,14 +70,6 @@ impl RemotePath {
pub fn join(&self, segment: &Path) -> Self { pub fn join(&self, segment: &Path) -> Self {
Self(self.0.join(segment)) Self(self.0.join(segment))
} }
pub fn get_path(&self) -> &PathBuf {
&self.0
}
pub fn extension(&self) -> Option<&str> {
self.0.extension()?.to_str()
}
} }
/// Storage (potentially remote) API to manage its state. /// Storage (potentially remote) API to manage its state.
@@ -100,19 +86,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
prefix: Option<&RemotePath>, prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError>; ) -> Result<Vec<RemotePath>, DownloadError>;
/// Lists all files in directory "recursively"
/// (not really recursively, because AWS has a flat namespace)
/// Note: This is subtely different than list_prefixes,
/// because it is for listing files instead of listing
/// names sharing common prefixes.
/// For example,
/// list_files("foo/bar") = ["foo/bar/cat123.txt",
/// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
/// whereas,
/// list_prefixes("foo/bar/") = ["cat", "dog"]
/// See `test_real_s3.rs` for more details.
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
/// Streams the local file contents into remote into the remote storage entry. /// Streams the local file contents into remote into the remote storage entry.
async fn upload( async fn upload(
&self, &self,
@@ -138,8 +111,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
) -> Result<Download, DownloadError>; ) -> Result<Download, DownloadError>;
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>; async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
} }
pub struct Download { pub struct Download {
@@ -201,14 +172,6 @@ impl GenericRemoteStorage {
} }
} }
/// Forward `list_files` to whichever backend variant this enum holds.
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
    match self {
        Self::LocalFs(s) => s.list_files(folder).await,
        Self::AwsS3(s) => s.list_files(folder).await,
        Self::Unreliable(s) => s.list_files(folder).await,
    }
}
pub async fn upload( pub async fn upload(
&self, &self,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static, from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -260,14 +223,6 @@ impl GenericRemoteStorage {
Self::Unreliable(s) => s.delete(path).await, Self::Unreliable(s) => s.delete(path).await,
} }
} }
/// Forward batch deletion of `paths` to whichever backend variant this
/// enum holds.
pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
    match self {
        Self::LocalFs(s) => s.delete_objects(paths).await,
        Self::AwsS3(s) => s.delete_objects(paths).await,
        Self::Unreliable(s) => s.delete_objects(paths).await,
    }
}
} }
impl GenericRemoteStorage { impl GenericRemoteStorage {

View File

@@ -7,7 +7,6 @@
use std::{ use std::{
borrow::Cow, borrow::Cow,
future::Future, future::Future,
io::ErrorKind,
path::{Path, PathBuf}, path::{Path, PathBuf},
pin::Pin, pin::Pin,
}; };
@@ -18,7 +17,7 @@ use tokio::{
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
}; };
use tracing::*; use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; use utils::crashsafe::path_with_suffix_extension;
use crate::{Download, DownloadError, RemotePath}; use crate::{Download, DownloadError, RemotePath};
@@ -49,14 +48,6 @@ impl LocalFs {
Ok(Self { storage_root }) Ok(Self { storage_root })
} }
// mirrors S3Bucket::s3_object_to_relative_path
// Convert an absolute path under `storage_root` into a root-relative
// RemotePath; panics if `key` is not located under the storage root.
fn local_file_to_relative_path(&self, key: PathBuf) -> RemotePath {
    let relative_path = key
        .strip_prefix(&self.storage_root)
        .expect("relative path must contain storage_root as prefix");
    RemotePath(relative_path.into())
}
async fn read_storage_metadata( async fn read_storage_metadata(
&self, &self,
file_path: &Path, file_path: &Path,
@@ -110,60 +101,19 @@ impl RemoteStorage for LocalFs {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root), None => Cow::Borrowed(&self.storage_root),
}; };
Ok(get_all_files(path.as_ref(), false)
let prefixes_to_filter = get_all_files(path.as_ref(), false)
.await .await
.map_err(DownloadError::Other)?; .map_err(DownloadError::Other)?
.into_iter()
let mut prefixes = Vec::with_capacity(prefixes_to_filter.len()); .map(|path| {
path.strip_prefix(&self.storage_root)
// filter out empty directories to mirror s3 behavior. .context("Failed to strip preifix")
for prefix in prefixes_to_filter {
if prefix.is_dir()
&& is_directory_empty(&prefix)
.await
.map_err(DownloadError::Other)?
{
continue;
}
prefixes.push(
prefix
.strip_prefix(&self.storage_root)
.context("Failed to strip prefix")
.and_then(RemotePath::new) .and_then(RemotePath::new)
.expect( .expect(
"We list files for storage root, hence should be able to remote the prefix", "We list files for storage root, hence should be able to remote the prefix",
), )
) })
} .collect())
Ok(prefixes)
}
// recursively lists all files in a directory,
// mirroring the `list_files` for `s3_bucket`
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let full_path = match folder {
Some(folder) => folder.with_base(&self.storage_root),
None => self.storage_root.clone(),
};
let mut files = vec![];
let mut directory_queue = vec![full_path.clone()];
while let Some(cur_folder) = directory_queue.pop() {
let mut entries = fs::read_dir(cur_folder.clone()).await?;
while let Some(entry) = entries.next_entry().await? {
let file_name: PathBuf = entry.file_name().into();
let full_file_name = cur_folder.clone().join(&file_name);
let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
files.push(file_remote_path.clone());
if full_file_name.is_dir() {
directory_queue.push(full_file_name);
}
}
}
Ok(files)
} }
async fn upload( async fn upload(
@@ -341,22 +291,12 @@ impl RemoteStorage for LocalFs {
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
let file_path = path.with_base(&self.storage_root); let file_path = path.with_base(&self.storage_root);
match fs::remove_file(&file_path).await { if file_path.exists() && file_path.is_file() {
Ok(()) => Ok(()), Ok(fs::remove_file(file_path).await?)
// The file doesn't exist. This shouldn't yield an error to mirror S3's behaviour. } else {
// See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html bail!("File {file_path:?} either does not exist or is not a file")
// > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
Err(e) if e.kind() == ErrorKind::NotFound => Ok(()),
Err(e) => Err(anyhow::anyhow!(e)),
} }
} }
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
for path in paths {
self.delete(path).await?
}
Ok(())
}
} }
fn storage_metadata_path(original_path: &Path) -> PathBuf { fn storage_metadata_path(original_path: &Path) -> PathBuf {
@@ -380,7 +320,7 @@ where
let file_type = dir_entry.file_type().await?; let file_type = dir_entry.file_type().await?;
let entry_path = dir_entry.path(); let entry_path = dir_entry.path();
if file_type.is_symlink() { if file_type.is_symlink() {
debug!("{entry_path:?} is a symlink, skipping") debug!("{entry_path:?} us a symlink, skipping")
} else if file_type.is_dir() { } else if file_type.is_dir() {
if recursive { if recursive {
paths.extend(get_all_files(&entry_path, true).await?.into_iter()) paths.extend(get_all_files(&entry_path, true).await?.into_iter())
@@ -655,11 +595,15 @@ mod fs_tests {
storage.delete(&upload_target).await?; storage.delete(&upload_target).await?;
assert!(storage.list().await?.is_empty()); assert!(storage.list().await?.is_empty());
storage match storage.delete(&upload_target).await {
.delete(&upload_target) Ok(()) => panic!("Should not allow deleting non-existing storage files"),
.await Err(e) => {
.expect("Should allow deleting non-existing storage files"); let error_string = e.to_string();
assert!(error_string.contains("does not exist"));
let expected_path = upload_target.with_base(&storage.storage_root);
assert!(error_string.contains(expected_path.to_str().unwrap()));
}
}
Ok(()) Ok(())
} }

View File

@@ -9,16 +9,14 @@ use std::sync::Arc;
use anyhow::Context; use anyhow::Context;
use aws_config::{ use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider, environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain, imds::credentials::ImdsCredentialsProvider,
meta::credentials::{CredentialsProviderChain, LazyCachingCredentialsProvider},
}; };
use aws_credential_types::cache::CredentialsCache;
use aws_sdk_s3::{ use aws_sdk_s3::{
config::{Config, Region}, config::Config,
error::SdkError, error::{GetObjectError, GetObjectErrorKind},
operation::get_object::GetObjectError, types::{ByteStream, SdkError},
primitives::ByteStream, Client, Endpoint, Region,
types::{Delete, ObjectIdentifier},
Client,
}; };
use aws_smithy_http::body::SdkBody; use aws_smithy_http::body::SdkBody;
use hyper::Body; use hyper::Body;
@@ -34,8 +32,6 @@ use crate::{
Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR, Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
}; };
const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
pub(super) mod metrics { pub(super) mod metrics {
use metrics::{register_int_counter_vec, IntCounterVec}; use metrics::{register_int_counter_vec, IntCounterVec};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
@@ -84,24 +80,12 @@ pub(super) mod metrics {
.inc(); .inc();
} }
pub fn inc_delete_objects(count: u64) {
S3_REQUESTS_COUNT
.with_label_values(&["delete_object"])
.inc_by(count);
}
pub fn inc_delete_object_fail() { pub fn inc_delete_object_fail() {
S3_REQUESTS_FAIL_COUNT S3_REQUESTS_FAIL_COUNT
.with_label_values(&["delete_object"]) .with_label_values(&["delete_object"])
.inc(); .inc();
} }
pub fn inc_delete_objects_fail(count: u64) {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["delete_object"])
.inc_by(count);
}
pub fn inc_list_objects() { pub fn inc_list_objects() {
S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc(); S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
} }
@@ -141,23 +125,28 @@ impl S3Bucket {
let credentials_provider = { let credentials_provider = {
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
CredentialsProviderChain::first_try( let env_creds = EnvironmentVariableCredentialsProvider::new();
"env",
EnvironmentVariableCredentialsProvider::new(),
)
// uses imds v2 // uses imds v2
.or_else("imds", ImdsCredentialsProvider::builder().build()) let imds = ImdsCredentialsProvider::builder().build();
// finally add caching.
// this might change in future, see https://github.com/awslabs/aws-sdk-rust/issues/629
LazyCachingCredentialsProvider::builder()
.load(CredentialsProviderChain::first_try("env", env_creds).or_else("imds", imds))
.build()
}; };
let mut config_builder = Config::builder() let mut config_builder = Config::builder()
.region(Region::new(aws_config.bucket_region.clone())) .region(Region::new(aws_config.bucket_region.clone()))
.credentials_cache(CredentialsCache::lazy())
.credentials_provider(credentials_provider); .credentials_provider(credentials_provider);
if let Some(custom_endpoint) = aws_config.endpoint.clone() { if let Some(custom_endpoint) = aws_config.endpoint.clone() {
config_builder = config_builder let endpoint = Endpoint::immutable(
.endpoint_url(custom_endpoint) custom_endpoint
.force_path_style(true); .parse()
.expect("Failed to parse S3 custom endpoint"),
);
config_builder.set_endpoint_resolver(Some(Arc::new(endpoint)));
} }
let client = Client::from_conf(config_builder.build()); let client = Client::from_conf(config_builder.build());
@@ -200,17 +189,13 @@ impl S3Bucket {
) )
} }
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
let path_string = path for segment in path.0.iter() {
.get_path() full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
.to_string_lossy() full_path.push_str(segment.to_str().unwrap_or_default());
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
.to_string();
match &self.prefix_in_bucket {
Some(prefix) => prefix.clone() + "/" + &path_string,
None => path_string,
} }
full_path
} }
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> { async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
@@ -244,9 +229,14 @@ impl S3Bucket {
))), ))),
}) })
} }
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => { Err(SdkError::ServiceError {
Err(DownloadError::NotFound) err:
} GetObjectError {
kind: GetObjectErrorKind::NoSuchKey(..),
..
},
..
}) => Err(DownloadError::NotFound),
Err(e) => { Err(e) => {
metrics::inc_get_object_fail(); metrics::inc_get_object_fail();
Err(DownloadError::Other(anyhow::anyhow!( Err(DownloadError::Other(anyhow::anyhow!(
@@ -351,51 +341,6 @@ impl RemoteStorage for S3Bucket {
Ok(document_keys) Ok(document_keys)
} }
/// See the doc for `RemoteStorage::list_files`
///
/// Lists every object key under `folder` (or under the configured bucket
/// prefix when `folder` is `None`), following S3 continuation tokens until
/// the listing is exhausted.
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
    // Resolve the S3 key prefix: an explicit folder wins, otherwise fall
    // back to the bucket-wide prefix (may be `None` => list whole bucket).
    let folder_name = folder
        .map(|p| self.relative_path_to_s3_object(p))
        .or_else(|| self.prefix_in_bucket.clone());

    // AWS may need to break the response into several parts
    let mut continuation_token = None;
    let mut all_files = vec![];
    loop {
        // Re-acquire the semaphore for every page so one long listing does
        // not starve other S3 operations.
        let _guard = self
            .concurrency_limiter
            .acquire()
            .await
            .context("Concurrency limiter semaphore got closed during S3 list_files")?;
        metrics::inc_list_objects();
        let response = self
            .client
            .list_objects_v2()
            .bucket(self.bucket_name.clone())
            .set_prefix(folder_name.clone())
            .set_continuation_token(continuation_token)
            .set_max_keys(self.max_keys_per_list_response)
            .send()
            .await
            .map_err(|e| {
                // Count the failure before propagating the error.
                metrics::inc_list_objects_fail();
                e
            })
            .context("Failed to list files in S3 bucket")?;

        for object in response.contents().unwrap_or_default() {
            let object_path = object.key().expect("response does not contain a key");
            let remote_path = self.s3_object_to_relative_path(object_path);
            all_files.push(remote_path);
        }
        // A continuation token in the response means more pages follow.
        match response.next_continuation_token {
            Some(new_token) => continuation_token = Some(new_token),
            None => break,
        }
    }
    Ok(all_files)
}
async fn upload( async fn upload(
&self, &self,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static, from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -431,12 +376,10 @@ impl RemoteStorage for S3Bucket {
} }
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> { async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
// if prefix is not none then download file `prefix/from`
// if prefix is none then download file `from`
self.download_object(GetObjectRequest { self.download_object(GetObjectRequest {
bucket: self.bucket_name.clone(), bucket: self.bucket_name.clone(),
key: self.relative_path_to_s3_object(from), key: self.relative_path_to_s3_object(from),
range: None, ..GetObjectRequest::default()
}) })
.await .await
} }
@@ -462,50 +405,6 @@ impl RemoteStorage for S3Bucket {
}) })
.await .await
} }
/// Deletes the given paths with the bulk S3 `DeleteObjects` API, splitting
/// the request into chunks of `MAX_DELETE_OBJECTS_REQUEST_SIZE` keys.
///
/// Returns an error as soon as one chunk fails; later chunks are then not
/// attempted, so the deletion may be partial on failure.
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
    // NOTE(review): the semaphore permit is held across all chunks, unlike
    // `list_files` which re-acquires per page — presumably intentional for
    // a single logical delete; confirm if large batches cause starvation.
    let _guard = self
        .concurrency_limiter
        .acquire()
        .await
        .context("Concurrency limiter semaphore got closed during S3 delete")?;
    let mut delete_objects = Vec::with_capacity(paths.len());
    for path in paths {
        let obj_id = ObjectIdentifier::builder()
            .set_key(Some(self.relative_path_to_s3_object(path)))
            .build();
        delete_objects.push(obj_id);
    }
    for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
        metrics::inc_delete_objects(chunk.len() as u64);
        let resp = self
            .client
            .delete_objects()
            .bucket(self.bucket_name.clone())
            .delete(Delete::builder().set_objects(Some(chunk.to_vec())).build())
            .send()
            .await;
        match resp {
            Ok(resp) => {
                // The request can succeed overall while individual keys
                // fail; those per-key failures arrive in `errors`.
                if let Some(errors) = resp.errors {
                    metrics::inc_delete_objects_fail(errors.len() as u64);
                    return Err(anyhow::format_err!(
                        "Failed to delete {} objects",
                        errors.len()
                    ));
                }
            }
            Err(e) => {
                // Whole-request failure: count the entire chunk as failed.
                metrics::inc_delete_objects_fail(chunk.len() as u64);
                return Err(e.into());
            }
        }
    }
    Ok(())
}
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
let _guard = self let _guard = self
@@ -529,63 +428,3 @@ impl RemoteStorage for S3Bucket {
Ok(()) Ok(())
} }
} }
#[cfg(test)]
mod tests {
    use std::num::NonZeroUsize;
    use std::path::Path;

    use crate::{RemotePath, S3Bucket, S3Config};

    /// Verifies that `relative_path_to_s3_object` combines the configured
    /// bucket prefix with a remote path into the expected S3 object key,
    /// for every spelling variant of the prefix.
    #[test]
    fn relative_path() {
        let test_paths: Vec<RemotePath> = ["", "some/path", "some/path/"]
            .iter()
            .map(|raw| RemotePath::new(Path::new(raw)).expect("bad path"))
            .collect();

        // Each case pairs a configured prefix with the expected key for
        // every entry of `test_paths`, in order.
        let cases: Vec<(Option<&str>, Vec<&str>)> = vec![
            (None, vec!["", "some/path", "some/path"]),
            (Some(""), vec!["/", "/some/path", "/some/path"]),
            (
                Some("test/prefix"),
                vec![
                    "test/prefix/",
                    "test/prefix/some/path",
                    "test/prefix/some/path",
                ],
            ),
            (
                Some("test/prefix/"),
                vec![
                    "test/prefix/",
                    "test/prefix/some/path",
                    "test/prefix/some/path",
                ],
            ),
            (
                Some("/test/prefix/"),
                vec![
                    "test/prefix/",
                    "test/prefix/some/path",
                    "test/prefix/some/path",
                ],
            ),
        ];

        for (prefix, expected_keys) in cases {
            let config = S3Config {
                bucket_name: "bucket".to_owned(),
                bucket_region: "region".to_owned(),
                prefix_in_bucket: prefix.map(str::to_string),
                endpoint: None,
                concurrency_limit: NonZeroUsize::new(100).unwrap(),
                max_keys_per_list_response: Some(5),
            };
            let storage = S3Bucket::new(&config).expect("remote storage init");
            for (test_path, expected) in test_paths.iter().zip(expected_keys) {
                assert_eq!(storage.relative_path_to_s3_object(test_path), expected);
            }
        }
    }
}

View File

@@ -24,7 +24,6 @@ enum RemoteOp {
Upload(RemotePath), Upload(RemotePath),
Download(RemotePath), Download(RemotePath),
Delete(RemotePath), Delete(RemotePath),
DeleteObjects(Vec<RemotePath>),
} }
impl UnreliableWrapper { impl UnreliableWrapper {
@@ -83,11 +82,6 @@ impl RemoteStorage for UnreliableWrapper {
self.inner.list_prefixes(prefix).await self.inner.list_prefixes(prefix).await
} }
/// Records one failure-injection attempt, then delegates to the wrapped storage.
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
    // NOTE(review): this reuses the `ListPrefixes` op kind for accounting
    // even though the operation is `list_files` — presumably there is no
    // dedicated variant; confirm against the `RemoteOp` enum.
    self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?;
    self.inner.list_files(folder).await
}
async fn upload( async fn upload(
&self, &self,
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -125,21 +119,4 @@ impl RemoteStorage for UnreliableWrapper {
self.attempt(RemoteOp::Delete(path.clone()))?; self.attempt(RemoteOp::Delete(path.clone()))?;
self.inner.delete(path).await self.inner.delete(path).await
} }
/// Records one failure-injection attempt for the whole batch, then deletes
/// the paths one by one via `delete` (which injects its own per-path
/// failures), collecting failures instead of stopping at the first one.
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
    self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
    let mut failed = 0usize;
    for path in paths {
        if self.delete(path).await.is_err() {
            failed += 1;
        }
    }
    // Report the aggregate count; individual errors were already consumed.
    anyhow::ensure!(failed == 0, "failed to delete {} objects", failed);
    Ok(())
}
} }

View File

@@ -0,0 +1,274 @@
use std::collections::HashSet;
use std::env;
use std::num::{NonZeroU32, NonZeroUsize};
use std::ops::ControlFlow;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::UNIX_EPOCH;
use anyhow::Context;
use remote_storage::{
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
};
use test_context::{test_context, AsyncTestContext};
use tokio::task::JoinSet;
use tracing::{debug, error, info};
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
/// Tests that S3 client can list all prefixes, even if the responses come paginated and require multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
/// See the client creation in [`create_s3_client`] for details on the required env vars.
/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
///
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
/// where
/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
///
/// Then, verifies that the client does return correct prefixes when queried:
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
///
/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
/// since current default AWS S3 pagination limit is 1000.
/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
///
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledS3::Enabled(ctx) => ctx,
        MaybeEnabledS3::Disabled => return Ok(()),
        MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
    };
    let test_client = Arc::clone(&ctx.client_with_excessive_pagination);
    let expected_remote_prefixes = ctx.remote_prefixes.clone();

    let base_prefix =
        RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?;
    // A root-level listing must show exactly the one common base prefix.
    let root_remote_prefixes = test_client
        .list_prefixes(None)
        .await
        .context("client list root prefixes failure")?
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
    );

    // Listing under the base prefix must return every uploaded sub-prefix,
    // proving pagination returned all pages.
    let nested_remote_prefixes = test_client
        .list_prefixes(Some(&base_prefix))
        .await
        .context("client list nested prefixes failure")?
        .into_iter()
        .collect::<HashSet<_>>();
    // Compute the symmetric difference so the failure message shows both
    // extras and omissions.
    let remote_only_prefixes = nested_remote_prefixes
        .difference(&expected_remote_prefixes)
        .collect::<HashSet<_>>();
    let missing_uploaded_prefixes = expected_remote_prefixes
        .difference(&nested_remote_prefixes)
        .collect::<HashSet<_>>();
    assert_eq!(
        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
    );
    Ok(())
}
/// Test context: either a configured real-S3 client with pre-uploaded blobs,
/// a skip marker when the env var is unset, or a failed-upload state that the
/// test reports while teardown still cleans up whatever did get uploaded.
enum MaybeEnabledS3 {
    Enabled(S3WithTestBlobs),
    Disabled,
    UploadsFailed(anyhow::Error, S3WithTestBlobs),
}
/// Real-S3 test fixture: a client with an artificially small `max-keys`
/// setting plus the set of prefixes/blobs that were uploaded for it.
struct S3WithTestBlobs {
    // Client configured with a small page size to force pagination.
    client_with_excessive_pagination: Arc<GenericRemoteStorage>,
    // Common prefix all test uploads live under.
    base_prefix_str: &'static str,
    // Prefixes expected back from `list_prefixes`.
    remote_prefixes: HashSet<RemotePath>,
    // Individual blob keys, used for teardown cleanup.
    remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledS3 {
    /// Builds the fixture: initializes logging, skips when real-S3 testing
    /// is not enabled, otherwise creates a client and uploads the test blobs.
    async fn setup() -> Self {
        utils::logging::init(
            utils::logging::LogFormat::Test,
            utils::logging::TracingErrorLayerEnablement::Disabled,
        )
        .expect("logging init failed");

        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
            info!(
                "`{}` env variable is not set, skipping the test",
                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
            );
            return Self::Disabled;
        }

        // Upload strictly more blobs than one listing page holds, so the
        // pagination path is guaranteed to be exercised.
        let max_keys_in_list_response = 10;
        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());

        let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response)
            .context("S3 client creation")
            .expect("S3 client creation failed");

        let base_prefix_str = "test/";
        match upload_s3_data(
            &client_with_excessive_pagination,
            base_prefix_str,
            upload_tasks_count,
        )
        .await
        {
            ControlFlow::Continue(uploads) => {
                info!("Remote objects created successfully");

                Self::Enabled(S3WithTestBlobs {
                    client_with_excessive_pagination,
                    base_prefix_str,
                    remote_prefixes: uploads.prefixes,
                    remote_blobs: uploads.blobs,
                })
            }
            // Partial uploads are preserved so teardown can still remove them.
            ControlFlow::Break(uploads) => Self::UploadsFailed(
                anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
                S3WithTestBlobs {
                    client_with_excessive_pagination,
                    base_prefix_str,
                    remote_prefixes: uploads.prefixes,
                    remote_blobs: uploads.blobs,
                },
            ),
        }
    }

    /// Removes the uploaded blobs, even when setup only partially succeeded.
    async fn teardown(self) {
        match self {
            Self::Disabled => {}
            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
                cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await;
            }
        }
    }
}
/// Builds a [`GenericRemoteStorage`] backed by real S3 for the tests.
///
/// Reads the target bucket and region from the `REMOTE_STORAGE_S3_BUCKET`
/// and `REMOTE_STORAGE_S3_REGION` env vars, limits listing pages to
/// `max_keys_per_list_response` keys, and scopes every upload under a
/// timestamp-derived prefix so separate test runs do not see each other's
/// objects.
fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<GenericRemoteStorage>> {
    let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
        .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
    let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
        .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
    // Use nanosecond resolution: a millisecond timestamp can collide when
    // two test runs start within the same millisecond, making them interfere
    // through a shared prefix.
    let random_prefix_part = std::time::SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .context("random s3 test prefix part calculation")?
        .as_nanos();
    let remote_storage_config = RemoteStorageConfig {
        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: remote_storage_s3_bucket,
            bucket_region: remote_storage_s3_region,
            prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
            endpoint: None,
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response: Some(max_keys_per_list_response),
        }),
    };
    Ok(Arc::new(
        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
    ))
}
/// What `upload_s3_data` produced: the sub-prefixes and the blob keys under them.
struct Uploads {
    prefixes: HashSet<RemotePath>,
    blobs: HashSet<RemotePath>,
}
/// Concurrently uploads `upload_tasks_count` blobs under
/// `{base_prefix_str}/sub_prefix_{i}/blob_{i}`.
///
/// Returns `Continue` with every uploaded prefix/blob on full success, or
/// `Break` with whatever subset succeeded when at least one upload failed —
/// callers use that subset for cleanup.
async fn upload_s3_data(
    client: &Arc<GenericRemoteStorage>,
    base_prefix_str: &'static str,
    upload_tasks_count: usize,
) -> ControlFlow<Uploads, Uploads> {
    info!("Creating {upload_tasks_count} S3 files");
    let mut upload_tasks = JoinSet::new();
    // 1-based so names read `sub_prefix_1`..`sub_prefix_N`.
    for i in 1..upload_tasks_count + 1 {
        let task_client = Arc::clone(client);
        upload_tasks.spawn(async move {
            let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
            let blob_prefix = RemotePath::new(&prefix)
                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
            let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
            debug!("Creating remote item {i} at path {blob_path:?}");

            let data = format!("remote blob data {i}").into_bytes();
            let data_len = data.len();
            task_client
                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
                .await?;

            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
    }

    // Drain every task; record failures but keep collecting successes so
    // teardown can delete everything that actually made it to S3.
    let mut upload_tasks_failed = false;
    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
    while let Some(task_run_result) = upload_tasks.join_next().await {
        match task_run_result
            .context("task join failed")
            .and_then(|task_result| task_result.context("upload task failed"))
        {
            Ok((upload_prefix, upload_path)) => {
                uploaded_prefixes.insert(upload_prefix);
                uploaded_blobs.insert(upload_path);
            }
            Err(e) => {
                error!("Upload task failed: {e:?}");
                upload_tasks_failed = true;
            }
        }
    }

    let uploads = Uploads {
        prefixes: uploaded_prefixes,
        blobs: uploaded_blobs,
    };
    if upload_tasks_failed {
        ControlFlow::Break(uploads)
    } else {
        ControlFlow::Continue(uploads)
    }
}
async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
info!(
"Removing {} objects from the remote storage during cleanup",
objects_to_delete.len()
);
let mut delete_tasks = JoinSet::new();
for object_to_delete in objects_to_delete {
let task_client = Arc::clone(client);
delete_tasks.spawn(async move {
debug!("Deleting remote item at path {object_to_delete:?}");
task_client
.delete(&object_to_delete)
.await
.with_context(|| format!("{object_to_delete:?} removal"))
});
}
while let Some(task_run_result) = delete_tasks.join_next().await {
match task_run_result {
Ok(task_result) => match task_result {
Ok(()) => {}
Err(e) => error!("Delete task failed: {e:?}"),
},
Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
}
}
}

View File

@@ -1,542 +0,0 @@
use std::collections::HashSet;
use std::env;
use std::num::{NonZeroU32, NonZeroUsize};
use std::ops::ControlFlow;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::UNIX_EPOCH;
use anyhow::Context;
use once_cell::sync::OnceCell;
use remote_storage::{
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
};
use test_context::{test_context, AsyncTestContext};
use tokio::task::JoinSet;
use tracing::{debug, error, info};
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
const BASE_PREFIX: &str = "test";
/// Tests that S3 client can list all prefixes, even if the responses come paginated and require multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
/// See the client creation in [`create_s3_client`] for details on the required env vars.
/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
///
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
/// where
/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
///
/// Then, verifies that the client does return correct prefixes when queried:
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
///
/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
/// since current default AWS S3 pagination limit is 1000.
/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
///
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
#[test_context(MaybeEnabledS3WithTestBlobs)]
#[tokio::test]
async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx,
        MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()),
        MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
    };
    let test_client = Arc::clone(&ctx.enabled.client);
    let expected_remote_prefixes = ctx.remote_prefixes.clone();

    let base_prefix = RemotePath::new(Path::new(ctx.enabled.base_prefix))
        .context("common_prefix construction")?;
    // A root-level listing must show exactly the one common base prefix.
    let root_remote_prefixes = test_client
        .list_prefixes(None)
        .await
        .context("client list root prefixes failure")?
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
    );

    // Listing under the base prefix must return every uploaded sub-prefix,
    // proving all pagination pages were fetched.
    let nested_remote_prefixes = test_client
        .list_prefixes(Some(&base_prefix))
        .await
        .context("client list nested prefixes failure")?
        .into_iter()
        .collect::<HashSet<_>>();
    let remote_only_prefixes = nested_remote_prefixes
        .difference(&expected_remote_prefixes)
        .collect::<HashSet<_>>();
    let missing_uploaded_prefixes = expected_remote_prefixes
        .difference(&nested_remote_prefixes)
        .collect::<HashSet<_>>();
    assert_eq!(
        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
    );
    Ok(())
}
/// Tests that S3 client can list all files in a folder, even if the response comes paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
/// See `s3_pagination_should_work` for more information.
///
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_s3_data`]
/// Then performs the following queries:
///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
///    2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
#[test_context(MaybeEnabledS3WithSimpleTestBlobs)]
#[tokio::test]
async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx,
        MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()),
        MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => {
            anyhow::bail!("S3 init failed: {e:?}")
        }
    };
    let test_client = Arc::clone(&ctx.enabled.client);
    let base_prefix =
        RemotePath::new(Path::new("folder1")).context("common_prefix construction")?;
    // With no prefix, every uploaded blob must be listed.
    let root_files = test_client
        .list_files(None)
        .await
        .context("client list root files failure")?
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
        root_files,
        ctx.remote_blobs.clone(),
        "remote storage list_files on root mismatches with the uploads."
    );
    let nested_remote_files = test_client
        .list_files(Some(&base_prefix))
        .await
        .context("client list nested files failure")?
        .into_iter()
        .collect::<HashSet<_>>();
    // Recompute the expected subset locally from the uploaded blob names.
    // NOTE(review): `starts_with("folder1")` would also match "folder10";
    // the fixture's folder count keeps this unambiguous today — confirm if
    // `upload_tasks_count` ever grows past 69 (i.e. `i / 7 >= 10`).
    let trim_remote_blobs: HashSet<_> = ctx
        .remote_blobs
        .iter()
        .map(|x| x.get_path().to_str().expect("must be valid name"))
        .filter(|x| x.starts_with("folder1"))
        .map(|x| RemotePath::new(Path::new(x)).expect("must be valid name"))
        .collect();
    assert_eq!(
        nested_remote_files, trim_remote_blobs,
        "remote storage list_files on subdirrectory mismatches with the uploads."
    );
    Ok(())
}
/// Deleting a key that was never uploaded must succeed: the test relies on
/// `delete` treating a missing object as a no-op rather than an error.
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledS3::Enabled(ctx) => ctx,
        MaybeEnabledS3::Disabled => return Ok(()),
    };

    let path = RemotePath::new(&PathBuf::from(format!(
        "{}/for_sure_there_is_nothing_there_really",
        ctx.base_prefix,
    )))
    .with_context(|| "RemotePath conversion")?;

    ctx.client.delete(&path).await.expect("should succeed");

    Ok(())
}
/// Uploads three blobs, bulk-deletes two, and checks via `list_prefixes`
/// that exactly one object remains before deleting it as well.
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledS3::Enabled(ctx) => ctx,
        MaybeEnabledS3::Disabled => return Ok(()),
    };

    let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,)))
        .with_context(|| "RemotePath conversion")?;

    let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
        .with_context(|| "RemotePath conversion")?;

    let path3 = RemotePath::new(&PathBuf::from(format!("{}/path3", ctx.base_prefix,)))
        .with_context(|| "RemotePath conversion")?;

    let data1 = "remote blob data1".as_bytes();
    let data1_len = data1.len();
    let data2 = "remote blob data2".as_bytes();
    let data2_len = data2.len();
    let data3 = "remote blob data3".as_bytes();
    let data3_len = data3.len();

    ctx.client
        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
        .await?;

    ctx.client
        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
        .await?;

    ctx.client
        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
        .await?;

    // Bulk delete of two keys in a single request.
    ctx.client.delete_objects(&[path1, path2]).await?;

    let prefixes = ctx.client.list_prefixes(None).await?;

    // Only path3 should survive the bulk delete.
    assert_eq!(prefixes.len(), 1);

    // Clean up the last object too.
    ctx.client.delete_objects(&[path3]).await?;

    Ok(())
}
/// Initializes tracing exactly once per test binary; safe to call from every
/// fixture `setup` because `OnceCell::get_or_init` ignores repeat calls.
fn ensure_logging_ready() {
    LOGGING_DONE.get_or_init(|| {
        utils::logging::init(
            utils::logging::LogFormat::Test,
            utils::logging::TracingErrorLayerEnablement::Disabled,
        )
        .expect("logging init failed");
    });
}
/// A ready-to-use real-S3 client together with the common key prefix the
/// tests operate under.
struct EnabledS3 {
    client: Arc<GenericRemoteStorage>,
    base_prefix: &'static str,
}
impl EnabledS3 {
    /// Creates the S3 client (panicking on misconfiguration, since this runs
    /// inside test setup) with an optional listing page-size limit.
    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
        let client = create_s3_client(max_keys_in_list_response)
            .context("S3 client creation")
            .expect("S3 client creation failed");

        EnabledS3 {
            client,
            base_prefix: BASE_PREFIX,
        }
    }
}
/// Test context that is either a live S3 fixture or a skip marker when real
/// S3 testing is not enabled via the env var.
enum MaybeEnabledS3 {
    Enabled(EnabledS3),
    Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledS3 {
    /// No blobs are pre-uploaded for this fixture; it only provides a client
    /// (with default pagination) or the Disabled skip marker.
    async fn setup() -> Self {
        ensure_logging_ready();

        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
            info!(
                "`{}` env variable is not set, skipping the test",
                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
            );
            return Self::Disabled;
        }

        Self::Enabled(EnabledS3::setup(None).await)
    }
}
/// Fixture for the pagination test: a client plus pre-uploaded nested blobs,
/// a skip marker, or a partial-upload failure that teardown still cleans up.
enum MaybeEnabledS3WithTestBlobs {
    Enabled(S3WithTestBlobs),
    Disabled,
    UploadsFailed(anyhow::Error, S3WithTestBlobs),
}
/// The enabled client plus the prefixes/blobs that were uploaded for it.
struct S3WithTestBlobs {
    enabled: EnabledS3,
    // Prefixes expected back from `list_prefixes`.
    remote_prefixes: HashSet<RemotePath>,
    // Individual blob keys, used for teardown cleanup.
    remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
    /// Uploads more blobs than one listing page holds (with `max-keys` = 10)
    /// so that the pagination path is guaranteed to be exercised.
    async fn setup() -> Self {
        ensure_logging_ready();
        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
            info!(
                "`{}` env variable is not set, skipping the test",
                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
            );
            return Self::Disabled;
        }

        let max_keys_in_list_response = 10;
        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());

        let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;

        match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
            ControlFlow::Continue(uploads) => {
                info!("Remote objects created successfully");

                Self::Enabled(S3WithTestBlobs {
                    enabled,
                    remote_prefixes: uploads.prefixes,
                    remote_blobs: uploads.blobs,
                })
            }
            // Keep the partial upload set so teardown can still delete it.
            ControlFlow::Break(uploads) => Self::UploadsFailed(
                anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
                S3WithTestBlobs {
                    enabled,
                    remote_prefixes: uploads.prefixes,
                    remote_blobs: uploads.blobs,
                },
            ),
        }
    }

    /// Removes the uploaded blobs, even when setup only partially succeeded.
    async fn teardown(self) {
        match self {
            Self::Disabled => {}
            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
                cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
            }
        }
    }
}
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
// whereas the list_files function is concerned with listing files.
// See `RemoteStorage::list_files` documentation for more details
/// Fixture for the list_files test: a client plus flat `folder{j}/blob_{i}`
/// uploads, a skip marker, or a partial-upload failure.
enum MaybeEnabledS3WithSimpleTestBlobs {
    Enabled(S3WithSimpleTestBlobs),
    Disabled,
    UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs),
}
/// The enabled client plus the flat blob keys uploaded for it (no prefix set
/// is tracked here, unlike [`S3WithTestBlobs`]).
struct S3WithSimpleTestBlobs {
    enabled: EnabledS3,
    remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
    /// Same pagination-forcing setup as the prefixes fixture, but uploads a
    /// flat `folder{j}/blob_{i}.txt` layout via `upload_simple_s3_data`.
    async fn setup() -> Self {
        ensure_logging_ready();
        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
            info!(
                "`{}` env variable is not set, skipping the test",
                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
            );
            return Self::Disabled;
        }

        let max_keys_in_list_response = 10;
        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());

        let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;

        match upload_simple_s3_data(&enabled.client, upload_tasks_count).await {
            ControlFlow::Continue(uploads) => {
                info!("Remote objects created successfully");

                Self::Enabled(S3WithSimpleTestBlobs {
                    enabled,
                    remote_blobs: uploads,
                })
            }
            // Keep the partial upload set so teardown can still delete it.
            ControlFlow::Break(uploads) => Self::UploadsFailed(
                anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
                S3WithSimpleTestBlobs {
                    enabled,
                    remote_blobs: uploads,
                },
            ),
        }
    }

    /// Removes the uploaded blobs, even when setup only partially succeeded.
    async fn teardown(self) {
        match self {
            Self::Disabled => {}
            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
                cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
            }
        }
    }
}
/// Builds a [`GenericRemoteStorage`] backed by real S3 from the
/// `REMOTE_STORAGE_S3_BUCKET` / `REMOTE_STORAGE_S3_REGION` env vars, scoping
/// all keys under a nanosecond-timestamp prefix to keep concurrent test runs
/// isolated from one another.
fn create_s3_client(
    max_keys_per_list_response: Option<i32>,
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
    let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
        .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
    let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
        .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
    let random_prefix_part = std::time::SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .context("random s3 test prefix part calculation")?
        .as_nanos();
    let remote_storage_config = RemoteStorageConfig {
        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: remote_storage_s3_bucket,
            bucket_region: remote_storage_s3_region,
            // NOTE(review): the prefix name mentions only the pagination test
            // but is used by every test in this file — presumably historical;
            // consider renaming in a follow-up.
            prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
            endpoint: None,
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response,
        }),
    };
    Ok(Arc::new(
        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
    ))
}
/// What `upload_s3_data` produced: the sub-prefixes and the blob keys under them.
struct Uploads {
    prefixes: HashSet<RemotePath>,
    blobs: HashSet<RemotePath>,
}
/// Concurrently uploads `upload_tasks_count` blobs under
/// `{base_prefix_str}/sub_prefix_{i}/blob_{i}`.
///
/// Returns `Continue` with every uploaded prefix/blob on full success, or
/// `Break` with the successful subset when at least one upload failed —
/// callers use that subset for cleanup.
async fn upload_s3_data(
    client: &Arc<GenericRemoteStorage>,
    base_prefix_str: &'static str,
    upload_tasks_count: usize,
) -> ControlFlow<Uploads, Uploads> {
    info!("Creating {upload_tasks_count} S3 files");
    let mut upload_tasks = JoinSet::new();
    // 1-based so names read `sub_prefix_1`..`sub_prefix_N`.
    for i in 1..upload_tasks_count + 1 {
        let task_client = Arc::clone(client);
        upload_tasks.spawn(async move {
            let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
            let blob_prefix = RemotePath::new(&prefix)
                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
            let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
            debug!("Creating remote item {i} at path {blob_path:?}");

            let data = format!("remote blob data {i}").into_bytes();
            let data_len = data.len();
            task_client
                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
                .await?;

            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
    }

    // Drain every task; record failures but keep collecting successes so
    // teardown can delete everything that actually made it to S3.
    let mut upload_tasks_failed = false;
    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
    while let Some(task_run_result) = upload_tasks.join_next().await {
        match task_run_result
            .context("task join failed")
            .and_then(|task_result| task_result.context("upload task failed"))
        {
            Ok((upload_prefix, upload_path)) => {
                uploaded_prefixes.insert(upload_prefix);
                uploaded_blobs.insert(upload_path);
            }
            Err(e) => {
                error!("Upload task failed: {e:?}");
                upload_tasks_failed = true;
            }
        }
    }

    let uploads = Uploads {
        prefixes: uploaded_prefixes,
        blobs: uploaded_blobs,
    };
    if upload_tasks_failed {
        ControlFlow::Break(uploads)
    } else {
        ControlFlow::Continue(uploads)
    }
}
/// Best-effort removal of every uploaded test object; deletions run
/// concurrently and failures are only logged so teardown never aborts.
async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
    info!(
        "Removing {} objects from the remote storage during cleanup",
        objects_to_delete.len()
    );
    let mut delete_tasks = JoinSet::new();
    for object_to_delete in objects_to_delete {
        let task_client = Arc::clone(client);
        delete_tasks.spawn(async move {
            debug!("Deleting remote item at path {object_to_delete:?}");
            task_client
                .delete(&object_to_delete)
                .await
                .with_context(|| format!("{object_to_delete:?} removal"))
        });
    }

    while let Some(task_run_result) = delete_tasks.join_next().await {
        match task_run_result {
            Ok(task_result) => match task_result {
                Ok(()) => {}
                Err(e) => error!("Delete task failed: {e:?}"),
            },
            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
        }
    }
}
// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
/// Uploads `upload_tasks_count` blobs concurrently, one task per blob.
///
/// Blob `i` (1-based) is uploaded as `folder{i / 7}/blob_{i}.txt`, so blobs are
/// grouped into folders — presumably so that prefixed listing gets exercised
/// (TODO confirm against the calling test).
///
/// Returns the set of paths that were successfully uploaded:
/// * `ControlFlow::Continue(paths)` if every upload succeeded;
/// * `ControlFlow::Break(paths)` if at least one task failed — the successful
///   paths are still returned so the caller can clean them up.
async fn upload_simple_s3_data(
    client: &Arc<GenericRemoteStorage>,
    upload_tasks_count: usize,
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
    info!("Creating {upload_tasks_count} S3 files");
    let mut upload_tasks = JoinSet::new();
    // Inclusive range: blob indices are 1-based.
    for i in 1..=upload_tasks_count {
        let task_client = Arc::clone(client);
        upload_tasks.spawn(async move {
            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
            let blob_path = RemotePath::new(&blob_path)
                .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
            debug!("Creating remote item {i} at path {blob_path:?}");
            let data = format!("remote blob data {i}").into_bytes();
            let data_len = data.len();
            task_client
                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
                .await?;
            Ok::<_, anyhow::Error>(blob_path)
        });
    }
    let mut upload_tasks_failed = false;
    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
    while let Some(task_run_result) = upload_tasks.join_next().await {
        // Both a panicked/cancelled task and a task returning Err count as failures.
        match task_run_result
            .context("task join failed")
            .and_then(|task_result| task_result.context("upload task failed"))
        {
            Ok(upload_path) => {
                uploaded_blobs.insert(upload_path);
            }
            Err(e) => {
                error!("Upload task failed: {e:?}");
                upload_tasks_failed = true;
            }
        }
    }
    if upload_tasks_failed {
        ControlFlow::Break(uploaded_blobs)
    } else {
        ControlFlow::Continue(uploaded_blobs)
    }
}

View File

@@ -21,7 +21,7 @@ use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
// 2. D+C+a+b // 2. D+C+a+b
// 3. D+A+B // 3. D+A+B
/// `Segment` which has had its size calculated. /// [`Segment`] which has had it's size calculated.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
struct SegmentSize { struct SegmentSize {
method: SegmentMethod, method: SegmentMethod,

View File

@@ -33,7 +33,7 @@ pub enum OtelName<'a> {
/// directly into HTTP servers. However, I couldn't find one for Hyper, /// directly into HTTP servers. However, I couldn't find one for Hyper,
/// so I had to write our own. OpenTelemetry website has a registry of /// so I had to write our own. OpenTelemetry website has a registry of
/// instrumentation libraries at: /// instrumentation libraries at:
/// <https://opentelemetry.io/registry/?language=rust&component=instrumentation> /// https://opentelemetry.io/registry/?language=rust&component=instrumentation
/// If a Hyper crate appears, consider switching to that. /// If a Hyper crate appears, consider switching to that.
pub async fn tracing_handler<F, R>( pub async fn tracing_handler<F, R>(
req: Request<Body>, req: Request<Body>,

View File

@@ -5,6 +5,7 @@ edition.workspace = true
license.workspace = true license.workspace = true
[dependencies] [dependencies]
atty.workspace = true
sentry.workspace = true sentry.workspace = true
async-trait.workspace = true async-trait.workspace = true
anyhow.workspace = true anyhow.workspace = true
@@ -40,12 +41,6 @@ pq_proto.workspace = true
metrics.workspace = true metrics.workspace = true
workspace_hack.workspace = true workspace_hack.workspace = true
const_format.workspace = true
# to use tokio channels as streams, this is faster to compile than async_stream
# why is it only here? no other crate should use it, streams are rarely needed.
tokio-stream = { version = "0.1.14" }
[dev-dependencies] [dev-dependencies]
byteorder.workspace = true byteorder.workspace = true
bytes.workspace = true bytes.workspace = true

View File

@@ -16,7 +16,7 @@ use crate::id::TenantId;
/// Algorithm to use. We require EdDSA. /// Algorithm to use. We require EdDSA.
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)] #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")] #[serde(rename_all = "lowercase")]
pub enum Scope { pub enum Scope {
// Provides access to all data for a specific tenant (specified in `struct Claims` below) // Provides access to all data for a specific tenant (specified in `struct Claims` below)

View File

@@ -1,49 +0,0 @@
use std::sync::Arc;
use tokio::sync::{mpsc, Mutex};
/// While a reference is kept around, the associated [`Barrier::wait`] will wait.
///
/// Can be cloned, moved and kept around in futures as "guard objects".
#[derive(Clone)]
pub struct Completion(mpsc::Sender<()>);
/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
impl Default for Barrier {
fn default() -> Self {
let (_, rx) = channel();
rx
}
}
impl Barrier {
pub async fn wait(self) {
self.0.lock().await.recv().await;
}
pub async fn maybe_wait(barrier: Option<Barrier>) {
if let Some(b) = barrier {
b.wait().await
}
}
}
impl PartialEq for Barrier {
fn eq(&self, other: &Self) -> bool {
// we don't use dyn so this is good
Arc::ptr_eq(&self.0, &other.0)
}
}
impl Eq for Barrier {}
/// Create new Guard and Barrier pair.
pub fn channel() -> (Completion, Barrier) {
let (tx, rx) = mpsc::channel::<()>(1);
let rx = Mutex::new(rx);
let rx = Arc::new(rx);
(Completion(tx), Barrier(rx))
}

View File

@@ -1,111 +0,0 @@
/// Create a reporter for an error that outputs similar to [`anyhow::Error`] with Display with alternative setting.
///
/// It can be used with `anyhow::Error` as well.
///
/// Why would one use this instead of converting to `anyhow::Error` on the spot? Because
/// anyhow::Error would also capture a stacktrace on the spot, which you would later discard after
/// formatting.
///
/// ## Usage
///
/// ```rust
/// #[derive(Debug, thiserror::Error)]
/// enum MyCoolError {
/// #[error("should never happen")]
/// Bad(#[source] std::io::Error),
/// }
///
/// # fn failing_call() -> Result<(), MyCoolError> { Err(MyCoolError::Bad(std::io::ErrorKind::PermissionDenied.into())) }
///
/// # fn main() {
/// use utils::error::report_compact_sources;
///
/// if let Err(e) = failing_call() {
/// let e = report_compact_sources(&e);
/// assert_eq!(format!("{e}"), "should never happen: permission denied");
/// }
/// # }
/// ```
///
/// ## TODO
///
/// When we are able to describe return position impl trait in traits, this should of course be an
/// extension trait. Until then avoid boxing with this more awkward interface.
pub fn report_compact_sources<E: std::error::Error>(e: &E) -> impl std::fmt::Display + '_ {
struct AnyhowDisplayAlternateAlike<'a, E>(&'a E);
impl<E: std::error::Error> std::fmt::Display for AnyhowDisplayAlternateAlike<'_, E> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)?;
// why is E a generic parameter here? hope that rustc will see through a default
// Error::source implementation and leave the following out if there cannot be any
// sources:
Sources(self.0.source()).try_for_each(|src| write!(f, ": {}", src))
}
}
struct Sources<'a>(Option<&'a (dyn std::error::Error + 'static)>);
impl<'a> Iterator for Sources<'a> {
type Item = &'a (dyn std::error::Error + 'static);
fn next(&mut self) -> Option<Self::Item> {
let rem = self.0;
let next = self.0.and_then(|x| x.source());
self.0 = next;
rem
}
}
AnyhowDisplayAlternateAlike(e)
}
#[cfg(test)]
mod tests {
use super::report_compact_sources;
#[test]
fn report_compact_sources_examples() {
use std::fmt::Write;
#[derive(Debug, thiserror::Error)]
enum EvictionError {
#[error("cannot evict a remote layer")]
CannotEvictRemoteLayer,
#[error("stat failed")]
StatFailed(#[source] std::io::Error),
#[error("layer was no longer part of LayerMap")]
LayerNotFound(#[source] anyhow::Error),
}
let examples = [
(
line!(),
EvictionError::CannotEvictRemoteLayer,
"cannot evict a remote layer",
),
(
line!(),
EvictionError::StatFailed(std::io::ErrorKind::PermissionDenied.into()),
"stat failed: permission denied",
),
(
line!(),
EvictionError::LayerNotFound(anyhow::anyhow!("foobar")),
"layer was no longer part of LayerMap: foobar",
),
];
let mut s = String::new();
for (line, example, expected) in examples {
s.clear();
write!(s, "{}", report_compact_sources(&example)).expect("string grows");
assert_eq!(s, expected, "example on line {line}");
}
}
}

View File

@@ -1,8 +1,6 @@
/// Extensions to `std::fs` types. /// Extensions to `std::fs` types.
use std::{fs, io, path::Path}; use std::{fs, io, path::Path};
use anyhow::Context;
pub trait PathExt { pub trait PathExt {
/// Returns an error if `self` is not a directory. /// Returns an error if `self` is not a directory.
fn is_empty_dir(&self) -> io::Result<bool>; fn is_empty_dir(&self) -> io::Result<bool>;
@@ -17,36 +15,10 @@ where
} }
} }
pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool> {
let mut dir = tokio::fs::read_dir(&path)
.await
.context(format!("read_dir({})", path.as_ref().display()))?;
Ok(dir.next_entry().await?.is_none())
}
pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
if e.kind() == io::ErrorKind::NotFound {
Ok(())
} else {
Err(e)
}
}
pub fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
where
F: Fn() -> io::Result<()>,
{
fs_operation().or_else(ignore_not_found)
}
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use std::path::PathBuf; use std::path::PathBuf;
use crate::fs_ext::is_directory_empty;
use super::ignore_absent_files;
#[test] #[test]
fn is_empty_dir() { fn is_empty_dir() {
use super::PathExt; use super::PathExt;
@@ -70,43 +42,4 @@ mod test {
std::fs::remove_file(&file_path).unwrap(); std::fs::remove_file(&file_path).unwrap();
assert!(file_path.is_empty_dir().is_err()); assert!(file_path.is_empty_dir().is_err());
} }
#[tokio::test]
async fn is_empty_dir_async() {
let dir = tempfile::tempdir().unwrap();
let dir_path = dir.path();
// test positive case
assert!(
is_directory_empty(dir_path).await.expect("test failure"),
"new tempdir should be empty"
);
// invoke on a file to ensure it returns an error
let file_path: PathBuf = dir_path.join("testfile");
let f = std::fs::File::create(&file_path).unwrap();
drop(f);
assert!(is_directory_empty(&file_path).await.is_err());
// do it again on a path, we know to be nonexistent
std::fs::remove_file(&file_path).unwrap();
assert!(is_directory_empty(file_path).await.is_err());
}
#[test]
fn ignore_absent_files_works() {
let dir = tempfile::tempdir().unwrap();
let dir_path = dir.path();
let file_path: PathBuf = dir_path.join("testfile");
ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
let f = std::fs::File::create(&file_path).unwrap();
drop(f);
ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
assert!(!file_path.exists());
}
} }

View File

@@ -1,19 +1,23 @@
use crate::auth::{Claims, JwtAuth}; use crate::auth::{Claims, JwtAuth};
use crate::http::error::{api_error_handler, route_error_handler, ApiError}; use crate::http::error;
use anyhow::Context; use anyhow::{anyhow, Context};
use hyper::header::{HeaderName, AUTHORIZATION}; use hyper::header::{HeaderName, AUTHORIZATION};
use hyper::http::HeaderValue; use hyper::http::HeaderValue;
use hyper::Method; use hyper::Method;
use hyper::{header::CONTENT_TYPE, Body, Request, Response}; use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use routerify::ext::RequestExt; use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
use tokio::task::JoinError;
use tracing::{self, debug, info, info_span, warn, Instrument}; use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future; use std::future::Future;
use std::net::TcpListener;
use std::str::FromStr; use std::str::FromStr;
use super::error::ApiError;
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| { static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!( register_int_counter!(
"libmetrics_metric_handler_requests_total", "libmetrics_metric_handler_requests_total",
@@ -31,18 +35,8 @@ struct RequestId(String);
/// Adds a tracing info_span! instrumentation around the handler events, /// Adds a tracing info_span! instrumentation around the handler events,
/// logs the request start and end events for non-GET requests and non-200 responses. /// logs the request start and end events for non-GET requests and non-200 responses.
/// ///
/// Usage: Replace `my_handler` with `|r| request_span(r, my_handler)`
///
/// Use this to distinguish between logs of different HTTP requests: every request handler wrapped /// Use this to distinguish between logs of different HTTP requests: every request handler wrapped
/// with this will get request info logged in the wrapping span, including the unique request ID. /// in this type will get request info logged in the wrapping span, including the unique request ID.
///
/// This also handles errors, logging them and converting them to an HTTP error response.
///
/// NB: If the client disconnects, Hyper will drop the Future, without polling it to
/// completion. In other words, the handler must be async cancellation safe! request_span
/// prints a warning to the log when that happens, so that you have some trace of it in
/// the log.
///
/// ///
/// There could be other ways to implement similar functionality: /// There could be other ways to implement similar functionality:
/// ///
@@ -60,56 +54,60 @@ struct RequestId(String);
/// tries to achive with its `.instrument` used in the current approach. /// tries to achive with its `.instrument` used in the current approach.
/// ///
/// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced. /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output pub struct RequestSpan<E, R, H>(pub H)
where where
R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static, E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
H: FnOnce(Request<Body>) -> R + Send + Sync + 'static, R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
H: Fn(Request<Body>) -> R + Send + Sync + 'static;
impl<E, R, H> RequestSpan<E, R, H>
where
E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
H: Fn(Request<Body>) -> R + Send + Sync + 'static,
{ {
let request_id = request.context::<RequestId>().unwrap_or_default().0; /// Creates a tracing span around inner request handler and executes the request handler in the contex of that span.
let method = request.method(); /// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled.
let path = request.uri().path(); pub async fn handle(self, request: Request<Body>) -> Result<Response<Body>, E> {
let request_span = info_span!("request", %method, %path, %request_id); let request_id = request.context::<RequestId>().unwrap_or_default().0;
let method = request.method();
let path = request.uri().path();
let request_span = info_span!("request", %method, %path, %request_id);
let log_quietly = method == Method::GET; let log_quietly = method == Method::GET;
async move { async move {
let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding(); let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
if log_quietly { if log_quietly {
debug!("Handling request"); debug!("Handling request");
} else { } else {
info!("Handling request"); info!("Handling request");
} }
// No special handling for panics here. There's a `tracing_panic_hook` from another // Note that we reuse `error::handler` here and not returning and error at all,
// module to do that globally. // yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation.
let res = handler(request).await; // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call.
//
cancellation_guard.disarm(); // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
let res = (self.0)(request).await;
// Log the result if needed.
// cancellation_guard.disarm();
// We also convert any errors into an Ok response with HTTP error code here.
// `make_router` sets a last-resort error handler that would do the same, but match res {
// we prefer to do it here, before we exit the request span, so that the error Ok(response) => {
// is still logged with the span. let response_status = response.status();
// if log_quietly && response_status.is_success() {
// (Because we convert errors to Ok response, we never actually return an error, debug!("Request handled, status: {response_status}");
// and we could declare the function to return the never type (`!`). However, } else {
// using `routerify::RouterBuilder` requires a proper error type.) info!("Request handled, status: {response_status}");
match res { }
Ok(response) => { Ok(response)
let response_status = response.status(); }
if log_quietly && response_status.is_success() { Err(e) => Ok(error::handler(e.into()).await),
debug!("Request handled, status: {response_status}");
} else {
info!("Request handled, status: {response_status}");
}
Ok(response)
} }
Err(err) => Ok(api_error_handler(err)),
} }
.instrument(request_span)
.await
} }
.instrument(request_span)
.await
} }
/// Drop guard to WARN in case the request was dropped before completion. /// Drop guard to WARN in case the request was dropped before completion.
@@ -147,140 +145,26 @@ impl Drop for RequestCancelled {
} }
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> { async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
use bytes::{Bytes, BytesMut};
use std::io::Write as _;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
SERVE_METRICS_COUNT.inc(); SERVE_METRICS_COUNT.inc();
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks. let mut buffer = vec![];
struct ChannelWriter {
buffer: BytesMut,
tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
}
impl ChannelWriter {
fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
assert_ne!(buf_len, 0);
ChannelWriter {
// split about half off the buffer from the start, because we flush depending on
// capacity. first flush will come sooner than without this, but now resizes will
// have better chance of picking up the "other" half. not guaranteed of course.
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
}
}
fn flush0(&mut self) -> std::io::Result<usize> {
let n = self.buffer.len();
if n == 0 {
return Ok(0);
}
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();
// not ideal to call from blocking code to block_on, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
// throttle sending to allow reuse of our buffer in `write`.
self.tx.reserve().await.map_err(|_| ())?;
// now the response task has picked up the buffer and hopefully started
// sending it to the client.
Ok(())
});
if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
self.written += n;
Ok(n)
}
fn flushed_bytes(&self) -> usize {
self.written
}
}
impl std::io::Write for ChannelWriter {
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
let remaining = self.buffer.capacity() - self.buffer.len();
let out_of_space = remaining < buf.len();
let original_len = buf.len();
if out_of_space {
let can_still_fit = buf.len() - remaining;
self.buffer.extend_from_slice(&buf[..can_still_fit]);
buf = &buf[can_still_fit..];
self.flush0()?;
}
// assume that this will often under normal operation just move the pointer back to the
// beginning of allocation, because previous split off parts are already sent and
// dropped.
self.buffer.extend_from_slice(buf);
Ok(original_len)
}
fn flush(&mut self) -> std::io::Result<()> {
self.flush0().map(|_| ())
}
}
let started_at = std::time::Instant::now();
let (tx, rx) = mpsc::channel(1);
let body = Body::wrap_stream(ReceiverStream::new(rx));
let mut writer = ChannelWriter::new(128 * 1024, tx);
let encoder = TextEncoder::new(); let encoder = TextEncoder::new();
let metrics = tokio::task::spawn_blocking(move || {
// Currently we take a lot of mutexes while collecting metrics, so it's
// better to spawn a blocking task to avoid blocking the event loop.
metrics::gather()
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
encoder.encode(&metrics, &mut buffer).unwrap();
let response = Response::builder() let response = Response::builder()
.status(200) .status(200)
.header(CONTENT_TYPE, encoder.format_type()) .header(CONTENT_TYPE, encoder.format_type())
.body(body) .body(Body::from(buffer))
.unwrap(); .unwrap();
let span = info_span!("blocking");
tokio::task::spawn_blocking(move || {
let _span = span.entered();
let metrics = metrics::gather();
let res = encoder
.encode(&metrics, &mut writer)
.and_then(|_| writer.flush().map_err(|e| e.into()));
match res {
Ok(()) => {
tracing::info!(
bytes = writer.flushed_bytes(),
elapsed_ms = started_at.elapsed().as_millis(),
"responded /metrics"
);
}
Err(e) => {
tracing::warn!("failed to write out /metrics response: {e:#}");
// semantics of this error are quite... unclear. we want to error the stream out to
// abort the response to somehow notify the client that we failed.
//
// though, most likely the reason for failure is that the receiver is already gone.
drop(
writer
.tx
.blocking_send(Err(std::io::ErrorKind::BrokenPipe.into())),
);
}
}
});
Ok(response) Ok(response)
} }
@@ -323,8 +207,10 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
.middleware(Middleware::post_with_info( .middleware(Middleware::post_with_info(
add_request_id_header_to_response, add_request_id_header_to_response,
)) ))
.get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/metrics", |r| {
.err_handler(route_error_handler) RequestSpan(prometheus_metrics_handler).handle(r)
})
.err_handler(error::handler)
} }
pub fn attach_openapi_ui( pub fn attach_openapi_ui(
@@ -334,14 +220,12 @@ pub fn attach_openapi_ui(
ui_mount_path: &'static str, ui_mount_path: &'static str,
) -> RouterBuilder<hyper::Body, ApiError> { ) -> RouterBuilder<hyper::Body, ApiError> {
router_builder router_builder
.get(spec_mount_path, .get(spec_mount_path, move |r| {
move |r| request_span(r, move |_| async move { RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) })
Ok(Response::builder().body(Body::from(spec)).unwrap()) .handle(r)
}) })
) .get(ui_mount_path, move |r| RequestSpan( move |_| async move {
.get(ui_mount_path, Ok(Response::builder().body(Body::from(format!(r#"
move |r| request_span(r, move |_| async move {
Ok(Response::builder().body(Body::from(format!(r#"
<!DOCTYPE html> <!DOCTYPE html>
<html lang="en"> <html lang="en">
<head> <head>
@@ -371,8 +255,7 @@ pub fn attach_openapi_ui(
</body> </body>
</html> </html>
"#, spec_mount_path))).unwrap()) "#, spec_mount_path))).unwrap())
}) }).handle(r))
)
} }
fn parse_token(header_value: &str) -> Result<&str, ApiError> { fn parse_token(header_value: &str) -> Result<&str, ApiError> {
@@ -460,6 +343,40 @@ pub fn check_permission_with(
} }
} }
///
/// Start listening for HTTP requests on given socket.
///
/// 'shutdown_future' can be used to stop. If the Future becomes
/// ready, we stop listening for new requests, and the function returns.
///
pub fn serve_thread_main<S>(
router_builder: RouterBuilder<hyper::Body, ApiError>,
listener: TcpListener,
shutdown_future: S,
) -> anyhow::Result<()>
where
S: Future<Output = ()> + Send + Sync,
{
info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
// Create a Service from the router above to handle incoming requests.
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
// Enter a single-threaded tokio runtime bound to the current thread
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let _guard = runtime.enter();
let server = Server::from_tcp(listener)?
.serve(service)
.with_graceful_shutdown(shutdown_future);
runtime.block_on(server)?;
Ok(())
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;

View File

@@ -1,6 +1,5 @@
use hyper::{header, Body, Response, StatusCode}; use hyper::{header, Body, Response, StatusCode};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::error::Error as StdError;
use thiserror::Error; use thiserror::Error;
use tracing::error; use tracing::error;
@@ -16,13 +15,13 @@ pub enum ApiError {
Unauthorized(String), Unauthorized(String),
#[error("NotFound: {0}")] #[error("NotFound: {0}")]
NotFound(Box<dyn StdError + Send + Sync + 'static>), NotFound(anyhow::Error),
#[error("Conflict: {0}")] #[error("Conflict: {0}")]
Conflict(String), Conflict(String),
#[error("Precondition failed: {0}")] #[error("Precondition failed: {0}")]
PreconditionFailed(Box<str>), PreconditionFailed(&'static str),
#[error(transparent)] #[error(transparent)]
InternalServerError(anyhow::Error), InternalServerError(anyhow::Error),
@@ -84,24 +83,13 @@ impl HttpErrorBody {
} }
} }
pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> { pub async fn handler(err: routerify::RouteError) -> Response<Body> {
match err.downcast::<ApiError>() { let api_error = err
Ok(api_error) => api_error_handler(*api_error), .downcast::<ApiError>()
Err(other_error) => { .expect("handler should always return api error");
// We expect all the request handlers to return an ApiError, so this should
// not be reached. But just in case.
error!("Error processing HTTP request: {other_error:?}");
HttpErrorBody::response_from_msg_and_status(
other_error.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
)
}
}
}
pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
// Print a stack trace for Internal Server errors // Print a stack trace for Internal Server errors
if let ApiError::InternalServerError(_) = api_error { if let ApiError::InternalServerError(_) = api_error.as_ref() {
error!("Error processing HTTP request: {api_error:?}"); error!("Error processing HTTP request: {api_error:?}");
} else { } else {
error!("Error processing HTTP request: {api_error:#}"); error!("Error processing HTTP request: {api_error:#}");

View File

@@ -8,26 +8,12 @@ use super::error::ApiError;
pub async fn json_request<T: for<'de> Deserialize<'de>>( pub async fn json_request<T: for<'de> Deserialize<'de>>(
request: &mut Request<Body>, request: &mut Request<Body>,
) -> Result<T, ApiError> { ) -> Result<T, ApiError> {
json_request_or_empty_body(request) let whole_body = hyper::body::aggregate(request.body_mut())
.await?
.context("missing request body")
.map_err(ApiError::BadRequest)
}
/// Will be removed as part of <https://github.com/neondatabase/neon/issues/4282>
pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
request: &mut Request<Body>,
) -> Result<Option<T>, ApiError> {
let body = hyper::body::aggregate(request.body_mut())
.await .await
.context("Failed to read request body") .context("Failed to read request body")
.map_err(ApiError::BadRequest)?; .map_err(ApiError::BadRequest)?;
if body.remaining() == 0 { serde_json::from_reader(whole_body.reader())
return Ok(None);
}
serde_json::from_reader(body.reader())
.context("Failed to parse json request") .context("Failed to parse json request")
.map(Some)
.map_err(ApiError::BadRequest) .map_err(ApiError::BadRequest)
} }

View File

@@ -1,7 +1,5 @@
use std::ffi::OsStr;
use std::{fmt, str::FromStr}; use std::{fmt, str::FromStr};
use anyhow::Context;
use hex::FromHex; use hex::FromHex;
use rand::Rng; use rand::Rng;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@@ -215,18 +213,6 @@ pub struct TimelineId(Id);
id_newtype!(TimelineId); id_newtype!(TimelineId);
impl TryFrom<Option<&OsStr>> for TimelineId {
type Error = anyhow::Error;
fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
value
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| format!("Could not parse timeline id from {:?}", value))
}
}
/// Neon Tenant Id represents identifiar of a particular tenant. /// Neon Tenant Id represents identifiar of a particular tenant.
/// Is used for distinguishing requests and data belonging to different users. /// Is used for distinguishing requests and data belonging to different users.
/// ///

View File

@@ -60,12 +60,6 @@ pub mod tracing_span_assert;
pub mod rate_limit; pub mod rate_limit;
/// Simple once-barrier and a guard which keeps barrier awaiting.
pub mod completion;
/// Reporting utilities
pub mod error;
mod failpoint_macro_helpers { mod failpoint_macro_helpers {
/// use with fail::cfg("$name", "return(2000)") /// use with fail::cfg("$name", "return(2000)")
@@ -112,16 +106,10 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
/// * building in docker (either in CI or locally) /// * building in docker (either in CI or locally)
/// ///
/// One thing to note is that .git is not available in docker (and it is bad to include it there). /// One thing to note is that .git is not available in docker (and it is bad to include it there).
/// When building locally, the `git_version` is used to query .git. When building on CI and docker, /// So everything becides docker build is covered by git_version crate, and docker uses a `GIT_VERSION` argument to get the value required.
/// we don't build the actual PR branch commits, but always a "phantom" would be merge commit to /// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro.
/// the target branch -- the actual PR commit from which we build from is supplied as GIT_VERSION /// Git version received from environment variable used as a fallback in git_version invocation.
/// environment variable. /// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option.
///
/// We ended up with this compromise between phantom would be merge commits vs. pull request branch
/// heads due to old logs becoming more reliable (github could gc the phantom merge commit
/// anytime) in #4641.
///
/// To avoid running buildscript every recompilation, we use rerun-if-env-changed option.
/// So the build script will be run only when GIT_VERSION envvar has changed. /// So the build script will be run only when GIT_VERSION envvar has changed.
/// ///
/// Why not to use buildscript to get git commit sha directly without procmacro from different crate? /// Why not to use buildscript to get git commit sha directly without procmacro from different crate?
@@ -133,36 +121,25 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
/// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`. /// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
/// ///
/// ############################################################################################# /// #############################################################################################
/// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details. /// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
/// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036> /// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation. /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
/// The problem needs further investigation and regular `const` declaration instead of a macro. /// The problem needs further investigation and regular `const` declaration instead of a macro.
#[macro_export] #[macro_export]
macro_rules! project_git_version { macro_rules! project_git_version {
($const_identifier:ident) => { ($const_identifier:ident) => {
// this should try GIT_VERSION first only then git_version::git_version! const $const_identifier: &str = git_version::git_version!(
const $const_identifier: &::core::primitive::str = { prefix = "git:",
const __COMMIT_FROM_GIT: &::core::primitive::str = git_version::git_version! { fallback = concat!(
prefix = "", "git-env:",
fallback = "unknown", env!("GIT_VERSION", "Missing GIT_VERSION envvar")
args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha ),
}; args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
);
const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("GIT_VERSION") {
::core::option::Option::Some(x) => ["git-env:", x],
::core::option::Option::None => ["git:", __COMMIT_FROM_GIT],
};
$crate::__const_format::concatcp!(__ARG[0], __ARG[1])
};
}; };
} }
/// Re-export for `project_git_version` macro
#[doc(hidden)]
pub use const_format as __const_format;
/// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime. /// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime.
#[macro_export] #[macro_export]
macro_rules! const_assert { macro_rules! const_assert {

View File

@@ -1,10 +1,9 @@
//! A module to create and read lock files. //! A module to create and read lock files.
//! //!
//! File locking is done using [`fcntl::flock`] exclusive locks. //! File locking is done using [`fcntl::flock`] exclusive locks.
//! The only consumer of this module is currently //! The only consumer of this module is currently [`pid_file`].
//! [`pid_file`](crate::pid_file). See the module-level comment //! See the module-level comment there for potential pitfalls
//! there for potential pitfalls with lock files that are used //! with lock files that are used to store PIDs (pidfiles).
//! to store PIDs (pidfiles).
use std::{ use std::{
fs, fs,
@@ -82,7 +81,7 @@ pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFi
} }
/// Returned by [`read_and_hold_lock_file`]. /// Returned by [`read_and_hold_lock_file`].
/// Check out the [`pid_file`](crate::pid_file) module for what the variants mean /// Check out the [`pid_file`] module for what the variants mean
/// and potential caveats if the lock files that are used to store PIDs. /// and potential caveats if the lock files that are used to store PIDs.
pub enum LockFileRead { pub enum LockFileRead {
/// No file exists at the given path. /// No file exists at the given path.

View File

@@ -84,7 +84,7 @@ pub fn init(
let r = r.with({ let r = r.with({
let log_layer = tracing_subscriber::fmt::layer() let log_layer = tracing_subscriber::fmt::layer()
.with_target(false) .with_target(false)
.with_ansi(false) .with_ansi(atty::is(atty::Stream::Stdout))
.with_writer(std::io::stdout); .with_writer(std::io::stdout);
let log_layer = match log_format { let log_layer = match log_format {
LogFormat::Json => log_layer.json().boxed(), LogFormat::Json => log_layer.json().boxed(),
@@ -112,7 +112,7 @@ pub fn init(
/// ///
/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr). /// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
/// If the assumptions about the initialization order are not held, use /// If the assumptions about the initialization order are not held, use
/// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be /// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
/// lost. /// lost.
#[must_use] #[must_use]
pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard { pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {

View File

@@ -1,5 +1,4 @@
use pin_project_lite::pin_project; use pin_project_lite::pin_project;
use std::io::Read;
use std::pin::Pin; use std::pin::Pin;
use std::{io, task}; use std::{io, task};
use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
@@ -76,34 +75,3 @@ impl<S: AsyncWrite + Unpin, R, W: FnMut(usize)> AsyncWrite for MeasuredStream<S,
self.project().stream.poll_shutdown(context) self.project().stream.poll_shutdown(context)
} }
} }
/// Wrapper for a reader that counts bytes read.
///
/// Similar to MeasuredStream but it's one way and it's sync
pub struct MeasuredReader<R: Read> {
inner: R,
byte_count: usize,
}
impl<R: Read> MeasuredReader<R> {
pub fn new(reader: R) -> Self {
Self {
inner: reader,
byte_count: 0,
}
}
pub fn get_byte_count(&self) -> usize {
self.byte_count
}
}
impl<R: Read> Read for MeasuredReader<R> {
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
let result = self.inner.read(buf);
if let Ok(n_bytes) = result {
self.byte_count += n_bytes
}
result
}
}

View File

@@ -23,9 +23,9 @@ pub enum SeqWaitError {
/// Monotonically increasing value /// Monotonically increasing value
/// ///
/// It is handy to store some other fields under the same mutex in `SeqWait<S>` /// It is handy to store some other fields under the same mutex in SeqWait<S>
/// (e.g. store prev_record_lsn). So we allow SeqWait to be parametrized with /// (e.g. store prev_record_lsn). So we allow SeqWait to be parametrized with
/// any type that can expose counter. `V` is the type of exposed counter. /// any type that can expose counter. <V> is the type of exposed counter.
pub trait MonotonicCounter<V> { pub trait MonotonicCounter<V> {
/// Bump counter value and check that it goes forward /// Bump counter value and check that it goes forward
/// N.B.: new_val is an actual new value, not a difference. /// N.B.: new_val is an actual new value, not a difference.
@@ -90,7 +90,7 @@ impl<T: Ord> Eq for Waiter<T> {}
/// [`wait_for`]: SeqWait::wait_for /// [`wait_for`]: SeqWait::wait_for
/// [`advance`]: SeqWait::advance /// [`advance`]: SeqWait::advance
/// ///
/// `S` means Storage, `V` is type of counter that this storage exposes. /// <S> means Storage, <V> is type of counter that this storage exposes.
/// ///
pub struct SeqWait<S, V> pub struct SeqWait<S, V>
where where
@@ -144,8 +144,6 @@ where
/// ///
/// This call won't complete until someone has called `advance` /// This call won't complete until someone has called `advance`
/// with a number greater than or equal to the one we're waiting for. /// with a number greater than or equal to the one we're waiting for.
///
/// This function is async cancellation-safe.
pub async fn wait_for(&self, num: V) -> Result<(), SeqWaitError> { pub async fn wait_for(&self, num: V) -> Result<(), SeqWaitError> {
match self.queue_for_wait(num) { match self.queue_for_wait(num) {
Ok(None) => Ok(()), Ok(None) => Ok(()),
@@ -161,8 +159,6 @@ where
/// ///
/// If that hasn't happened after the specified timeout duration, /// If that hasn't happened after the specified timeout duration,
/// [`SeqWaitError::Timeout`] will be returned. /// [`SeqWaitError::Timeout`] will be returned.
///
/// This function is async cancellation-safe.
pub async fn wait_for_timeout( pub async fn wait_for_timeout(
&self, &self,
num: V, num: V,

View File

@@ -1,15 +1,8 @@
//! Assert that the current [`tracing::Span`] has a given set of fields. //! Assert that the current [`tracing::Span`] has a given set of fields.
//! //!
//! Can only produce meaningful positive results when tracing has been configured as in example.
//! Absence of `tracing_error::ErrorLayer` is not detected yet.
//!
//! `#[cfg(test)]` code will get a pass when using the `check_fields_present` macro in case tracing
//! is completly unconfigured.
//!
//! # Usage //! # Usage
//! //!
//! ```rust //! ```
//! # fn main() {
//! use tracing_subscriber::prelude::*; //! use tracing_subscriber::prelude::*;
//! let registry = tracing_subscriber::registry() //! let registry = tracing_subscriber::registry()
//! .with(tracing_error::ErrorLayer::default()); //! .with(tracing_error::ErrorLayer::default());
@@ -27,18 +20,23 @@
//! //!
//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; //! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]); //! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
//! if let Err(missing) = check_fields_present!([&extractor]) { //! match check_fields_present([&extractor]) {
//! // if you copypaste this to a custom assert method, remember to add #[track_caller] //! Ok(()) => {},
//! // to get the "user" code location for the panic. //! Err(missing) => {
//! panic!("Missing fields: {missing:?}"); //! panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::<Vec<_>>());
//! }
//! } //! }
//! # }
//! ``` //! ```
//! //!
//! Recommended reading: <https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering> //! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
//! //!
#[derive(Debug)] use std::{
collections::HashSet,
fmt::{self},
hash::{Hash, Hasher},
};
pub enum ExtractionResult { pub enum ExtractionResult {
Present, Present,
Absent, Absent,
@@ -73,101 +71,49 @@ impl<const L: usize> Extractor for MultiNameExtractor<L> {
} }
} }
/// Checks that the given extractors are satisfied with the current span hierarchy. struct MemoryIdentity<'a>(&'a dyn Extractor);
///
/// This should not be called directly, but used through [`check_fields_present`] which allows
/// `Summary::Unconfigured` only when the calling crate is being `#[cfg(test)]` as a conservative default.
#[doc(hidden)]
pub fn check_fields_present0<const L: usize>(
must_be_present: [&dyn Extractor; L],
) -> Result<Summary, Vec<&dyn Extractor>> {
let mut missing = must_be_present.into_iter().collect::<Vec<_>>();
let trace = tracing_error::SpanTrace::capture();
trace.with_spans(|md, _formatted_fields| {
// when trying to understand the inner workings of how does the matching work, note that
// this closure might be called zero times if the span is disabled. normally it is called
// once per span hierarchy level.
missing.retain(|extractor| match extractor.extract(md.fields()) {
ExtractionResult::Present => false,
ExtractionResult::Absent => true,
});
// continue walking up until we've found all missing impl<'a> MemoryIdentity<'a> {
!missing.is_empty() fn as_ptr(&self) -> *const () {
}); self.0 as *const _ as *const ()
if missing.is_empty() { }
Ok(Summary::FoundEverything) }
} else if !tracing_subscriber_configured() { impl<'a> PartialEq for MemoryIdentity<'a> {
Ok(Summary::Unconfigured) fn eq(&self, other: &Self) -> bool {
} else { self.as_ptr() == other.as_ptr()
// we can still hit here if a tracing subscriber has been configured but the ErrorLayer is }
// missing, which can be annoying. for this case, we could probably use }
// SpanTrace::status(). impl<'a> Eq for MemoryIdentity<'a> {}
// impl<'a> Hash for MemoryIdentity<'a> {
// another way to end up here is with RUST_LOG=pageserver=off while configuring the fn hash<H: Hasher>(&self, state: &mut H) {
// logging, though I guess in that case the SpanTrace::status() == EMPTY would be valid. self.as_ptr().hash(state);
// this case is covered by test `not_found_if_tracing_error_subscriber_has_wrong_filter`. }
Err(missing) }
impl<'a> fmt::Debug for MemoryIdentity<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
} }
} }
/// Checks that the given extractors are satisfied with the current span hierarchy. /// The extractor names passed as keys to [`new`].
/// pub fn check_fields_present<const L: usize>(
/// The macro is the preferred way of checking if fields exist while passing checks if a test does must_be_present: [&dyn Extractor; L],
/// not have tracing configured. ) -> Result<(), Vec<&dyn Extractor>> {
/// let mut missing: HashSet<MemoryIdentity> =
/// Why mangled name? Because #[macro_export] will expose it at utils::__check_fields_present. HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r)));
/// However we can game a module namespaced macro for `use` purposes by re-exporting the let trace = tracing_error::SpanTrace::capture();
/// #[macro_export] exported name with an alias (below). trace.with_spans(|md, _formatted_fields| {
#[doc(hidden)] missing.retain(|extractor| match extractor.0.extract(md.fields()) {
#[macro_export] ExtractionResult::Present => false,
macro_rules! __check_fields_present { ExtractionResult::Absent => true,
($extractors:expr) => {{ });
{ !missing.is_empty() // continue walking up until we've found all missing
use $crate::tracing_span_assert::{check_fields_present0, Summary::*, Extractor};
match check_fields_present0($extractors) {
Ok(FoundEverything) => Ok(()),
Ok(Unconfigured) if cfg!(test) => {
// allow unconfigured in tests
Ok(())
},
Ok(Unconfigured) => {
panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer")
},
Err(missing) => Err(missing)
}
}
}}
}
pub use crate::__check_fields_present as check_fields_present;
/// Explanation for why the check was deemed ok.
///
/// Mainly useful for testing, or configuring per-crate behaviour as in with
/// [`check_fields_present`].
#[derive(Debug)]
pub enum Summary {
/// All extractors were found.
///
/// Should only happen when tracing is properly configured.
FoundEverything,
/// Tracing has not been configured at all. This is ok for tests running without tracing set
/// up.
Unconfigured,
}
fn tracing_subscriber_configured() -> bool {
let mut noop_configured = false;
tracing::dispatcher::get_default(|d| {
// it is possible that this closure will not be invoked, but the current implementation
// always invokes it
noop_configured = d.is::<tracing::subscriber::NoSubscriber>();
}); });
if missing.is_empty() {
!noop_configured Ok(())
} else {
Err(missing.into_iter().map(|mi| mi.0).collect())
}
} }
#[cfg(test)] #[cfg(test)]
@@ -177,36 +123,6 @@ mod tests {
use super::*; use super::*;
use std::{
collections::HashSet,
fmt::{self},
hash::{Hash, Hasher},
};
struct MemoryIdentity<'a>(&'a dyn Extractor);
impl<'a> MemoryIdentity<'a> {
fn as_ptr(&self) -> *const () {
self.0 as *const _ as *const ()
}
}
impl<'a> PartialEq for MemoryIdentity<'a> {
fn eq(&self, other: &Self) -> bool {
self.as_ptr() == other.as_ptr()
}
}
impl<'a> Eq for MemoryIdentity<'a> {}
impl<'a> Hash for MemoryIdentity<'a> {
fn hash<H: Hasher>(&self, state: &mut H) {
self.as_ptr().hash(state);
}
}
impl<'a> fmt::Debug for MemoryIdentity<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
}
}
struct Setup { struct Setup {
_current_thread_subscriber_guard: tracing::subscriber::DefaultGuard, _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
tenant_extractor: MultiNameExtractor<2>, tenant_extractor: MultiNameExtractor<2>,
@@ -243,8 +159,7 @@ mod tests {
let setup = setup_current_thread(); let setup = setup_current_thread();
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1"); let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
let _guard = span.enter(); let _guard = span.enter();
let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]); check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
} }
#[test] #[test]
@@ -252,8 +167,8 @@ mod tests {
let setup = setup_current_thread(); let setup = setup_current_thread();
let span = tracing::info_span!("root", timeline_id = "timeline-1"); let span = tracing::info_span!("root", timeline_id = "timeline-1");
let _guard = span.enter(); let _guard = span.enter();
let missing = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]) let missing =
.unwrap_err(); check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err();
assert_missing(missing, vec![&setup.tenant_extractor]); assert_missing(missing, vec![&setup.tenant_extractor]);
} }
@@ -270,8 +185,7 @@ mod tests {
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1"); let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
let _guard = span.enter(); let _guard = span.enter();
let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]); check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
} }
#[test] #[test]
@@ -284,7 +198,7 @@ mod tests {
let span = tracing::info_span!("child", timeline_id = "timeline-1"); let span = tracing::info_span!("child", timeline_id = "timeline-1");
let _guard = span.enter(); let _guard = span.enter();
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err(); let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
assert_missing(missing, vec![&setup.tenant_extractor]); assert_missing(missing, vec![&setup.tenant_extractor]);
} }
@@ -293,8 +207,7 @@ mod tests {
let setup = setup_current_thread(); let setup = setup_current_thread();
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1"); let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
let _guard = span.enter(); let _guard = span.enter();
let res = check_fields_present0([&setup.tenant_extractor]); check_fields_present([&setup.tenant_extractor]).unwrap();
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
} }
#[test] #[test]
@@ -310,8 +223,7 @@ mod tests {
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1"); let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
let _guard = span.enter(); let _guard = span.enter();
let res = check_fields_present0([&setup.tenant_extractor]); check_fields_present([&setup.tenant_extractor]).unwrap();
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
} }
#[test] #[test]
@@ -319,7 +231,7 @@ mod tests {
let setup = setup_current_thread(); let setup = setup_current_thread();
let span = tracing::info_span!("root", timeline_id = "timeline-1"); let span = tracing::info_span!("root", timeline_id = "timeline-1");
let _guard = span.enter(); let _guard = span.enter();
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err(); let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
assert_missing(missing, vec![&setup.tenant_extractor]); assert_missing(missing, vec![&setup.tenant_extractor]);
} }
@@ -333,107 +245,43 @@ mod tests {
let span = tracing::info_span!("child", timeline_id = "timeline-1"); let span = tracing::info_span!("child", timeline_id = "timeline-1");
let _guard = span.enter(); let _guard = span.enter();
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err(); let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
assert_missing(missing, vec![&setup.tenant_extractor]); assert_missing(missing, vec![&setup.tenant_extractor]);
} }
#[test] #[test]
fn tracing_error_subscriber_not_set_up_straight_line() { fn tracing_error_subscriber_not_set_up() {
// no setup // no setup
let span = tracing::info_span!("foo", e = "some value"); let span = tracing::info_span!("foo", e = "some value");
let _guard = span.enter(); let _guard = span.enter();
let extractor = MultiNameExtractor::new("E", ["e"]); let extractor = MultiNameExtractor::new("E", ["e"]);
let res = check_fields_present0([&extractor]); let missing = check_fields_present([&extractor]).unwrap_err();
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); assert_missing(missing, vec![&extractor]);
// similarly for a not found key
let extractor = MultiNameExtractor::new("F", ["foobar"]);
let res = check_fields_present0([&extractor]);
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
} }
#[test] #[test]
fn tracing_error_subscriber_not_set_up_with_instrument() { #[should_panic]
// no setup fn panics_if_tracing_error_subscriber_has_wrong_filter() {
// demo a case where span entering is used to establish a parent child connection, but
// when we re-enter the subspan SpanTrace::with_spans iterates over nothing.
let span = tracing::info_span!("foo", e = "some value");
let _guard = span.enter();
let subspan = tracing::info_span!("bar", f = "foobar");
drop(_guard);
// normally this would work, but without any tracing-subscriber configured, both
// check_field_present find nothing
let _guard = subspan.enter();
let extractors: [&dyn Extractor; 2] = [
&MultiNameExtractor::new("E", ["e"]),
&MultiNameExtractor::new("F", ["f"]),
];
let res = check_fields_present0(extractors);
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
// similarly for a not found key
let extractor = MultiNameExtractor::new("G", ["g"]);
let res = check_fields_present0([&extractor]);
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
}
#[test]
fn tracing_subscriber_configured() {
// this will fail if any utils::logging::init callers appear, but let's hope they do not
// appear.
assert!(!super::tracing_subscriber_configured());
let _g = setup_current_thread();
assert!(super::tracing_subscriber_configured());
}
#[test]
fn not_found_when_disabled_by_filter() {
let r = tracing_subscriber::registry().with({ let r = tracing_subscriber::registry().with({
tracing_error::ErrorLayer::default().with_filter(tracing_subscriber::filter::filter_fn( tracing_error::ErrorLayer::default().with_filter(
|md| !(md.is_span() && *md.level() == tracing::Level::INFO), tracing_subscriber::filter::dynamic_filter_fn(|md, _| {
)) if md.is_span() && *md.level() == tracing::Level::INFO {
return false;
}
true
}),
)
}); });
let _guard = tracing::subscriber::set_default(r); let _guard = tracing::subscriber::set_default(r);
// this test is a rather tricky one, it has a number of possible outcomes depending on the
// execution order when executed with other tests even if no test sets the global default
// subscriber.
let span = tracing::info_span!("foo", e = "some value"); let span = tracing::info_span!("foo", e = "some value");
let _guard = span.enter(); let _guard = span.enter();
let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])]; let extractor = MultiNameExtractor::new("E", ["e"]);
let missing = check_fields_present([&extractor]).unwrap_err();
if span.is_disabled() { assert_missing(missing, vec![&extractor]);
// the tests are running single threaded, or we got lucky and no other tests subscriber
// was got to register their per-CALLSITE::META interest between `set_default` and
// creation of the span, thus the filter got to apply and registered interest of Never,
// so the span was never created.
//
// as the span is disabled, no keys were recorded to it, leading check_fields_present0
// to find an error.
let missing = check_fields_present0(extractors).unwrap_err();
assert_missing(missing, vec![extractors[0]]);
} else {
// when the span is enabled, it is because some other test is running at the same time,
// and that tests registry has filters which are interested in our above span.
//
// because the span is now enabled, all keys will be found for it. the
// tracing_error::SpanTrace does not consider layer filters during the span hierarchy
// walk (SpanTrace::with_spans), nor is the SpanTrace::status a reliable indicator in
// this test-induced issue.
let res = check_fields_present0(extractors);
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
}
} }
} }

View File

@@ -12,7 +12,6 @@ testing = ["fail/failpoints"]
[dependencies] [dependencies]
anyhow.workspace = true anyhow.workspace = true
async-compression.workspace = true
async-stream.workspace = true async-stream.workspace = true
async-trait.workspace = true async-trait.workspace = true
byteorder.workspace = true byteorder.workspace = true
@@ -25,7 +24,6 @@ consumption_metrics.workspace = true
crc32c.workspace = true crc32c.workspace = true
crossbeam-utils.workspace = true crossbeam-utils.workspace = true
either.workspace = true either.workspace = true
flate2.workspace = true
fail.workspace = true fail.workspace = true
futures.workspace = true futures.workspace = true
git-version.workspace = true git-version.workspace = true
@@ -35,8 +33,6 @@ humantime-serde.workspace = true
hyper.workspace = true hyper.workspace = true
itertools.workspace = true itertools.workspace = true
nix.workspace = true nix.workspace = true
# hack to get the number of worker threads tokio uses
num_cpus = { version = "1.15" }
num-traits.workspace = true num-traits.workspace = true
once_cell.workspace = true once_cell.workspace = true
pin-project-lite.workspace = true pin-project-lite.workspace = true
@@ -84,7 +80,6 @@ strum_macros.workspace = true
criterion.workspace = true criterion.workspace = true
hex-literal.workspace = true hex-literal.workspace = true
tempfile.workspace = true tempfile.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
[[bench]] [[bench]]
name = "bench_layer_map" name = "bench_layer_map"

View File

@@ -1,23 +1,22 @@
use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::repository::Key; use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::LayerFileName; use pageserver::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
use pageserver::tenant::storage_layer::PersistentLayerDesc;
use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use std::cmp::{max, min}; use std::cmp::{max, min};
use std::fs::File; use std::fs::File;
use std::io::{BufRead, BufReader}; use std::io::{BufRead, BufReader};
use std::path::PathBuf; use std::path::PathBuf;
use std::str::FromStr; use std::str::FromStr;
use std::sync::Arc;
use std::time::Instant; use std::time::Instant;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn; use utils::lsn::Lsn;
use criterion::{black_box, criterion_group, criterion_main, Criterion}; use criterion::{black_box, criterion_group, criterion_main, Criterion};
fn build_layer_map(filename_dump: PathBuf) -> LayerMap { fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
let mut layer_map = LayerMap::default(); let mut layer_map = LayerMap::<LayerDescriptor>::default();
let mut min_lsn = Lsn(u64::MAX); let mut min_lsn = Lsn(u64::MAX);
let mut max_lsn = Lsn(0); let mut max_lsn = Lsn(0);
@@ -28,13 +27,13 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
for fname in filenames { for fname in filenames {
let fname = fname.unwrap(); let fname = fname.unwrap();
let fname = LayerFileName::from_str(&fname).unwrap(); let fname = LayerFileName::from_str(&fname).unwrap();
let layer = PersistentLayerDesc::from(fname); let layer = LayerDescriptor::from(fname);
let lsn_range = layer.get_lsn_range(); let lsn_range = layer.get_lsn_range();
min_lsn = min(min_lsn, lsn_range.start); min_lsn = min(min_lsn, lsn_range.start);
max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1)); max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
updates.insert_historic(layer); updates.insert_historic(Arc::new(layer));
} }
println!("min: {min_lsn}, max: {max_lsn}"); println!("min: {min_lsn}, max: {max_lsn}");
@@ -44,7 +43,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
} }
/// Construct a layer map query pattern for benchmarks /// Construct a layer map query pattern for benchmarks
fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn)> {
// For each image layer we query one of the pages contained, at LSN right // For each image layer we query one of the pages contained, at LSN right
// before the image layer was created. This gives us a somewhat uniform // before the image layer was created. This gives us a somewhat uniform
// coverage of both the lsn and key space because image layers have // coverage of both the lsn and key space because image layers have
@@ -70,7 +69,7 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
// Construct a partitioning for testing get_difficulty map when we // Construct a partitioning for testing get_difficulty map when we
// don't have an exact result of `collect_keyspace` to work with. // don't have an exact result of `collect_keyspace` to work with.
fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning { fn uniform_key_partitioning(layer_map: &LayerMap<LayerDescriptor>, _lsn: Lsn) -> KeyPartitioning {
let mut parts = Vec::new(); let mut parts = Vec::new();
// We add a partition boundary at the start of each image layer, // We add a partition boundary at the start of each image layer,
@@ -210,15 +209,13 @@ fn bench_sequential(c: &mut Criterion) {
for i in 0..100_000 { for i in 0..100_000 {
let i32 = (i as u32) % 100; let i32 = (i as u32) % 100;
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
let layer = PersistentLayerDesc::new_img( let layer = LayerDescriptor {
TenantId::generate(), key: zero.add(10 * i32)..zero.add(10 * i32 + 1),
TimelineId::generate(), lsn: Lsn(i)..Lsn(i + 1),
zero.add(10 * i32)..zero.add(10 * i32 + 1), is_incremental: false,
Lsn(i), short_id: format!("Layer {}", i),
false, };
0, updates.insert_historic(Arc::new(layer));
);
updates.insert_historic(layer);
} }
updates.flush(); updates.flush();
println!("Finished layer map init in {:?}", now.elapsed()); println!("Finished layer map init in {:?}", now.elapsed());

View File

@@ -1,19 +0,0 @@
[package]
name = "pagectl"
version = "0.1.0"
edition.workspace = true
license.workspace = true
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow.workspace = true
bytes.workspace = true
clap = { workspace = true, features = ["string"] }
git-version.workspace = true
pageserver = { path = ".." }
postgres_ffi.workspace = true
tokio.workspace = true
utils.workspace = true
svg_fmt.workspace = true
workspace_hack.workspace = true

View File

@@ -1,168 +0,0 @@
use std::path::{Path, PathBuf};
use anyhow::Result;
use clap::Subcommand;
use pageserver::tenant::block_io::BlockCursor;
use pageserver::tenant::disk_btree::DiskBtreeReader;
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
use pageserver::{page_cache, virtual_file};
use pageserver::{
repository::{Key, KEY_SIZE},
tenant::{
block_io::FileBlockReader, disk_btree::VisitDirection,
storage_layer::delta_layer::DELTA_KEY_SIZE,
},
virtual_file::VirtualFile,
};
use std::fs;
use utils::bin_ser::BeSer;
use crate::layer_map_analyzer::parse_filename;
// Subcommands of `pagectl layer`. The `///` doc comments below double as
// clap-generated help text, so they are left byte-identical; review notes
// use plain `//` comments to avoid changing CLI output.
#[derive(Subcommand)]
pub(crate) enum LayerCmd {
    /// List all tenants and timelines under the pageserver path
    ///
    /// Example: `cargo run --bin pagectl layer list .neon/`
    List { path: PathBuf },
    /// List all layers of a given tenant and timeline
    ///
    /// Example: `cargo run --bin pagectl layer list .neon/`
    // NOTE(review): the example above still says `layer list`; presumably it
    // should be `layer list-layer` — left unchanged because this doc comment
    // is also the clap help string.
    ListLayer {
        path: PathBuf,
        tenant: String,
        timeline: String,
    },
    /// Dump all information of a layer file
    DumpLayer {
        path: PathBuf,
        tenant: String,
        timeline: String,
        /// The id from list-layer command
        id: usize,
    },
}
/// Read a delta layer file and print every key together with the length of its
/// value blob, in key order.
///
/// Initializes the virtual-file descriptor cache and the page cache (required
/// before any layer file can be opened), walks the layer's on-disk B-tree
/// index forwards from the lowest possible key, then resolves each blob
/// reference to report its size.
async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
    use pageserver::tenant::block_io::BlockReader;
    let path = path.as_ref();
    virtual_file::init(10);
    page_cache::init(100);
    let block_reader = FileBlockReader::new(VirtualFile::open(path)?);
    let summary = Summary::des_prefix(block_reader.read_blk(0)?.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        summary.index_start_blk,
        summary.index_root_blk,
        &block_reader,
    );
    // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
    // Collect all (key, blob reference) pairs before reading any blobs.
    let mut entries = Vec::new();
    tree_reader.visit(
        &[0u8; DELTA_KEY_SIZE],
        VisitDirection::Forwards,
        |raw_key, value_offset| {
            entries.push((Key::from_slice(&raw_key[..KEY_SIZE]), BlobRef(value_offset)));
            true
        },
    )?;
    let mut blob_cursor = BlockCursor::new(&block_reader);
    for (key, blob_ref) in entries {
        let value = blob_cursor.read_blob(blob_ref.pos())?;
        println!("key:{} value_len:{}", key, value.len());
    }
    // TODO(chi): special handling for last key?
    Ok(())
}
pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
match cmd {
LayerCmd::List { path } => {
for tenant in fs::read_dir(path.join("tenants"))? {
let tenant = tenant?;
if !tenant.file_type()?.is_dir() {
continue;
}
println!("tenant {}", tenant.file_name().to_string_lossy());
for timeline in fs::read_dir(tenant.path().join("timelines"))? {
let timeline = timeline?;
if !timeline.file_type()?.is_dir() {
continue;
}
println!("- timeline {}", timeline.file_name().to_string_lossy());
}
}
}
LayerCmd::ListLayer {
path,
tenant,
timeline,
} => {
let timeline_path = path
.join("tenants")
.join(tenant)
.join("timelines")
.join(timeline);
let mut idx = 0;
for layer in fs::read_dir(timeline_path)? {
let layer = layer?;
if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
{
println!(
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
idx,
layer_file.key_range.start,
layer_file.key_range.end,
layer_file.lsn_range.start,
layer_file.lsn_range.end,
layer_file.is_delta,
);
idx += 1;
}
}
}
LayerCmd::DumpLayer {
path,
tenant,
timeline,
id,
} => {
let timeline_path = path
.join("tenants")
.join(tenant)
.join("timelines")
.join(timeline);
let mut idx = 0;
for layer in fs::read_dir(timeline_path)? {
let layer = layer?;
if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
{
if *id == idx {
// TODO(chi): dedup code
println!(
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
idx,
layer_file.key_range.start,
layer_file.key_range.end,
layer_file.lsn_range.start,
layer_file.lsn_range.end,
layer_file.is_delta,
);
if layer_file.is_delta {
read_delta_file(layer.path()).await?;
} else {
anyhow::bail!("not supported yet :(");
}
break;
}
idx += 1;
}
}
}
}
Ok(())
}

View File

@@ -1,180 +0,0 @@
//! A helper tool to manage pageserver binary files.
//! Accepts a file as an argument, attempts to parse it with all ways possible
//! and prints its interpreted context.
//!
//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
mod draw_timeline_dir;
mod layer_map_analyzer;
mod layers;
use clap::{Parser, Subcommand};
use layers::LayerCmd;
use pageserver::{
context::{DownloadBehavior, RequestContext},
page_cache,
task_mgr::TaskKind,
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
virtual_file,
};
use postgres_ffi::ControlFileData;
use std::path::{Path, PathBuf};
use utils::{lsn::Lsn, project_git_version};
project_git_version!(GIT_VERSION);
// Top-level CLI definition: global version/about flags plus one subcommand.
// (`about`/`long_about` are set explicitly, so no doc comment is needed here.)
#[derive(Parser)]
#[command(
    version = GIT_VERSION,
    about = "Neon Pageserver binutils",
    long_about = "Reads pageserver (and related) binary files management utility"
)]
#[command(propagate_version = true)]
struct CliOpts {
    // The selected subcommand; see `Commands` for the available ones.
    #[command(subcommand)]
    command: Commands,
}
// All pagectl subcommands; each maps to one handler arm in `main`.
// Plain `//` comments are used on purpose: `///` doc comments would become
// clap help text and change CLI output.
#[derive(Subcommand)]
enum Commands {
    // Print the current metadata file and optionally replace LSN fields.
    Metadata(MetadataCmd),
    // Try a file as a pg control file first, then fall back to a layer file.
    PrintLayerFile(PrintLayerFileCmd),
    // Render layer files as an SVG diagram (see draw_timeline_dir).
    DrawTimeline {},
    // Scan a pageserver data dir and report holes in delta layers.
    AnalyzeLayerMap(AnalyzeLayerMapCmd),
    // Layer listing/dumping subcommands (see `LayerCmd`).
    #[command(subcommand)]
    Layer(LayerCmd),
}
/// Read and update pageserver metadata file
// All three LSNs are optional positionals; `handle_metadata` only rewrites
// the file when at least one of them is provided.
#[derive(Parser)]
struct MetadataCmd {
    /// Input metadata file path
    metadata_path: PathBuf,
    /// Replace disk consistent Lsn
    disk_consistent_lsn: Option<Lsn>,
    /// Replace previous record Lsn
    prev_record_lsn: Option<Lsn>,
    /// Replace latest gc cuttoff
    // NOTE(review): "cuttoff" is a typo for "cutoff", but the field name feeds
    // clap's generated value names/help, so renaming it would change CLI output.
    latest_gc_cuttoff: Option<Lsn>,
}
// Arguments for `print-layer-file`: a single input path.
#[derive(Parser)]
struct PrintLayerFileCmd {
    /// Pageserver data path
    path: PathBuf,
}
// Arguments for `analyze-layer-map`.
#[derive(Parser)]
struct AnalyzeLayerMapCmd {
    /// Pageserver data path
    path: PathBuf,
    /// Max holes
    // Optional; the handler falls back to its DEFAULT_MAX_HOLES when omitted.
    max_holes: Option<usize>,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = CliOpts::parse();
match cli.command {
Commands::Layer(cmd) => {
layers::main(&cmd).await?;
}
Commands::Metadata(cmd) => {
handle_metadata(&cmd)?;
}
Commands::DrawTimeline {} => {
draw_timeline_dir::main()?;
}
Commands::AnalyzeLayerMap(cmd) => {
layer_map_analyzer::main(&cmd).await?;
}
Commands::PrintLayerFile(cmd) => {
if let Err(e) = read_pg_control_file(&cmd.path) {
println!(
"Failed to read input file as a pg control one: {e:#}\n\
Attempting to read it as layer file"
);
print_layerfile(&cmd.path).await?;
}
}
};
Ok(())
}
/// Decode a Postgres control file, print its debug representation, and print
/// the control file's `checkPoint` LSN both raw and page-aligned.
fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
    let bytes = std::fs::read(control_file_path)?;
    let control_file = ControlFileData::decode(&bytes)?;
    println!("{control_file:?}");

    let checkpoint_lsn = Lsn(control_file.checkPoint);
    println!(
        "pg_initdb_lsn: {}, aligned: {}",
        checkpoint_lsn,
        checkpoint_lsn.align()
    );
    Ok(())
}
/// Dump a layer file's contents (verbose mode) to stdout.
///
/// Initializes the virtual-file and page caches first, since layer access
/// requires them.
async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
    virtual_file::init(10);
    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    dump_layerfile_from_path(path, true, &ctx).await
}
/// Print the current `TimelineMetadata` and optionally rewrite selected fields.
///
/// Each of the three optional LSNs, when provided, replaces the corresponding
/// metadata field while every other field is carried over unchanged. The file
/// is written back only if at least one replacement was requested.
fn handle_metadata(
    MetadataCmd {
        metadata_path: path,
        disk_consistent_lsn,
        prev_record_lsn,
        latest_gc_cuttoff,
    }: &MetadataCmd,
) -> Result<(), anyhow::Error> {
    let metadata_bytes = std::fs::read(path)?;
    let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
    println!("Current metadata:\n{meta:?}");
    // Set whenever any field below is replaced; gates the final write-back.
    let mut update_meta = false;
    if let Some(disk_consistent_lsn) = disk_consistent_lsn {
        // Rebuild metadata with only disk_consistent_lsn replaced.
        meta = TimelineMetadata::new(
            *disk_consistent_lsn,
            meta.prev_record_lsn(),
            meta.ancestor_timeline(),
            meta.ancestor_lsn(),
            meta.latest_gc_cutoff_lsn(),
            meta.initdb_lsn(),
            meta.pg_version(),
        );
        update_meta = true;
    }
    if let Some(prev_record_lsn) = prev_record_lsn {
        // Rebuild metadata with only prev_record_lsn replaced.
        meta = TimelineMetadata::new(
            meta.disk_consistent_lsn(),
            Some(*prev_record_lsn),
            meta.ancestor_timeline(),
            meta.ancestor_lsn(),
            meta.latest_gc_cutoff_lsn(),
            meta.initdb_lsn(),
            meta.pg_version(),
        );
        update_meta = true;
    }
    if let Some(latest_gc_cuttoff) = latest_gc_cuttoff {
        // Rebuild metadata with only the latest gc cutoff replaced.
        meta = TimelineMetadata::new(
            meta.disk_consistent_lsn(),
            meta.prev_record_lsn(),
            meta.ancestor_timeline(),
            meta.ancestor_lsn(),
            *latest_gc_cuttoff,
            meta.initdb_lsn(),
            meta.pg_version(),
        );
        update_meta = true;
    }
    if update_meta {
        let metadata_bytes = meta.to_bytes()?;
        std::fs::write(path, metadata_bytes)?;
    }
    Ok(())
}

View File

@@ -19,6 +19,12 @@ use tokio::io;
use tokio::io::AsyncWrite; use tokio::io::AsyncWrite;
use tracing::*; use tracing::*;
/// NB: This relies on a modified version of tokio_tar that does *not* write the
/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped
/// without explicitly calling 'finish' or 'into_inner'!
///
/// See https://github.com/neondatabase/tokio-tar/pull/1
///
use tokio_tar::{Builder, EntryType, Header}; use tokio_tar::{Builder, EntryType, Header};
use crate::context::RequestContext; use crate::context::RequestContext;

View File

@@ -7,12 +7,12 @@
//! - The y axis represents LSN, growing upwards. //! - The y axis represents LSN, growing upwards.
//! //!
//! Coordinates in both axis are compressed for better readability. //! Coordinates in both axis are compressed for better readability.
//! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>) //! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
//! //!
//! Example use: //! Example use:
//! ```bash //! ```
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ //! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg //! $ grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
//! $ firefox out.svg //! $ firefox out.svg
//! ``` //! ```
//! //!
@@ -20,7 +20,7 @@
//! or from pageserver log files. //! or from pageserver log files.
//! //!
//! TODO Consider shipping this as a grafana panel plugin: //! TODO Consider shipping this as a grafana panel plugin:
//! <https://grafana.com/tutorials/build-a-panel-plugin/> //! https://grafana.com/tutorials/build-a-panel-plugin/
use anyhow::Result; use anyhow::Result;
use pageserver::repository::Key; use pageserver::repository::Key;
use std::cmp::Ordering; use std::cmp::Ordering;
@@ -62,7 +62,7 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
(keys, lsns) (keys, lsns)
} }
pub fn main() -> Result<()> { fn main() -> Result<()> {
// Parse layer filenames from stdin // Parse layer filenames from stdin
let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![]; let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
let stdin = io::stdin(); let stdin = io::stdin();
@@ -117,8 +117,7 @@ pub fn main() -> Result<()> {
let mut lsn_diff = (lsn_end - lsn_start) as f32; let mut lsn_diff = (lsn_end - lsn_start) as f32;
let mut fill = Fill::None; let mut fill = Fill::None;
let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
let mut lsn_offset = 0.0; let mut lsn_offset = 0.0;
// Fill in and thicken rectangle if it's an // Fill in and thicken rectangle if it's an
@@ -129,7 +128,7 @@ pub fn main() -> Result<()> {
num_images += 1; num_images += 1;
lsn_diff = 0.3; lsn_diff = 0.3;
lsn_offset = -lsn_diff / 2.0; lsn_offset = -lsn_diff / 2.0;
ymargin = 0.05; margin = 0.05;
fill = Fill::Color(rgb(0, 0, 0)); fill = Fill::Color(rgb(0, 0, 0));
} }
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end), Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
@@ -138,10 +137,10 @@ pub fn main() -> Result<()> {
println!( println!(
" {}", " {}",
rectangle( rectangle(
key_start as f32 + stretch * xmargin, key_start as f32 + stretch * margin,
stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)), stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)),
key_diff as f32 - stretch * 2.0 * xmargin, key_diff as f32 - stretch * 2.0 * margin,
stretch * (lsn_diff - 2.0 * ymargin) stretch * (lsn_diff - 2.0 * margin)
) )
.fill(fill) .fill(fill)
.stroke(Stroke::Color(rgb(0, 0, 0), 0.1)) .stroke(Stroke::Color(rgb(0, 0, 0), 0.1))

View File

@@ -6,7 +6,7 @@ use anyhow::Result;
use std::cmp::Ordering; use std::cmp::Ordering;
use std::collections::BinaryHeap; use std::collections::BinaryHeap;
use std::ops::Range; use std::ops::Range;
use std::{fs, path::Path, str}; use std::{env, fs, path::Path, path::PathBuf, str, str::FromStr};
use pageserver::page_cache::PAGE_SZ; use pageserver::page_cache::PAGE_SZ;
use pageserver::repository::{Key, KEY_SIZE}; use pageserver::repository::{Key, KEY_SIZE};
@@ -18,14 +18,12 @@ use pageserver::virtual_file::VirtualFile;
use utils::{bin_ser::BeSer, lsn::Lsn}; use utils::{bin_ser::BeSer, lsn::Lsn};
use crate::AnalyzeLayerMapCmd;
const MIN_HOLE_LENGTH: i128 = (128 * 1024 * 1024 / PAGE_SZ) as i128; const MIN_HOLE_LENGTH: i128 = (128 * 1024 * 1024 / PAGE_SZ) as i128;
const DEFAULT_MAX_HOLES: usize = 10; const DEFAULT_MAX_HOLES: usize = 10;
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
#[derive(PartialEq, Eq)] #[derive(PartialEq, Eq)]
pub struct Hole(Range<Key>); struct Hole(Range<Key>);
impl Ord for Hole { impl Ord for Hole {
fn cmp(&self, other: &Self) -> Ordering { fn cmp(&self, other: &Self) -> Ordering {
@@ -41,11 +39,11 @@ impl PartialOrd for Hole {
} }
} }
pub(crate) struct LayerFile { struct LayerFile {
pub key_range: Range<Key>, key_range: Range<Key>,
pub lsn_range: Range<Lsn>, lsn_range: Range<Lsn>,
pub is_delta: bool, is_delta: bool,
pub holes: Vec<Hole>, holes: Vec<Hole>,
} }
impl LayerFile { impl LayerFile {
@@ -69,7 +67,7 @@ impl LayerFile {
} }
} }
pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> { fn parse_filename(name: &str) -> Option<LayerFile> {
let split: Vec<&str> = name.split("__").collect(); let split: Vec<&str> = name.split("__").collect();
if split.len() != 2 { if split.len() != 2 {
return None; return None;
@@ -95,7 +93,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
} }
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH" // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> { fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
let file = FileBlockReader::new(VirtualFile::open(path)?); let file = FileBlockReader::new(VirtualFile::open(path)?);
let summary_blk = file.read_blk(0)?; let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -129,9 +127,18 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
Ok(holes) Ok(holes)
} }
pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { fn main() -> Result<()> {
let storage_path = &cmd.path; let args: Vec<String> = env::args().collect();
let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES); if args.len() < 2 {
println!("Usage: layer_map_analyzer PAGESERVER_DATA_DIR [MAX_HOLES]");
return Ok(());
}
let storage_path = PathBuf::from_str(&args[1])?;
let max_holes = if args.len() > 2 {
args[2].parse::<usize>().unwrap()
} else {
DEFAULT_MAX_HOLES
};
    // Initialize virtual_file (file descriptor cache) and page cache which are needed to access layer persistent B-Tree.     // Initialize virtual_file (file descriptor cache) and page cache which are needed to access layer persistent B-Tree.
pageserver::virtual_file::init(10); pageserver::virtual_file::init(10);
@@ -160,7 +167,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
parse_filename(&layer.file_name().into_string().unwrap()) parse_filename(&layer.file_name().into_string().unwrap())
{ {
if layer_file.is_delta { if layer_file.is_delta {
layer_file.holes = get_holes(&layer.path(), max_holes).await?; layer_file.holes = get_holes(&layer.path(), max_holes)?;
n_deltas += 1; n_deltas += 1;
} }
layers.push(layer_file); layers.push(layer_file);

View File

@@ -9,7 +9,6 @@ use clap::{Arg, ArgAction, Command};
use fail::FailScenario; use fail::FailScenario;
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use remote_storage::GenericRemoteStorage; use remote_storage::GenericRemoteStorage;
use tracing::*; use tracing::*;
@@ -19,7 +18,9 @@ use pageserver::{
context::{DownloadBehavior, RequestContext}, context::{DownloadBehavior, RequestContext},
http, page_cache, page_service, task_mgr, http, page_cache, page_service, task_mgr,
task_mgr::TaskKind, task_mgr::TaskKind,
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME}, task_mgr::{
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
},
tenant::mgr, tenant::mgr,
virtual_file, virtual_file,
}; };
@@ -275,18 +276,7 @@ fn start_pageserver(
let pageserver_listener = tcp_listener::bind(pg_addr)?; let pageserver_listener = tcp_listener::bind(pg_addr)?;
// Launch broker client // Launch broker client
// The storage_broker::connect call needs to happen inside a tokio runtime thread. WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?;
let broker_client = WALRECEIVER_RUNTIME
.block_on(async {
// Note: we do not attempt connecting here (but validate endpoints sanity).
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)
})
.with_context(|| {
format!(
"create broker client for uri={:?} keepalive_interval={:?}",
&conf.broker_endpoint, conf.broker_keepalive_interval,
)
})?;
// Initialize authentication for incoming connections // Initialize authentication for incoming connections
let http_auth; let http_auth;
@@ -335,118 +325,8 @@ fn start_pageserver(
// Set up remote storage client // Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?; let remote_storage = create_remote_storage_client(conf)?;
// Startup staging or optimizing:
//
// We want to minimize downtime for `page_service` connections, and trying not to overload
// BACKGROUND_RUNTIME by doing initial compactions and initial logical sizes at the same time.
//
// init_done_rx will notify when all initial load operations have completed.
//
// background_jobs_can_start (same name used to hold off background jobs from starting at
// consumer side) will be dropped once we can start the background jobs. Currently it is behind
// completing all initial logical size calculations (init_logical_size_done_rx) and a timeout
// (background_task_maximum_delay).
let (init_done_tx, init_done_rx) = utils::completion::channel();
let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
let order = pageserver::InitializationOrder {
initial_tenant_load: Some(init_done_tx),
initial_logical_size_can_start: init_done_rx.clone(),
initial_logical_size_attempt: init_logical_size_done_tx,
background_jobs_can_start: background_jobs_barrier.clone(),
};
// Scan the local 'tenants/' directory and start loading the tenants // Scan the local 'tenants/' directory and start loading the tenants
let init_started_at = std::time::Instant::now(); BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
conf,
broker_client.clone(),
remote_storage.clone(),
order,
))?;
BACKGROUND_RUNTIME.spawn({
let init_done_rx = init_done_rx;
let shutdown_pageserver = shutdown_pageserver.clone();
let drive_init = async move {
// NOTE: unlike many futures in pageserver, this one is cancellation-safe
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
init_done_rx.wait().await;
// initial logical sizes can now start, as they were waiting on init_done_rx.
scopeguard::ScopeGuard::into_inner(guard);
let init_done = std::time::Instant::now();
let elapsed = init_done - init_started_at;
tracing::info!(
elapsed_millis = elapsed.as_millis(),
"Initial load completed"
);
let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
let timeout = conf.background_task_maximum_delay;
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
Ok(_) => {
let now = std::time::Instant::now();
tracing::info!(
from_init_done_millis = (now - init_done).as_millis(),
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed"
);
None
}
Err(_) => {
tracing::info!(
timeout_millis = timeout.as_millis(),
"Initial logical size timeout elapsed; starting background jobs"
);
Some(init_sizes_done)
}
};
scopeguard::ScopeGuard::into_inner(guard);
// allow background jobs to start
drop(background_jobs_can_start);
if let Some(init_sizes_done) = init_sizes_done {
// ending up here is not a bug; at the latest logical sizes will be queried by
// consumption metrics.
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
init_sizes_done.await;
scopeguard::ScopeGuard::into_inner(guard);
let now = std::time::Instant::now();
tracing::info!(
from_init_done_millis = (now - init_done).as_millis(),
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed after timeout (background jobs already started)"
);
}
};
async move {
let mut drive_init = std::pin::pin!(drive_init);
// just race these tasks
tokio::select! {
_ = shutdown_pageserver.cancelled() => {},
_ = &mut drive_init => {},
}
}
});
// shared state between the disk-usage backed eviction background task and the http endpoint // shared state between the disk-usage backed eviction background task and the http endpoint
// that allows triggering disk-usage based eviction manually. note that the http endpoint // that allows triggering disk-usage based eviction manually. note that the http endpoint
@@ -459,7 +339,6 @@ fn start_pageserver(
conf, conf,
remote_storage.clone(), remote_storage.clone(),
disk_usage_eviction_state.clone(), disk_usage_eviction_state.clone(),
background_jobs_barrier.clone(),
)?; )?;
} }
@@ -472,7 +351,6 @@ fn start_pageserver(
conf, conf,
launch_ts, launch_ts,
http_auth, http_auth,
broker_client.clone(),
remote_storage, remote_storage,
disk_usage_eviction_state, disk_usage_eviction_state,
)? )?
@@ -495,50 +373,37 @@ fn start_pageserver(
Ok(()) Ok(())
}, },
); );
}
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
let background_jobs_barrier = background_jobs_barrier; let metrics_ctx = RequestContext::todo_child(
let metrics_ctx = RequestContext::todo_child( TaskKind::MetricsCollection,
TaskKind::MetricsCollection, // This task itself shouldn't download anything.
// This task itself shouldn't download anything. // The actual size calculation does need downloads, and
// The actual size calculation does need downloads, and // creates a child context with the right DownloadBehavior.
// creates a child context with the right DownloadBehavior. DownloadBehavior::Error,
DownloadBehavior::Error, );
); task_mgr::spawn(
task_mgr::spawn( MGMT_REQUEST_RUNTIME.handle(),
crate::BACKGROUND_RUNTIME.handle(), TaskKind::MetricsCollection,
TaskKind::MetricsCollection, None,
None, None,
None, "consumption metrics collection",
"consumption metrics collection", true,
true, async move {
async move { pageserver::consumption_metrics::collect_metrics(
// first wait until background jobs are cleared to launch. metric_collection_endpoint,
// conf.metric_collection_interval,
// this is because we only process active tenants and timelines, and the conf.cached_metric_collection_interval,
// Timeline::get_current_logical_size will spawn the logical size calculation, conf.synthetic_size_calculation_interval,
// which will not be rate-limited. conf.id,
let cancel = task_mgr::shutdown_token(); metrics_ctx,
)
tokio::select! { .instrument(info_span!("metrics_collection"))
_ = cancel.cancelled() => { return Ok(()); }, .await?;
_ = background_jobs_barrier.wait() => {} Ok(())
}; },
);
pageserver::consumption_metrics::collect_metrics( }
metric_collection_endpoint,
conf.metric_collection_interval,
conf.cached_metric_collection_interval,
conf.synthetic_size_calculation_interval,
conf.id,
metrics_ctx,
)
.instrument(info_span!("metrics_collection"))
.await?;
Ok(())
},
);
} }
// Spawn a task to listen for libpq connections. It will spawn further tasks // Spawn a task to listen for libpq connections. It will spawn further tasks
@@ -562,7 +427,6 @@ fn start_pageserver(
async move { async move {
page_service::libpq_listener_main( page_service::libpq_listener_main(
conf, conf,
broker_client,
pg_auth, pg_auth,
pageserver_listener, pageserver_listener,
conf.pg_auth_type, conf.pg_auth_type,
@@ -573,8 +437,6 @@ fn start_pageserver(
); );
} }
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
// All started up! Now just sit and wait for shutdown signal. // All started up! Now just sit and wait for shutdown signal.
ShutdownSignals::handle(|signal| match signal { ShutdownSignals::handle(|signal| match signal {
Signal::Quit => { Signal::Quit => {
@@ -590,11 +452,6 @@ fn start_pageserver(
"Got {}. Terminating gracefully in fast shutdown mode", "Got {}. Terminating gracefully in fast shutdown mode",
signal.name() signal.name()
); );
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0)); BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
unreachable!() unreachable!()
} }

View File

@@ -0,0 +1,157 @@
//! A helper tool to manage pageserver binary files.
//! Accepts a file as an argument, attempts to parse it with all ways possible
//! and prints its interpreted context.
//!
//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
use std::{
path::{Path, PathBuf},
str::FromStr,
};
use anyhow::Context;
use clap::{value_parser, Arg, Command};
use pageserver::{
context::{DownloadBehavior, RequestContext},
page_cache,
task_mgr::TaskKind,
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
virtual_file,
};
use postgres_ffi::ControlFileData;
use utils::{lsn::Lsn, project_git_version};
project_git_version!(GIT_VERSION);
const METADATA_SUBCOMMAND: &str = "metadata";
/// Entry point: with the `metadata` subcommand, print/update a metadata file;
/// with a bare path argument, guess the format (pg control file first, then
/// layer file).
fn main() -> anyhow::Result<()> {
    let arg_matches = cli().get_matches();
    match arg_matches.subcommand() {
        Some((subcommand_name, subcommand_matches)) => {
            // NOTE(review): `metadata_path` is fetched before the subcommand
            // name is checked; fine today since `cli()` defines exactly one
            // subcommand, but a new subcommand without this arg would hit the
            // context error before the ensure below.
            let path = subcommand_matches
                .get_one::<PathBuf>("metadata_path")
                .context("'metadata_path' argument is missing")?
                .to_path_buf();
            anyhow::ensure!(
                subcommand_name == METADATA_SUBCOMMAND,
                "Unknown subcommand {subcommand_name}"
            );
            handle_metadata(&path, subcommand_matches)?;
        }
        None => {
            // No subcommand: treat the positional as a file of unknown format.
            let path = arg_matches
                .get_one::<PathBuf>("path")
                .context("'path' argument is missing")?
                .to_path_buf();
            println!(
                "No subcommand specified, attempting to guess the format for file {}",
                path.display()
            );
            if let Err(e) = read_pg_control_file(&path) {
                println!(
                    "Failed to read input file as a pg control one: {e:#}\n\
                    Attempting to read it as layer file"
                );
                print_layerfile(&path)?;
            }
        }
    };
    Ok(())
}
/// Decode a Postgres control file, print its debug representation, and print
/// the control file's `checkPoint` LSN both raw and page-aligned.
fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
    let bytes = std::fs::read(control_file_path)?;
    let control_file = ControlFileData::decode(&bytes)?;
    println!("{control_file:?}");

    let checkpoint_lsn = Lsn(control_file.checkPoint);
    println!(
        "pg_initdb_lsn: {}, aligned: {}",
        checkpoint_lsn,
        checkpoint_lsn.align()
    );
    Ok(())
}
/// Dump a layer file's contents (verbose mode) to stdout.
///
/// Initializes the virtual-file and page caches first, since layer access
/// requires them.
fn print_layerfile(path: &Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
    virtual_file::init(10);
    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    dump_layerfile_from_path(path, true, &ctx)
}
/// Print the current `TimelineMetadata` and optionally rewrite selected fields
/// taken from the parsed CLI arguments.
///
/// Each supported `--…_lsn` flag, when present, replaces the corresponding
/// field while every other field is carried over unchanged; the file is only
/// written back if at least one replacement was requested.
fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> {
    let metadata_bytes = std::fs::read(path)?;
    let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
    println!("Current metadata:\n{meta:?}");
    // Set whenever any field below is replaced; gates the final write-back.
    let mut update_meta = false;
    if let Some(disk_consistent_lsn) = arg_matches.get_one::<String>("disk_consistent_lsn") {
        // Rebuild metadata with only disk_consistent_lsn replaced.
        meta = TimelineMetadata::new(
            Lsn::from_str(disk_consistent_lsn)?,
            meta.prev_record_lsn(),
            meta.ancestor_timeline(),
            meta.ancestor_lsn(),
            meta.latest_gc_cutoff_lsn(),
            meta.initdb_lsn(),
            meta.pg_version(),
        );
        update_meta = true;
    }
    if let Some(prev_record_lsn) = arg_matches.get_one::<String>("prev_record_lsn") {
        // Rebuild metadata with only prev_record_lsn replaced.
        meta = TimelineMetadata::new(
            meta.disk_consistent_lsn(),
            Some(Lsn::from_str(prev_record_lsn)?),
            meta.ancestor_timeline(),
            meta.ancestor_lsn(),
            meta.latest_gc_cutoff_lsn(),
            meta.initdb_lsn(),
            meta.pg_version(),
        );
        update_meta = true;
    }
    if update_meta {
        let metadata_bytes = meta.to_bytes()?;
        std::fs::write(path, metadata_bytes)?;
    }
    Ok(())
}
/// Build the clap command tree: an optional bare `path` positional plus the
/// `metadata` subcommand with its optional LSN replacement flags.
fn cli() -> Command {
    Command::new("Neon Pageserver binutils")
        .about("Reads pageserver (and related) binary files management utility")
        .version(GIT_VERSION)
        .arg(
            Arg::new("path")
                .help("Input file path")
                .value_parser(value_parser!(PathBuf))
                .required(false),
        )
        .subcommand(
            Command::new(METADATA_SUBCOMMAND)
                .about("Read and update pageserver metadata file")
                .arg(
                    Arg::new("metadata_path")
                        .help("Input metadata file path")
                        .value_parser(value_parser!(PathBuf))
                        .required(false),
                )
                .arg(
                    Arg::new("disk_consistent_lsn")
                        .long("disk_consistent_lsn")
                        .help("Replace disk consistent Lsn"),
                )
                .arg(
                    Arg::new("prev_record_lsn")
                        .long("prev_record_lsn")
                        .help("Replace previous record Lsn"),
                ),
        )
}
/// Catch clap configuration errors (conflicting/invalid arg definitions) at
/// test time via clap's built-in self-check.
#[test]
fn verify_cli() {
    cli().debug_assert();
}

View File

@@ -0,0 +1,48 @@
//! The broker client instance of the pageserver, created during pageserver startup.
//! Used by each timelines' [`walreceiver`].
use crate::config::PageServerConf;
use anyhow::Context;
use once_cell::sync::OnceCell;
use storage_broker::BrokerClientChannel;
use tracing::*;
// Process-wide broker client, set exactly once by `init_broker_client`.
static BROKER_CLIENT: OnceCell<BrokerClientChannel> = OnceCell::new();
/// Initialize the broker client. This must be called once at page server startup.
///
/// Validates the configured endpoint (no connection attempt is made here) and
/// stores the client in the process-wide [`BROKER_CLIENT`] cell.
///
/// # Panics
/// Panics if called more than once.
pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
    let broker_endpoint = conf.broker_endpoint.clone();

    // Note: we do not attempt connecting here (but validate endpoints sanity).
    let broker_client =
        storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval)
            // `with_context` keeps the message lazy: the `format!` now only
            // runs on the failure path instead of allocating on every call.
            .with_context(|| {
                format!(
                    "Failed to create broker client to {}",
                    &conf.broker_endpoint
                )
            })?;
    if BROKER_CLIENT.set(broker_client).is_err() {
        panic!("broker already initialized");
    }
    info!(
        "Initialized broker client with endpoints: {}",
        broker_endpoint
    );
    Ok(())
}
///
/// Get a handle to the broker client
///
/// # Panics
/// Panics if [`init_broker_client`] has not successfully completed yet.
pub fn get_broker_client() -> &'static BrokerClientChannel {
    BROKER_CLIENT.get().expect("broker client not initialized")
}
/// Whether [`init_broker_client`] has already stored a client.
pub fn is_broker_client_initialized() -> bool {
    BROKER_CLIENT.get().is_some()
}

View File

@@ -33,8 +33,7 @@ use crate::tenant::config::TenantConf;
use crate::tenant::config::TenantConfOpt; use crate::tenant::config::TenantConfOpt;
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME}; use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
use crate::{ use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
TIMELINE_UNINIT_MARK_SUFFIX,
}; };
pub mod defaults { pub mod defaults {
@@ -64,7 +63,6 @@ pub mod defaults {
pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour"; pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
/// ///
/// Default built-in configuration file. /// Default built-in configuration file.
@@ -93,16 +91,15 @@ pub mod defaults {
#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}' # [tenant_config]
[tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes #compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
#compaction_period = '{DEFAULT_COMPACTION_PERIOD}' #compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD} #compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}'
#gc_period = '{DEFAULT_GC_PERIOD}' #gc_period = '{DEFAULT_GC_PERIOD}'
#gc_horizon = {DEFAULT_GC_HORIZON} #gc_horizon = {DEFAULT_GC_HORIZON}
@@ -111,9 +108,8 @@ pub mod defaults {
#min_resident_size_override = .. # in bytes #min_resident_size_override = .. # in bytes
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
#gc_feedback = false
[remote_storage] # [remote_storage]
"### "###
); );
@@ -172,13 +168,11 @@ pub struct PageServerConf {
pub log_format: LogFormat, pub log_format: LogFormat,
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed. /// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`. /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
/// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`. /// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
/// See the comment in `eviction_task` for details. /// See the comment in `eviction_task` for details.
///
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore, pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,
// How often to collect metrics and send them to the metrics endpoint. // How often to collect metrics and send them to the metrics endpoint.
@@ -193,15 +187,6 @@ pub struct PageServerConf {
pub test_remote_failures: u64, pub test_remote_failures: u64,
pub ondemand_download_behavior_treat_error_as_warn: bool, pub ondemand_download_behavior_treat_error_as_warn: bool,
/// How long will background tasks be delayed at most after initial load of tenants.
///
/// Our largest initialization completions are in the range of 100-200s, so perhaps 10s works
/// as we now isolate initial loading, initial logical size calculation and background tasks.
/// Smaller nodes will have background tasks "not running" for this long unless every timeline
/// has it's initial logical size calculated. Not running background tasks for some seconds is
/// not terrible.
pub background_task_maximum_delay: Duration,
} }
/// We do not want to store this in a PageServerConf because the latter may be logged /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -274,8 +259,6 @@ struct PageServerConfigBuilder {
test_remote_failures: BuilderValue<u64>, test_remote_failures: BuilderValue<u64>,
ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>, ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
background_task_maximum_delay: BuilderValue<Duration>,
} }
impl Default for PageServerConfigBuilder { impl Default for PageServerConfigBuilder {
@@ -333,11 +316,6 @@ impl Default for PageServerConfigBuilder {
test_remote_failures: Set(0), test_remote_failures: Set(0),
ondemand_download_behavior_treat_error_as_warn: Set(false), ondemand_download_behavior_treat_error_as_warn: Set(false),
background_task_maximum_delay: Set(humantime::parse_duration(
DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
)
.unwrap()),
} }
} }
} }
@@ -462,10 +440,6 @@ impl PageServerConfigBuilder {
BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn); BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn);
} }
pub fn background_task_maximum_delay(&mut self, delay: Duration) {
self.background_task_maximum_delay = BuilderValue::Set(delay);
}
pub fn build(self) -> anyhow::Result<PageServerConf> { pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_size_logical_size_queries = self let concurrent_tenant_size_logical_size_queries = self
.concurrent_tenant_size_logical_size_queries .concurrent_tenant_size_logical_size_queries
@@ -548,9 +522,6 @@ impl PageServerConfigBuilder {
.ok_or(anyhow!( .ok_or(anyhow!(
"missing ondemand_download_behavior_treat_error_as_warn" "missing ondemand_download_behavior_treat_error_as_warn"
))?, ))?,
background_task_maximum_delay: self
.background_task_maximum_delay
.ok_or(anyhow!("missing background_task_maximum_delay"))?,
}) })
} }
} }
@@ -573,21 +544,21 @@ impl PageServerConf {
.join(TENANT_ATTACHING_MARKER_FILENAME) .join(TENANT_ATTACHING_MARKER_FILENAME)
} }
pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf { pub fn tenant_ignore_mark_file_path(&self, tenant_id: TenantId) -> PathBuf {
self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME) self.tenant_path(&tenant_id).join(IGNORED_TENANT_FILE_NAME)
} }
/// Points to a place in pageserver's local directory, /// Points to a place in pageserver's local directory,
/// where certain tenant's tenantconf file should be located. /// where certain tenant's tenantconf file should be located.
pub fn tenant_config_path(&self, tenant_id: &TenantId) -> PathBuf { pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf {
self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME) self.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME)
} }
pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf { pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf {
self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME) self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
} }
pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf { pub fn timeline_path(&self, timeline_id: &TimelineId, tenant_id: &TenantId) -> PathBuf {
self.timelines_path(tenant_id).join(timeline_id.to_string()) self.timelines_path(tenant_id).join(timeline_id.to_string())
} }
@@ -597,22 +568,11 @@ impl PageServerConf {
timeline_id: TimelineId, timeline_id: TimelineId,
) -> PathBuf { ) -> PathBuf {
path_with_suffix_extension( path_with_suffix_extension(
self.timeline_path(&tenant_id, &timeline_id), self.timeline_path(&timeline_id, &tenant_id),
TIMELINE_UNINIT_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
) )
} }
pub fn timeline_delete_mark_file_path(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> PathBuf {
path_with_suffix_extension(
self.timeline_path(&tenant_id, &timeline_id),
TIMELINE_DELETE_MARK_SUFFIX,
)
}
pub fn traces_path(&self) -> PathBuf { pub fn traces_path(&self) -> PathBuf {
self.workdir.join("traces") self.workdir.join("traces")
} }
@@ -631,8 +591,8 @@ impl PageServerConf {
/// Points to a place in pageserver's local directory, /// Points to a place in pageserver's local directory,
/// where certain timeline's metadata file should be located. /// where certain timeline's metadata file should be located.
pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf { pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
self.timeline_path(tenant_id, timeline_id) self.timeline_path(&timeline_id, &tenant_id)
.join(METADATA_FILE_NAME) .join(METADATA_FILE_NAME)
} }
@@ -750,7 +710,6 @@ impl PageServerConf {
) )
}, },
"ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
"background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
_ => bail!("unrecognized pageserver option '{key}'"), _ => bail!("unrecognized pageserver option '{key}'"),
} }
} }
@@ -838,8 +797,7 @@ impl PageServerConf {
)?); )?);
} }
if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") { if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
t_conf.max_lsn_wal_lag = t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?);
Some(deserialize_from_item("max_lsn_wal_lag", max_lsn_wal_lag)?);
} }
if let Some(trace_read_requests) = item.get("trace_read_requests") { if let Some(trace_read_requests) = item.get("trace_read_requests") {
t_conf.trace_read_requests = t_conf.trace_read_requests =
@@ -869,14 +827,6 @@ impl PageServerConf {
)?); )?);
} }
if let Some(gc_feedback) = item.get("gc_feedback") {
t_conf.gc_feedback = Some(
gc_feedback
.as_bool()
.with_context(|| "configure option gc_feedback is not a bool".to_string())?,
);
}
Ok(t_conf) Ok(t_conf)
} }
@@ -918,7 +868,6 @@ impl PageServerConf {
disk_usage_based_eviction: None, disk_usage_based_eviction: None,
test_remote_failures: 0, test_remote_failures: 0,
ondemand_download_behavior_treat_error_as_warn: false, ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: Duration::ZERO,
} }
} }
} }
@@ -1007,8 +956,6 @@ impl ConfigurableSemaphore {
/// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
/// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will /// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
/// behave like [`futures::future::pending`], just waiting until new permits are added. /// behave like [`futures::future::pending`], just waiting until new permits are added.
///
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
pub fn new(initial_permits: NonZeroUsize) -> Self { pub fn new(initial_permits: NonZeroUsize) -> Self {
ConfigurableSemaphore { ConfigurableSemaphore {
initial_permits, initial_permits,
@@ -1080,7 +1027,6 @@ metric_collection_endpoint = 'http://localhost:80/metrics'
synthetic_size_calculation_interval = '333 s' synthetic_size_calculation_interval = '333 s'
log_format = 'json' log_format = 'json'
background_task_maximum_delay = '334 s'
"#; "#;
@@ -1139,9 +1085,6 @@ background_task_maximum_delay = '334 s'
disk_usage_based_eviction: None, disk_usage_based_eviction: None,
test_remote_failures: 0, test_remote_failures: 0,
ondemand_download_behavior_treat_error_as_warn: false, ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: humantime::parse_duration(
defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
)?,
}, },
"Correct defaults should be used when no config values are provided" "Correct defaults should be used when no config values are provided"
); );
@@ -1196,7 +1139,6 @@ background_task_maximum_delay = '334 s'
disk_usage_based_eviction: None, disk_usage_based_eviction: None,
test_remote_failures: 0, test_remote_failures: 0,
ondemand_download_behavior_treat_error_as_warn: false, ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: Duration::from_secs(334),
}, },
"Should be able to parse all basic config values correctly" "Should be able to parse all basic config values correctly"
); );

Some files were not shown because too many files have changed in this diff Show More