Merge branch 'main' into ps-trace
@@ -1,5 +1,6 @@
|
||||
*
|
||||
|
||||
!rust-toolchain.toml
|
||||
!Cargo.toml
|
||||
!Cargo.lock
|
||||
!Makefile
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/epic-template.md
vendored
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: Epic Template
|
||||
about: A set of related tasks contributing towards specific outcome, comprizing of
|
||||
about: A set of related tasks contributing towards specific outcome, comprising of
|
||||
more than 1 week of work.
|
||||
title: 'Epic: '
|
||||
labels: t/Epic
|
||||
|
||||
81
.github/actions/neon-project-create/action.yml
vendored
Normal file
@@ -0,0 +1,81 @@
|
||||
name: 'Create Neon Project'
|
||||
description: 'Create Neon Project using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
desctiption: 'Neon API key'
|
||||
required: true
|
||||
environment:
|
||||
desctiption: 'dev (aka captest) or stage'
|
||||
required: true
|
||||
region_id:
|
||||
desctiption: 'Region ID, if not set the project will be created in the default region'
|
||||
required: false
|
||||
outputs:
|
||||
dsn:
|
||||
description: 'Created Project DSN (for main database)'
|
||||
value: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
project_id:
|
||||
description: 'Created Project ID'
|
||||
value: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Parse Input
|
||||
id: parse-input
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
case "${ENVIRONMENT}" in
|
||||
dev)
|
||||
API_HOST=console.dev.neon.tech
|
||||
REGION_ID=${REGION_ID:-eu-west-1}
|
||||
;;
|
||||
staging)
|
||||
API_HOST=console.stage.neon.tech
|
||||
REGION_ID=${REGION_ID:-us-east-1}
|
||||
;;
|
||||
*)
|
||||
echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "::set-output name=api_host::${API_HOST}"
|
||||
echo "::set-output name=region_id::${REGION_ID}"
|
||||
env:
|
||||
ENVIRONMENT: ${{ inputs.environment }}
|
||||
REGION_ID: ${{ inputs.region_id }}
|
||||
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project
|
||||
# A shell without `set -x` to not to expose password/dsn in logs
|
||||
shell: bash -euo pipefail {0}
|
||||
run: |
|
||||
project=$(curl \
|
||||
"https://${API_HOST}/api/v1/projects" \
|
||||
--fail \
|
||||
--header "Accept: application/json" \
|
||||
--header "Content-Type: application/json" \
|
||||
--header "Authorization: Bearer ${API_KEY}" \
|
||||
--data "{
|
||||
\"project\": {
|
||||
\"platform_id\": \"aws\",
|
||||
\"region_id\": \"${REGION_ID}\",
|
||||
\"settings\": { }
|
||||
}
|
||||
}")
|
||||
|
||||
# Mask password
|
||||
echo "::add-mask::$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .password')"
|
||||
|
||||
dsn=$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .dsn')/main
|
||||
echo "::add-mask::${dsn}"
|
||||
echo "::set-output name=dsn::${dsn}"
|
||||
|
||||
project_id=$(echo $project | jq --raw-output '.id')
|
||||
echo "::set-output name=project_id::${project_id}"
|
||||
env:
|
||||
API_KEY: ${{ inputs.api_key }}
|
||||
API_HOST: ${{ steps.parse-input.outputs.api_host }}
|
||||
REGION_ID: ${{ steps.parse-input.outputs.region_id }}
|
||||
54
.github/actions/neon-project-delete/action.yml
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
name: 'Delete Neon Project'
|
||||
description: 'Delete Neon Project using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
desctiption: 'Neon API key'
|
||||
required: true
|
||||
environment:
|
||||
desctiption: 'dev (aka captest) or stage'
|
||||
required: true
|
||||
project_id:
|
||||
desctiption: 'ID of the Project to delete'
|
||||
required: true
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Parse Input
|
||||
id: parse-input
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
case "${ENVIRONMENT}" in
|
||||
dev)
|
||||
API_HOST=console.dev.neon.tech
|
||||
;;
|
||||
staging)
|
||||
API_HOST=console.stage.neon.tech
|
||||
;;
|
||||
*)
|
||||
echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "::set-output name=api_host::${API_HOST}"
|
||||
env:
|
||||
ENVIRONMENT: ${{ inputs.environment }}
|
||||
|
||||
- name: Delete Neon Project
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# Allow PROJECT_ID to be empty/null for cases when .github/actions/neon-project-create failed
|
||||
if [ -n "${PROJECT_ID}" ]; then
|
||||
curl -X "POST" \
|
||||
"https://${API_HOST}/api/v1/projects/${PROJECT_ID}/delete" \
|
||||
--fail \
|
||||
--header "Accept: application/json" \
|
||||
--header "Content-Type: application/json" \
|
||||
--header "Authorization: Bearer ${API_KEY}"
|
||||
fi
|
||||
env:
|
||||
API_KEY: ${{ inputs.api_key }}
|
||||
PROJECT_ID: ${{ inputs.project_id }}
|
||||
API_HOST: ${{ steps.parse-input.outputs.api_host }}
|
||||
@@ -5,9 +5,6 @@ inputs:
|
||||
build_type:
|
||||
description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug", or "remote" for the remote cluster'
|
||||
required: true
|
||||
rust_toolchain:
|
||||
description: 'Rust toolchain version to fetch the caches'
|
||||
required: false
|
||||
test_selection:
|
||||
description: 'A python test suite to run'
|
||||
required: true
|
||||
@@ -55,7 +52,7 @@ runs:
|
||||
if: inputs.build_type != 'remote'
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact
|
||||
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
|
||||
path: /tmp/neon
|
||||
|
||||
- name: Checkout
|
||||
|
||||
1
.github/helm-values/neon-stress.proxy.yaml
vendored
@@ -1,6 +1,7 @@
|
||||
fullnameOverride: "neon-stress-proxy"
|
||||
|
||||
settings:
|
||||
authBackend: "link"
|
||||
authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.dev.neon.tech/psql_session/"
|
||||
|
||||
|
||||
1
.github/helm-values/production.proxy.yaml
vendored
@@ -1,4 +1,5 @@
|
||||
settings:
|
||||
authBackend: "link"
|
||||
authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.neon.tech/psql_session/"
|
||||
|
||||
|
||||
1
.github/helm-values/staging.proxy.yaml
vendored
@@ -5,6 +5,7 @@ image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "link"
|
||||
authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.stage.neon.tech/psql_session/"
|
||||
|
||||
|
||||
120
.github/workflows/benchmarking.yml
vendored
@@ -14,6 +14,13 @@ on:
|
||||
- cron: '36 4 * * *' # run once a day, timezone is utc
|
||||
|
||||
workflow_dispatch: # adds ability to run this manually
|
||||
inputs:
|
||||
environment:
|
||||
description: 'Environment to run remote tests on (dev or staging)'
|
||||
required: false
|
||||
region_id:
|
||||
description: 'Use a particular region. If empty the default one will be used'
|
||||
false: true
|
||||
|
||||
defaults:
|
||||
run:
|
||||
@@ -62,19 +69,12 @@ jobs:
|
||||
echo Pgbench
|
||||
$POSTGRES_DISTRIB_DIR/bin/pgbench --version
|
||||
|
||||
# FIXME cluster setup is skipped due to various changes in console API
|
||||
# for now pre created cluster is used. When API gain some stability
|
||||
# after massive changes dynamic cluster setup will be revived.
|
||||
# So use pre created cluster. It needs to be started manually, but stop is automatic after 5 minutes of inactivity
|
||||
- name: Setup cluster
|
||||
env:
|
||||
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
|
||||
run: |
|
||||
set -e
|
||||
|
||||
echo "Starting cluster"
|
||||
# wake up the cluster
|
||||
$POSTGRES_DISTRIB_DIR/bin/psql $BENCHMARK_CONNSTR -c "SELECT 1"
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
environment: ${{ github.event.inputs.environment || 'staging' }}
|
||||
api_key: ${{ ( github.event.inputs.environment || 'staging' ) == 'staging' && secrets.NEON_STAGING_API_KEY || secrets.NEON_CAPTEST_API_KEY }}
|
||||
|
||||
- name: Run benchmark
|
||||
# pgbench is installed system wide from official repo
|
||||
@@ -97,7 +97,7 @@ jobs:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
|
||||
PLATFORM: "neon-staging"
|
||||
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
|
||||
run: |
|
||||
# just to be sure that no data was cached on self hosted runner
|
||||
@@ -115,6 +115,14 @@ jobs:
|
||||
run: |
|
||||
REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: ${{ always() }}
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
environment: staging
|
||||
project_id: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
@@ -131,11 +139,14 @@ jobs:
|
||||
POSTGRES_DISTRIB_DIR: /usr
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: true
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
connstr: [ BENCHMARK_CAPTEST_CONNSTR, BENCHMARK_RDS_CONNSTR ]
|
||||
# neon-captest: Run pgbench, reusing existing project
|
||||
# neon-captest-new: Same, but on a freshly created project
|
||||
platform: [ neon-captest, neon-captest-new, rds-aurora ]
|
||||
|
||||
runs-on: dev
|
||||
container:
|
||||
@@ -147,38 +158,55 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Calculate platform
|
||||
id: calculate-platform
|
||||
env:
|
||||
CONNSTR: ${{ matrix.connstr }}
|
||||
run: |
|
||||
if [ "${CONNSTR}" = "BENCHMARK_CAPTEST_CONNSTR" ]; then
|
||||
PLATFORM=neon-captest
|
||||
elif [ "${CONNSTR}" = "BENCHMARK_RDS_CONNSTR" ]; then
|
||||
PLATFORM=rds-aurora
|
||||
else
|
||||
echo 2>&1 "Unknown CONNSTR=${CONNSTR}. Allowed are BENCHMARK_CAPTEST_CONNSTR, and BENCHMARK_RDS_CONNSTR only"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "::set-output name=PLATFORM::${PLATFORM}"
|
||||
|
||||
- name: Install Deps
|
||||
run: |
|
||||
sudo apt -y update
|
||||
sudo apt install -y postgresql-14
|
||||
|
||||
- name: Create Neon Project
|
||||
if: matrix.platform == 'neon-captest-new'
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
environment: ${{ github.event.inputs.environment || 'dev' }}
|
||||
api_key: ${{ ( github.event.inputs.environment || 'dev' ) == 'staging' && secrets.NEON_STAGING_API_KEY || secrets.NEON_CAPTEST_API_KEY }}
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon-captest)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
|
||||
;;
|
||||
neon-captest-new)
|
||||
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
|
||||
;;
|
||||
rds-aurora)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest', 'neon-captest-new' or 'rds-aurora'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "::set-output name=connstr::${CONNSTR}"
|
||||
|
||||
psql ${CONNSTR} -c "SELECT version();"
|
||||
env:
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
|
||||
- name: Benchmark init
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
|
||||
env:
|
||||
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
|
||||
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
@@ -188,25 +216,25 @@ jobs:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
|
||||
env:
|
||||
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
|
||||
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
- name: Benchmark simple-update
|
||||
- name: Benchmark select-only
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
|
||||
env:
|
||||
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
|
||||
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
@@ -216,11 +244,19 @@ jobs:
|
||||
action: generate
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: ${{ matrix.platform == 'neon-captest-new' && always() }}
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
environment: dev
|
||||
project_id: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_CAPTEST_API_KEY }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
33
.github/workflows/build_and_test.yml
vendored
@@ -54,7 +54,6 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug, release ]
|
||||
rust_toolchain: [ 1.58 ]
|
||||
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build_type }}
|
||||
@@ -100,11 +99,11 @@ jobs:
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
|
||||
CARGO_FEATURES=""
|
||||
CARGO_FLAGS="--locked"
|
||||
CARGO_FLAGS="--locked --timings"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FEATURES="--features profiling"
|
||||
CARGO_FLAGS="--locked --release $CARGO_FEATURES"
|
||||
CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES"
|
||||
fi
|
||||
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
|
||||
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
|
||||
@@ -126,8 +125,8 @@ jobs:
|
||||
target/
|
||||
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
|
||||
key: |
|
||||
v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
|
||||
v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
|
||||
v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
|
||||
v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
@@ -215,9 +214,20 @@ jobs:
|
||||
- name: Upload Neon artifact
|
||||
uses: ./.github/actions/upload
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
|
||||
name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact
|
||||
path: /tmp/neon
|
||||
|
||||
- name: Prepare cargo build timing stats for storing
|
||||
run: |
|
||||
mkdir -p "/tmp/neon/cargo-timings/$BUILD_TYPE/"
|
||||
cp -r ./target/cargo-timings/* "/tmp/neon/cargo-timings/$BUILD_TYPE/"
|
||||
shell: bash -euxo pipefail {0}
|
||||
- name: Upload cargo build stats
|
||||
uses: ./.github/actions/upload
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ matrix.build_type }}-build-stats
|
||||
path: /tmp/neon/cargo-timings/
|
||||
|
||||
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
|
||||
- name: Merge and upload coverage data
|
||||
if: matrix.build_type == 'debug'
|
||||
@@ -233,7 +243,6 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug, release ]
|
||||
rust_toolchain: [ 1.58 ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -245,7 +254,6 @@ jobs:
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ matrix.build_type }}
|
||||
rust_toolchain: ${{ matrix.rust_toolchain }}
|
||||
test_selection: regress
|
||||
needs_postgres_source: true
|
||||
run_with_real_s3: true
|
||||
@@ -269,7 +277,6 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ release ]
|
||||
rust_toolchain: [ 1.58 ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -281,7 +288,6 @@ jobs:
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ matrix.build_type }}
|
||||
rust_toolchain: ${{ matrix.rust_toolchain }}
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
@@ -341,7 +347,6 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug ]
|
||||
rust_toolchain: [ 1.58 ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -358,12 +363,12 @@ jobs:
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git/
|
||||
target/
|
||||
key: v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
|
||||
key: v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
|
||||
|
||||
- name: Get Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
|
||||
name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact
|
||||
path: /tmp/neon
|
||||
|
||||
- name: Get coverage artifact
|
||||
@@ -412,7 +417,7 @@ jobs:
|
||||
trigger-e2e-tests:
|
||||
runs-on: dev
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
options: --init
|
||||
needs: [ build-neon ]
|
||||
steps:
|
||||
|
||||
71
.github/workflows/codestyle.yml
vendored
@@ -24,12 +24,12 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
# If we want to duplicate this job for different
|
||||
# Rust toolchains (e.g. nightly or 1.37.0), add them here.
|
||||
rust_toolchain: [1.58]
|
||||
# XXX: both OSes have rustup
|
||||
# * https://github.com/actions/runner-images/blob/main/images/macos/macos-12-Readme.md#rust-tools
|
||||
# * https://github.com/actions/runner-images/blob/main/images/linux/Ubuntu2204-Readme.md#rust-tools
|
||||
# this is all we need to install our toolchain later via rust-toolchain.toml
|
||||
# so don't install any toolchain explicitly.
|
||||
os: [ubuntu-latest, macos-latest]
|
||||
# To support several Postgres versions, add them here.
|
||||
postgres_version: [v14, v15]
|
||||
timeout-minutes: 60
|
||||
name: check codestyle rust and postgres
|
||||
runs-on: ${{ matrix.os }}
|
||||
@@ -41,14 +41,6 @@ jobs:
|
||||
submodules: true
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Install rust toolchain ${{ matrix.rust_toolchain }}
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
profile: minimal
|
||||
toolchain: ${{ matrix.rust_toolchain }}
|
||||
components: rustfmt, clippy
|
||||
override: true
|
||||
|
||||
- name: Check formatting
|
||||
run: cargo fmt --all -- --check
|
||||
|
||||
@@ -62,17 +54,29 @@ jobs:
|
||||
if: matrix.os == 'macos-latest'
|
||||
run: brew install flex bison openssl
|
||||
|
||||
- name: Set pg revision for caching
|
||||
id: pg_ver
|
||||
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-${{matrix.postgres_version}})
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14)
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Cache postgres ${{matrix.postgres_version}} build
|
||||
id: cache_pg
|
||||
- name: Set pg 15 revision for caching
|
||||
id: pg_v15_rev
|
||||
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v15)
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: |
|
||||
pg_install/${{matrix.postgres_version}}
|
||||
key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }}
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Set extra env for macOS
|
||||
if: matrix.os == 'macos-latest'
|
||||
@@ -80,24 +84,19 @@ jobs:
|
||||
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
|
||||
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
|
||||
|
||||
- name: Build postgres
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: make postgres
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
run: make postgres-v14
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Build postgres v15
|
||||
if: steps.cache_pg_15.outputs.cache-hit != 'true'
|
||||
run: make postgres-v15
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Build neon extensions
|
||||
run: make neon-pg-ext
|
||||
|
||||
# Plain configure output can contain weird errors like 'error: C compiler cannot create executables'
|
||||
# and the real cause will be inside config.log
|
||||
- name: Print configure logs in case of failure
|
||||
if: failure()
|
||||
continue-on-error: true
|
||||
run: |
|
||||
echo '' && echo '=== Postgres ${{matrix.postgres_version}} config.log ===' && echo ''
|
||||
cat pg_install/build/${{matrix.postgres_version}}/config.log
|
||||
echo '' && echo '=== Postgres ${{matrix.postgres_version}} configure.log ===' && echo ''
|
||||
cat pg_install/build/${{matrix.postgres_version}}/configure.log
|
||||
|
||||
- name: Cache cargo deps
|
||||
id: cache_cargo
|
||||
uses: actions/cache@v3
|
||||
@@ -107,7 +106,7 @@ jobs:
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git
|
||||
target
|
||||
key: v3-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}
|
||||
key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
|
||||
|
||||
- name: Run cargo clippy
|
||||
run: ./run_clippy.sh
|
||||
|
||||
18
.github/workflows/pg_clients.yml
vendored
@@ -47,11 +47,17 @@ jobs:
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
environment: staging
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Run pytest
|
||||
env:
|
||||
REMOTE_ENV: 1
|
||||
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
|
||||
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install/v14
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
@@ -65,6 +71,14 @@ jobs:
|
||||
-m "remote_cluster" \
|
||||
-rA "test_runner/pg_clients"
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: ${{ always() }}
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
environment: staging
|
||||
project_id: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
# We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI.
|
||||
# It will be fixed after switching to gen2 runner
|
||||
- name: Upload python test logs
|
||||
|
||||
29
Cargo.lock
generated
@@ -183,9 +183,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "axum-core"
|
||||
version = "0.2.7"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e4f44a0e6200e9d11a1cdc989e4b358f6e3d354fbf48478f345a17f4e43f8635"
|
||||
checksum = "d9f0c0a60006f2a293d82d571f635042a72edf927539b7685bd62d361963839b"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
@@ -193,6 +193,8 @@ dependencies = [
|
||||
"http",
|
||||
"http-body",
|
||||
"mime",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1831,6 +1833,8 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
@@ -1871,6 +1875,7 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"url",
|
||||
@@ -2045,7 +2050,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.2"
|
||||
source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -2058,7 +2063,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"byteorder",
|
||||
@@ -2076,7 +2081,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.3"
|
||||
source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -3292,7 +3297,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.6"
|
||||
source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -3481,9 +3486,9 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.34"
|
||||
version = "0.1.36"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d0ecdcb44a79f0fe9844f0c4f33a342cbcbb5117de8001e6ba0dc2351327d09"
|
||||
checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"log",
|
||||
@@ -3505,11 +3510,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.26"
|
||||
version = "0.1.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f54c8ca710e81886d498c2fd3331b56c93aa248d49de2222ad2742247c60072f"
|
||||
checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"once_cell",
|
||||
"valuable",
|
||||
]
|
||||
|
||||
@@ -3626,6 +3631,7 @@ name = "utils"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
@@ -3653,6 +3659,7 @@ dependencies = [
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"workspace_hack",
|
||||
|
||||
@@ -70,4 +70,4 @@ lto = true
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
[patch.crates-io]
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
|
||||
16
Dockerfile
@@ -22,7 +22,7 @@ RUN set -e \
|
||||
&& rm -rf pg_install/v15/build \
|
||||
&& tar -C pg_install/v14 -czf /home/nonroot/postgres_install.tar.gz .
|
||||
|
||||
# Build zenith binaries
|
||||
# Build neon binaries
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS build
|
||||
WORKDIR /home/nonroot
|
||||
ARG GIT_VERSION=local
|
||||
@@ -60,12 +60,12 @@ RUN set -e \
|
||||
openssl \
|
||||
ca-certificates \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
|
||||
&& useradd -d /data zenith \
|
||||
&& chown -R zenith:zenith /data
|
||||
&& useradd -d /data neon \
|
||||
&& chown -R neon:neon /data
|
||||
|
||||
COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/pageserver /usr/local/bin
|
||||
COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||
|
||||
# v14 is default for now
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/
|
||||
@@ -73,7 +73,7 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
|
||||
|
||||
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
|
||||
# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
|
||||
RUN mkdir -p /data/.neon/ && chown -R zenith:zenith /data/.neon/ \
|
||||
RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
|
||||
&& /usr/local/bin/pageserver -D /data/.neon/ --init \
|
||||
-c "id=1234" \
|
||||
-c "broker_endpoints=['http://etcd:2379']" \
|
||||
@@ -82,7 +82,7 @@ RUN mkdir -p /data/.neon/ && chown -R zenith:zenith /data/.neon/ \
|
||||
-c "listen_http_addr='0.0.0.0:9898'"
|
||||
|
||||
VOLUME ["/data"]
|
||||
USER zenith
|
||||
USER neon
|
||||
EXPOSE 6400
|
||||
EXPOSE 9898
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@@ -22,7 +22,7 @@ FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps
|
||||
#
|
||||
# Image with Postgres build deps
|
||||
#
|
||||
FROM debian:buster-slim AS build-deps
|
||||
FROM debian:bullseye-slim AS build-deps
|
||||
|
||||
RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
|
||||
libcurl4-openssl-dev libossp-uuid-dev
|
||||
@@ -59,7 +59,7 @@ WORKDIR /pg
|
||||
#
|
||||
# Final compute node image to be exported
|
||||
#
|
||||
FROM debian:buster-slim
|
||||
FROM debian:bullseye-slim
|
||||
|
||||
# libreadline-dev is required to run psql
|
||||
RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev
|
||||
|
||||
6
Makefile
@@ -34,6 +34,12 @@ ifeq ($(UNAME_S),Darwin)
|
||||
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
|
||||
endif
|
||||
|
||||
# Use -C option so that when PostgreSQL "make install" installs the
|
||||
# headers, the mtime of the headers are not changed when there have
|
||||
# been no changes to the files. Changing the mtime triggers an
|
||||
# unnecessary rebuild of 'postgres_ffi'.
|
||||
PG_CONFIGURE_OPTS += INSTALL='install -C'
|
||||
|
||||
# Choose whether we should be silent or verbose
|
||||
CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose)
|
||||
# Fix for a corner case when make doesn't pass a jobserver
|
||||
|
||||
35
README.md
@@ -69,6 +69,17 @@ brew install libpq
|
||||
brew link --force libpq
|
||||
```
|
||||
|
||||
#### Rustc version
|
||||
|
||||
The project uses [rust toolchain file](./rust-toolchain.toml) to define the version it's built with in CI for testing and local builds.
|
||||
|
||||
This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file) that installs (if absent) and uses the toolchain version pinned in the file.
|
||||
|
||||
rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory.
|
||||
|
||||
non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file.
|
||||
Newer rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates.
|
||||
|
||||
#### Building on Linux
|
||||
|
||||
1. Build neon and patched postgres
|
||||
@@ -78,9 +89,9 @@ brew link --force libpq
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
cd neon
|
||||
|
||||
# The preferred and default is to make a debug build. This will create a
|
||||
# demonstrably slower build than a release build. If you want to use a release
|
||||
# build, utilize "BUILD_TYPE=release make -j`nproc`"
|
||||
# The preferred and default is to make a debug build. This will create a
|
||||
# demonstrably slower build than a release build. For a release build,
|
||||
# use "BUILD_TYPE=release make -j`nproc`"
|
||||
|
||||
make -j`nproc`
|
||||
```
|
||||
@@ -94,9 +105,9 @@ make -j`nproc`
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
cd neon
|
||||
|
||||
# The preferred and default is to make a debug build. This will create a
|
||||
# demonstrably slower build than a release build. If you want to use a release
|
||||
# build, utilize "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`"
|
||||
# The preferred and default is to make a debug build. This will create a
|
||||
# demonstrably slower build than a release build. For a release build,
|
||||
# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`"
|
||||
|
||||
make -j`sysctl -n hw.logicalcpu`
|
||||
```
|
||||
@@ -114,16 +125,18 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
|
||||
# Create repository in .neon with proper paths to binaries and data
|
||||
# Later that would be responsibility of a package install script
|
||||
> ./target/debug/neon_local init
|
||||
initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c
|
||||
created initial timeline de200bd42b49cc1814412c7e592dd6e9 timeline.lsn 0/16B5A50
|
||||
initial timeline de200bd42b49cc1814412c7e592dd6e9 created
|
||||
pageserver init succeeded
|
||||
Starting pageserver at '127.0.0.1:64000' in '.neon'
|
||||
|
||||
Pageserver started
|
||||
Successfully initialized timeline 7dd0907914ac399ff3be45fb252bfdb7
|
||||
Stopping pageserver gracefully...done!
|
||||
|
||||
# start pageserver and safekeeper
|
||||
> ./target/debug/neon_local start
|
||||
Starting etcd broker using /usr/bin/etcd
|
||||
Starting pageserver at '127.0.0.1:64000' in '.neon'
|
||||
|
||||
Pageserver started
|
||||
initializing for sk 1 for 7676
|
||||
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'
|
||||
Safekeeper started
|
||||
|
||||
|
||||
@@ -10,12 +10,12 @@ clap = "3.0"
|
||||
env_logger = "0.9"
|
||||
hyper = { version = "0.14", features = ["full"] }
|
||||
log = { version = "0.4", features = ["std", "serde"] }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
regex = "1"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
tar = "0.4"
|
||||
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
url = "2.2.2"
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
@@ -8,7 +8,7 @@ clap = "3.0"
|
||||
comfy-table = "5.0.1"
|
||||
git-version = "0.3.5"
|
||||
tar = "0.4.38"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "1.12.0"
|
||||
toml = "0.5"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Minimal zenith environment with one safekeeper. This is equivalent to the built-in
|
||||
# Minimal neon environment with one safekeeper. This is equivalent to the built-in
|
||||
# defaults that you get with no --config
|
||||
[pageserver]
|
||||
listen_pg_addr = '127.0.0.1:64000'
|
||||
|
||||
@@ -27,10 +27,10 @@ use std::process::exit;
|
||||
use std::str::FromStr;
|
||||
use utils::{
|
||||
auth::{Claims, Scope},
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
project_git_version,
|
||||
zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||
};
|
||||
|
||||
// Default id of a safekeeper node, if not specified on the command line.
|
||||
@@ -72,7 +72,7 @@ struct TimelineTreeEl {
|
||||
/// Name, recovered from neon config mappings
|
||||
pub name: Option<String>,
|
||||
/// Holds all direct children of this timeline referenced using `timeline_id`.
|
||||
pub children: BTreeSet<ZTimelineId>,
|
||||
pub children: BTreeSet<TimelineId>,
|
||||
}
|
||||
|
||||
// Main entry point for the 'neon_local' CLI utility
|
||||
@@ -321,7 +321,7 @@ fn main() -> Result<()> {
|
||||
///
|
||||
fn print_timelines_tree(
|
||||
timelines: Vec<TimelineInfo>,
|
||||
mut timeline_name_mappings: HashMap<ZTenantTimelineId, String>,
|
||||
mut timeline_name_mappings: HashMap<TenantTimelineId, String>,
|
||||
) -> Result<()> {
|
||||
let mut timelines_hash = timelines
|
||||
.iter()
|
||||
@@ -332,7 +332,7 @@ fn print_timelines_tree(
|
||||
info: t.clone(),
|
||||
children: BTreeSet::new(),
|
||||
name: timeline_name_mappings
|
||||
.remove(&ZTenantTimelineId::new(t.tenant_id, t.timeline_id)),
|
||||
.remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)),
|
||||
},
|
||||
)
|
||||
})
|
||||
@@ -374,7 +374,7 @@ fn print_timeline(
|
||||
nesting_level: usize,
|
||||
is_last: &[bool],
|
||||
timeline: &TimelineTreeEl,
|
||||
timelines: &HashMap<ZTimelineId, TimelineTreeEl>,
|
||||
timelines: &HashMap<TimelineId, TimelineTreeEl>,
|
||||
) -> Result<()> {
|
||||
let local_remote = match (timeline.info.local.as_ref(), timeline.info.remote.as_ref()) {
|
||||
(None, None) => unreachable!("in this case no info for a timeline is found"),
|
||||
@@ -452,8 +452,8 @@ fn print_timeline(
|
||||
/// Connects to the pageserver to query this information.
|
||||
fn get_timeline_infos(
|
||||
env: &local_env::LocalEnv,
|
||||
tenant_id: &ZTenantId,
|
||||
) -> Result<HashMap<ZTimelineId, TimelineInfo>> {
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<HashMap<TimelineId, TimelineInfo>> {
|
||||
Ok(PageServerNode::from_env(env)
|
||||
.timeline_list(tenant_id)?
|
||||
.into_iter()
|
||||
@@ -462,7 +462,7 @@ fn get_timeline_infos(
|
||||
}
|
||||
|
||||
// Helper function to parse --tenant_id option, or get the default from config file
|
||||
fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<ZTenantId> {
|
||||
fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<TenantId> {
|
||||
if let Some(tenant_id_from_arguments) = parse_tenant_id(sub_match).transpose() {
|
||||
tenant_id_from_arguments
|
||||
} else if let Some(default_id) = env.default_tenant_id {
|
||||
@@ -472,18 +472,18 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<ZTenantId>> {
|
||||
fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
|
||||
sub_match
|
||||
.value_of("tenant-id")
|
||||
.map(ZTenantId::from_str)
|
||||
.map(TenantId::from_str)
|
||||
.transpose()
|
||||
.context("Failed to parse tenant id from the argument string")
|
||||
}
|
||||
|
||||
fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<ZTimelineId>> {
|
||||
fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId>> {
|
||||
sub_match
|
||||
.value_of("timeline-id")
|
||||
.map(ZTimelineId::from_str)
|
||||
.map(TimelineId::from_str)
|
||||
.transpose()
|
||||
.context("Failed to parse timeline id from the argument string")
|
||||
}
|
||||
@@ -504,9 +504,9 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||
let mut env =
|
||||
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
|
||||
env.init().context("Failed to initialize neon repository")?;
|
||||
|
||||
// default_tenantid was generated by the `env.init()` call above
|
||||
let initial_tenant_id = env.default_tenant_id.unwrap();
|
||||
let initial_tenant_id = env
|
||||
.default_tenant_id
|
||||
.expect("default_tenant_id should be generated by the `env.init()` call above");
|
||||
|
||||
// Initialize pageserver, create initial tenant and timeline.
|
||||
let pageserver = PageServerNode::from_env(&env);
|
||||
@@ -543,13 +543,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
match tenant_match.subcommand() {
|
||||
Some(("list", _)) => {
|
||||
for t in pageserver.tenant_list()? {
|
||||
println!(
|
||||
"{} {}",
|
||||
t.id,
|
||||
t.state
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| String::from(""))
|
||||
);
|
||||
println!("{} {:?}", t.id, t.state);
|
||||
}
|
||||
}
|
||||
Some(("create", create_match)) => {
|
||||
@@ -765,7 +759,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
};
|
||||
|
||||
let branch_name = timeline_name_mappings
|
||||
.get(&ZTenantTimelineId::new(tenant_id, node.timeline_id))
|
||||
.get(&TenantTimelineId::new(tenant_id, node.timeline_id))
|
||||
.map(|name| name.as_str())
|
||||
.unwrap_or("?");
|
||||
|
||||
@@ -816,7 +810,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
|
||||
let node = cplane.nodes.get(&(tenant_id, node_name.to_owned()));
|
||||
|
||||
let auth_token = if matches!(env.pageserver.auth_type, AuthType::ZenithJWT) {
|
||||
let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) {
|
||||
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
|
||||
|
||||
Some(env.generate_auth_token(&claims)?)
|
||||
|
||||
@@ -13,9 +13,9 @@ use std::time::Duration;
|
||||
use anyhow::{Context, Result};
|
||||
use utils::{
|
||||
connstring::connection_host_port,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
@@ -28,7 +28,7 @@ use crate::storage::PageServerNode;
|
||||
pub struct ComputeControlPlane {
|
||||
base_port: u16,
|
||||
pageserver: Arc<PageServerNode>,
|
||||
pub nodes: BTreeMap<(ZTenantId, String), Arc<PostgresNode>>,
|
||||
pub nodes: BTreeMap<(TenantId, String), Arc<PostgresNode>>,
|
||||
env: LocalEnv,
|
||||
}
|
||||
|
||||
@@ -76,9 +76,9 @@ impl ComputeControlPlane {
|
||||
|
||||
pub fn new_node(
|
||||
&mut self,
|
||||
tenant_id: ZTenantId,
|
||||
tenant_id: TenantId,
|
||||
name: &str,
|
||||
timeline_id: ZTimelineId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Option<Lsn>,
|
||||
port: Option<u16>,
|
||||
) -> Result<Arc<PostgresNode>> {
|
||||
@@ -114,9 +114,9 @@ pub struct PostgresNode {
|
||||
pub env: LocalEnv,
|
||||
pageserver: Arc<PageServerNode>,
|
||||
is_test: bool,
|
||||
pub timeline_id: ZTimelineId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
|
||||
pub tenant_id: ZTenantId,
|
||||
pub tenant_id: TenantId,
|
||||
uses_wal_proposer: bool,
|
||||
}
|
||||
|
||||
@@ -148,8 +148,8 @@ impl PostgresNode {
|
||||
// Read a few options from the config file
|
||||
let context = format!("in config file {}", cfg_path_str);
|
||||
let port: u16 = conf.parse_field("port", &context)?;
|
||||
let timeline_id: ZTimelineId = conf.parse_field("neon.timeline_id", &context)?;
|
||||
let tenant_id: ZTenantId = conf.parse_field("neon.tenant_id", &context)?;
|
||||
let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?;
|
||||
let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?;
|
||||
let uses_wal_proposer = conf.get("neon.safekeepers").is_some();
|
||||
|
||||
// parse recovery_target_lsn, if any
|
||||
@@ -292,7 +292,7 @@ impl PostgresNode {
|
||||
// variable during compute pg startup. It is done this way because
|
||||
// otherwise user will be able to retrieve the value using SHOW
|
||||
// command or pg_settings
|
||||
let password = if let AuthType::ZenithJWT = auth_type {
|
||||
let password = if let AuthType::NeonJWT = auth_type {
|
||||
"$ZENITH_AUTH_TOKEN"
|
||||
} else {
|
||||
""
|
||||
@@ -301,7 +301,7 @@ impl PostgresNode {
|
||||
// Also note that not all parameters are supported here. Because in compute we substitute $ZENITH_AUTH_TOKEN
|
||||
// We parse this string and build it back with token from env var, and for simplicity rebuild
|
||||
// uses only needed variables namely host, port, user, password.
|
||||
format!("postgresql://no_user:{}@{}:{}", password, host, port)
|
||||
format!("postgresql://no_user:{password}@{host}:{port}")
|
||||
};
|
||||
conf.append("shared_preload_libraries", "neon");
|
||||
conf.append_line("");
|
||||
|
||||
@@ -14,8 +14,8 @@ use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use utils::{
|
||||
auth::{encode_from_key_file, Claims, Scope},
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
postgres_backend::AuthType,
|
||||
zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
@@ -48,13 +48,13 @@ pub struct LocalEnv {
|
||||
|
||||
// Path to pageserver binary.
|
||||
#[serde(default)]
|
||||
pub zenith_distrib_dir: PathBuf,
|
||||
pub neon_distrib_dir: PathBuf,
|
||||
|
||||
// Default tenant ID to use with the 'zenith' command line utility, when
|
||||
// --tenantid is not explicitly specified.
|
||||
// Default tenant ID to use with the 'neon_local' command line utility, when
|
||||
// --tenant_id is not explicitly specified.
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub default_tenant_id: Option<ZTenantId>,
|
||||
pub default_tenant_id: Option<TenantId>,
|
||||
|
||||
// used to issue tokens during e.g pg start
|
||||
#[serde(default)]
|
||||
@@ -69,11 +69,11 @@ pub struct LocalEnv {
|
||||
|
||||
/// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
|
||||
#[serde(default)]
|
||||
// A `HashMap<String, HashMap<ZTenantId, ZTimelineId>>` would be more appropriate here,
|
||||
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
|
||||
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
|
||||
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
|
||||
#[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
|
||||
branch_name_mappings: HashMap<String, Vec<(ZTenantId, ZTimelineId)>>,
|
||||
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
|
||||
}
|
||||
|
||||
/// Etcd broker config for cluster internal communication.
|
||||
@@ -204,20 +204,20 @@ impl LocalEnv {
|
||||
}
|
||||
|
||||
pub fn pageserver_bin(&self) -> anyhow::Result<PathBuf> {
|
||||
Ok(self.zenith_distrib_dir.join("pageserver"))
|
||||
Ok(self.neon_distrib_dir.join("pageserver"))
|
||||
}
|
||||
|
||||
pub fn safekeeper_bin(&self) -> anyhow::Result<PathBuf> {
|
||||
Ok(self.zenith_distrib_dir.join("safekeeper"))
|
||||
Ok(self.neon_distrib_dir.join("safekeeper"))
|
||||
}
|
||||
|
||||
pub fn pg_data_dirs_path(&self) -> PathBuf {
|
||||
self.base_data_dir.join("pgdatadirs").join("tenants")
|
||||
}
|
||||
|
||||
pub fn pg_data_dir(&self, tenantid: &ZTenantId, branch_name: &str) -> PathBuf {
|
||||
pub fn pg_data_dir(&self, tenant_id: &TenantId, branch_name: &str) -> PathBuf {
|
||||
self.pg_data_dirs_path()
|
||||
.join(tenantid.to_string())
|
||||
.join(tenant_id.to_string())
|
||||
.join(branch_name)
|
||||
}
|
||||
|
||||
@@ -233,8 +233,8 @@ impl LocalEnv {
|
||||
pub fn register_branch_mapping(
|
||||
&mut self,
|
||||
branch_name: String,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
let existing_values = self
|
||||
.branch_name_mappings
|
||||
@@ -260,22 +260,22 @@ impl LocalEnv {
|
||||
pub fn get_branch_timeline_id(
|
||||
&self,
|
||||
branch_name: &str,
|
||||
tenant_id: ZTenantId,
|
||||
) -> Option<ZTimelineId> {
|
||||
tenant_id: TenantId,
|
||||
) -> Option<TimelineId> {
|
||||
self.branch_name_mappings
|
||||
.get(branch_name)?
|
||||
.iter()
|
||||
.find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
|
||||
.map(|&(_, timeline_id)| timeline_id)
|
||||
.map(ZTimelineId::from)
|
||||
.map(TimelineId::from)
|
||||
}
|
||||
|
||||
pub fn timeline_name_mappings(&self) -> HashMap<ZTenantTimelineId, String> {
|
||||
pub fn timeline_name_mappings(&self) -> HashMap<TenantTimelineId, String> {
|
||||
self.branch_name_mappings
|
||||
.iter()
|
||||
.flat_map(|(name, tenant_timelines)| {
|
||||
tenant_timelines.iter().map(|&(tenant_id, timeline_id)| {
|
||||
(ZTenantTimelineId::new(tenant_id, timeline_id), name.clone())
|
||||
(TenantTimelineId::new(tenant_id, timeline_id), name.clone())
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
@@ -299,14 +299,14 @@ impl LocalEnv {
|
||||
}
|
||||
}
|
||||
|
||||
// Find zenith binaries.
|
||||
if env.zenith_distrib_dir == Path::new("") {
|
||||
env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
// Find neon binaries.
|
||||
if env.neon_distrib_dir == Path::new("") {
|
||||
env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
}
|
||||
|
||||
// If no initial tenant ID was given, generate it.
|
||||
if env.default_tenant_id.is_none() {
|
||||
env.default_tenant_id = Some(ZTenantId::generate());
|
||||
env.default_tenant_id = Some(TenantId::generate());
|
||||
}
|
||||
|
||||
env.base_data_dir = base_path();
|
||||
@@ -320,12 +320,12 @@ impl LocalEnv {
|
||||
|
||||
if !repopath.exists() {
|
||||
bail!(
|
||||
"Zenith config is not found in {}. You need to run 'neon_local init' first",
|
||||
"Neon config is not found in {}. You need to run 'neon_local init' first",
|
||||
repopath.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// TODO: check that it looks like a zenith repository
|
||||
// TODO: check that it looks like a neon repository
|
||||
|
||||
// load and parse file
|
||||
let config = fs::read_to_string(repopath.join("config"))?;
|
||||
@@ -404,10 +404,10 @@ impl LocalEnv {
|
||||
);
|
||||
}
|
||||
for binary in ["pageserver", "safekeeper"] {
|
||||
if !self.zenith_distrib_dir.join(binary).exists() {
|
||||
if !self.neon_distrib_dir.join(binary).exists() {
|
||||
bail!(
|
||||
"Can't find binary '{binary}' in zenith distrib dir '{}'",
|
||||
self.zenith_distrib_dir.display()
|
||||
"Can't find binary '{binary}' in neon distrib dir '{}'",
|
||||
self.neon_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
/// Module for parsing postgresql.conf file.
|
||||
///
|
||||
/// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
|
||||
/// enough to extract a few settings we need in Zenith, assuming you don't do
|
||||
/// enough to extract a few settings we need in Neon, assuming you don't do
|
||||
/// funny stuff like include-directives or funny escaping.
|
||||
use anyhow::{bail, Context, Result};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
@@ -17,7 +17,7 @@ use thiserror::Error;
|
||||
use utils::{
|
||||
connstring::connection_address,
|
||||
http::error::HttpErrorBody,
|
||||
zid::{NodeId, ZTenantId, ZTimelineId},
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
};
|
||||
|
||||
use crate::local_env::{LocalEnv, SafekeeperConf};
|
||||
@@ -269,7 +269,7 @@ impl SafekeeperNode {
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
|
||||
// TODO: authentication
|
||||
//if self.env.auth_type == AuthType::ZenithJWT {
|
||||
//if self.env.auth_type == AuthType::NeonJWT {
|
||||
// builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
|
||||
//}
|
||||
self.http_client.request(method, url)
|
||||
@@ -284,8 +284,8 @@ impl SafekeeperNode {
|
||||
|
||||
pub fn timeline_create(
|
||||
&self,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
peer_ids: Vec<NodeId>,
|
||||
) -> Result<()> {
|
||||
Ok(self
|
||||
|
||||
@@ -21,9 +21,9 @@ use thiserror::Error;
|
||||
use utils::{
|
||||
connstring::connection_address,
|
||||
http::error::HttpErrorBody,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
@@ -83,7 +83,7 @@ pub struct PageServerNode {
|
||||
|
||||
impl PageServerNode {
|
||||
pub fn from_env(env: &LocalEnv) -> PageServerNode {
|
||||
let password = if env.pageserver.auth_type == AuthType::ZenithJWT {
|
||||
let password = if env.pageserver.auth_type == AuthType::NeonJWT {
|
||||
&env.pageserver.auth_token
|
||||
} else {
|
||||
""
|
||||
@@ -109,10 +109,10 @@ impl PageServerNode {
|
||||
|
||||
pub fn initialize(
|
||||
&self,
|
||||
create_tenant: Option<ZTenantId>,
|
||||
initial_timeline_id: Option<ZTimelineId>,
|
||||
create_tenant: Option<TenantId>,
|
||||
initial_timeline_id: Option<TimelineId>,
|
||||
config_overrides: &[&str],
|
||||
) -> anyhow::Result<ZTimelineId> {
|
||||
) -> anyhow::Result<TimelineId> {
|
||||
let id = format!("id={}", self.env.pageserver.id);
|
||||
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
|
||||
let pg_distrib_dir_param =
|
||||
@@ -173,9 +173,9 @@ impl PageServerNode {
|
||||
|
||||
fn try_init_timeline(
|
||||
&self,
|
||||
new_tenant_id: Option<ZTenantId>,
|
||||
new_timeline_id: Option<ZTimelineId>,
|
||||
) -> anyhow::Result<ZTimelineId> {
|
||||
new_tenant_id: Option<TenantId>,
|
||||
new_timeline_id: Option<TimelineId>,
|
||||
) -> anyhow::Result<TimelineId> {
|
||||
let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?;
|
||||
let initial_timeline_info =
|
||||
self.timeline_create(initial_tenant_id, new_timeline_id, None, None)?;
|
||||
@@ -345,7 +345,7 @@ impl PageServerNode {
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
|
||||
let mut builder = self.http_client.request(method, url);
|
||||
if self.env.pageserver.auth_type == AuthType::ZenithJWT {
|
||||
if self.env.pageserver.auth_type == AuthType::NeonJWT {
|
||||
builder = builder.bearer_auth(&self.env.pageserver.auth_token)
|
||||
}
|
||||
builder
|
||||
@@ -368,9 +368,9 @@ impl PageServerNode {
|
||||
|
||||
pub fn tenant_create(
|
||||
&self,
|
||||
new_tenant_id: Option<ZTenantId>,
|
||||
new_tenant_id: Option<TenantId>,
|
||||
settings: HashMap<&str, &str>,
|
||||
) -> anyhow::Result<ZTenantId> {
|
||||
) -> anyhow::Result<TenantId> {
|
||||
let mut settings = settings.clone();
|
||||
let request = TenantCreateRequest {
|
||||
new_tenant_id,
|
||||
@@ -416,7 +416,7 @@ impl PageServerNode {
|
||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {:?}", settings)
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
}
|
||||
self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))
|
||||
.json(&request)
|
||||
@@ -434,7 +434,7 @@ impl PageServerNode {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn tenant_config(&self, tenant_id: ZTenantId, settings: HashMap<&str, &str>) -> Result<()> {
|
||||
pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> {
|
||||
self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))
|
||||
.json(&TenantConfigRequest {
|
||||
tenant_id,
|
||||
@@ -483,7 +483,7 @@ impl PageServerNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result<Vec<TimelineInfo>> {
|
||||
pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
|
||||
let timeline_infos: Vec<TimelineInfo> = self
|
||||
.http_request(
|
||||
Method::GET,
|
||||
@@ -498,10 +498,10 @@ impl PageServerNode {
|
||||
|
||||
pub fn timeline_create(
|
||||
&self,
|
||||
tenant_id: ZTenantId,
|
||||
new_timeline_id: Option<ZTimelineId>,
|
||||
tenant_id: TenantId,
|
||||
new_timeline_id: Option<TimelineId>,
|
||||
ancestor_start_lsn: Option<Lsn>,
|
||||
ancestor_timeline_id: Option<ZTimelineId>,
|
||||
ancestor_timeline_id: Option<TimelineId>,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
self.http_request(
|
||||
Method::POST,
|
||||
@@ -536,8 +536,8 @@ impl PageServerNode {
|
||||
/// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`)
|
||||
pub fn timeline_import(
|
||||
&self,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
base: (Lsn, PathBuf),
|
||||
pg_wal: Option<(Lsn, PathBuf)>,
|
||||
) -> anyhow::Result<()> {
|
||||
|
||||
@@ -79,4 +79,5 @@
|
||||
- [014-storage-lsm](rfcs/014-storage-lsm.md)
|
||||
- [015-storage-messaging](rfcs/015-storage-messaging.md)
|
||||
- [016-connection-routing](rfcs/016-connection-routing.md)
|
||||
- [017-timeline-data-management](rfcs/017-timeline-data-management.md)
|
||||
- [cluster-size-limits](rfcs/cluster-size-limits.md)
|
||||
|
||||
@@ -2,14 +2,14 @@
|
||||
|
||||
### Overview
|
||||
|
||||
Current state of authentication includes usage of JWT tokens in communication between compute and pageserver and between CLI and pageserver. JWT token is signed using RSA keys. CLI generates a key pair during call to `zenith init`. Using following openssl commands:
|
||||
Current state of authentication includes usage of JWT tokens in communication between compute and pageserver and between CLI and pageserver. JWT token is signed using RSA keys. CLI generates a key pair during call to `neon_local init`. Using following openssl commands:
|
||||
|
||||
```bash
|
||||
openssl genrsa -out private_key.pem 2048
|
||||
openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
|
||||
```
|
||||
|
||||
CLI also generates signed token and saves it in the config for later access to pageserver. Now authentication is optional. Pageserver has two variables in config: `auth_validation_public_key_path` and `auth_type`, so when auth type present and set to `ZenithJWT` pageserver will require authentication for connections. Actual JWT is passed in password field of connection string. There is a caveat for psql, it silently truncates passwords to 100 symbols, so to correctly pass JWT via psql you have to either use PGPASSWORD environment variable, or store password in psql config file.
|
||||
CLI also generates signed token and saves it in the config for later access to pageserver. Now authentication is optional. Pageserver has two variables in config: `auth_validation_public_key_path` and `auth_type`, so when auth type present and set to `NeonJWT` pageserver will require authentication for connections. Actual JWT is passed in password field of connection string. There is a caveat for psql, it silently truncates passwords to 100 symbols, so to correctly pass JWT via psql you have to either use PGPASSWORD environment variable, or store password in psql config file.
|
||||
|
||||
Currently there is no authentication between compute and safekeepers, because this communication layer is under heavy refactoring. After this refactoring support for authentication will be added there too. Now safekeeper supports "hardcoded" token passed via environment variable to be able to use callmemaybe command in pageserver.
|
||||
|
||||
|
||||
@@ -2,26 +2,26 @@
|
||||
|
||||
### Overview
|
||||
|
||||
Zenith supports multitenancy. One pageserver can serve multiple tenants at once. Tenants can be managed via zenith CLI. During page server setup tenant can be created using ```zenith init --create-tenant``` Also tenants can be added into the system on the fly without pageserver restart. This can be done using the following cli command: ```zenith tenant create``` Tenants use random identifiers which can be represented as a 32 symbols hexadecimal string. So zenith tenant create accepts desired tenant id as an optional argument. The concept of timelines/branches is working independently per tenant.
|
||||
Neon supports multitenancy. One pageserver can serve multiple tenants at once. Tenants can be managed via neon_local CLI. During page server setup tenant can be created using ```neon_local init --create-tenant``` Also tenants can be added into the system on the fly without pageserver restart. This can be done using the following cli command: ```neon_local tenant create``` Tenants use random identifiers which can be represented as a 32 symbols hexadecimal string. So neon_local tenant create accepts desired tenant id as an optional argument. The concept of timelines/branches is working independently per tenant.
|
||||
|
||||
### Tenants in other commands
|
||||
|
||||
By default during `zenith init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct argument `--tenantid=<tenantid>` is provided. So generally tenantid more frequently appears in internal pageserver interface. Its commands take tenantid argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants.
|
||||
By default during `neon_local init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct argument `--tenant_id=<tenant_id>` is provided. So generally tenant_id more frequently appears in internal pageserver interface. Its commands take tenant_id argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants.
|
||||
|
||||
Examples for cli:
|
||||
|
||||
```sh
|
||||
zenith tenant list
|
||||
neon_local tenant list
|
||||
|
||||
zenith tenant create // generates new id
|
||||
neon_local tenant create // generates new id
|
||||
|
||||
zenith tenant create ee6016ec31116c1b7c33dfdfca38892f
|
||||
neon_local tenant create ee6016ec31116c1b7c33dfdfca38892f
|
||||
|
||||
zenith pg create main // default tenant from zenith init
|
||||
neon_local pg create main // default tenant from neon init
|
||||
|
||||
zenith pg create main --tenantid=ee6016ec31116c1b7c33dfdfca38892f
|
||||
neon_local pg create main --tenant_id=ee6016ec31116c1b7c33dfdfca38892f
|
||||
|
||||
zenith branch --tenantid=ee6016ec31116c1b7c33dfdfca38892f
|
||||
neon_local branch --tenant_id=ee6016ec31116c1b7c33dfdfca38892f
|
||||
```
|
||||
|
||||
### Data layout
|
||||
@@ -56,4 +56,4 @@ Tenant id is passed to postgres via GUC the same way as the timeline. Tenant id
|
||||
|
||||
### Safety
|
||||
|
||||
For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline).
|
||||
For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenant_id, timeline_id) pair so there can only be one writer for particular (tenant_id, timeline_id).
|
||||
|
||||
@@ -109,7 +109,7 @@ Repository
|
||||
|
||||
The repository stores all the page versions, or WAL records needed to
|
||||
reconstruct them. Each tenant has a separate Repository, which is
|
||||
stored in the .neon/tenants/<tenantid> directory.
|
||||
stored in the .neon/tenants/<tenant_id> directory.
|
||||
|
||||
Repository is an abstract trait, defined in `repository.rs`. It is
|
||||
implemented by the LayeredRepository object in
|
||||
|
||||
@@ -123,7 +123,7 @@ The files are called "layer files". Each layer file covers a range of keys, and
|
||||
a range of LSNs (or a single LSN, in case of image layers). You can think of it
|
||||
as a rectangle in the two-dimensional key-LSN space. The layer files for each
|
||||
timeline are stored in the timeline's subdirectory under
|
||||
`.neon/tenants/<tenantid>/timelines`.
|
||||
`.neon/tenants/<tenant_id>/timelines`.
|
||||
|
||||
There are two kind of layer files: images, and delta layers. An image file
|
||||
contains a snapshot of all keys at a particular LSN, whereas a delta file
|
||||
@@ -351,7 +351,7 @@ branch.
|
||||
Note: It doesn't make any difference if the child branch is created
|
||||
when the end of the main branch was at LSN 250, or later when the tip of
|
||||
the main branch had already moved on. The latter case, creating a
|
||||
branch at a historic LSN, is how we support PITR in Zenith.
|
||||
branch at a historic LSN, is how we support PITR in Neon.
|
||||
|
||||
|
||||
# Garbage collection
|
||||
@@ -396,9 +396,9 @@ table:
|
||||
main/orders_200_300 DELETE
|
||||
main/orders_300 STILL NEEDED BY orders_300_400
|
||||
main/orders_300_400 KEEP, NEWER THAN GC HORIZON
|
||||
main/orders_400 ..
|
||||
main/orders_400_500 ..
|
||||
main/orders_500 ..
|
||||
main/orders_400 ..
|
||||
main/orders_400_500 ..
|
||||
main/orders_500 ..
|
||||
main/customers_100 DELETE
|
||||
main/customers_100_200 DELETE
|
||||
main/customers_200 KEEP, NO NEWER VERSION
|
||||
|
||||
@@ -9,7 +9,7 @@ This feature allows to migrate a timeline from one pageserver to another by util
|
||||
Pageserver implements two new http handlers: timeline attach and timeline detach.
|
||||
Timeline migration is performed in a following way:
|
||||
1. Timeline attach is called on a target pageserver. This asks pageserver to download latest checkpoint uploaded to s3.
|
||||
2. For now it is necessary to manually initialize replication stream via callmemaybe call so target pageserver initializes replication from safekeeper (it is desired to avoid this and initialize replication directly in attach handler, but this requires some refactoring (probably [#997](https://github.com/zenithdb/zenith/issues/997)/[#1049](https://github.com/zenithdb/zenith/issues/1049))
|
||||
2. For now it is necessary to manually initialize replication stream via callmemaybe call so target pageserver initializes replication from safekeeper (it is desired to avoid this and initialize replication directly in attach handler, but this requires some refactoring (probably [#997](https://github.com/neondatabase/neon/issues/997)/[#1049](https://github.com/neondatabase/neon/issues/1049))
|
||||
3. Replication state can be tracked via timeline detail pageserver call.
|
||||
4. Compute node should be restarted with new pageserver connection string. Issue with multiple compute nodes for one timeline is handled on the safekeeper consensus level. So this is not a problem here.Currently responsibility for rescheduling the compute with updated config lies on external coordinator (console).
|
||||
5. Timeline is detached from old pageserver. On disk data is removed.
|
||||
@@ -18,5 +18,5 @@ Timeline migration is performed in a following way:
|
||||
### Implementation details
|
||||
|
||||
Now safekeeper needs to track which pageserver it is replicating to. This introduces complications into replication code:
|
||||
* We need to distinguish different pageservers (now this is done by connection string which is imperfect and is covered here: https://github.com/zenithdb/zenith/issues/1105). Callmemaybe subscription management also needs to track that (this is already implemented).
|
||||
* We need to distinguish different pageservers (now this is done by connection string which is imperfect and is covered here: https://github.com/neondatabase/neon/issues/1105). Callmemaybe subscription management also needs to track that (this is already implemented).
|
||||
* We need to track which pageserver is the primary. This is needed to avoid reconnections to non primary pageservers. Because we shouldn't reconnect to them when they decide to stop their walreceiver. I e this can appear when there is a load on the compute and we are trying to detach timeline from old pageserver. In this case callmemaybe will try to reconnect to it because replication termination condition is not met (page server with active compute could never catch up to the latest lsn, so there is always some wal tail)
|
||||
|
||||
@@ -1,26 +1,39 @@
|
||||
## Thread management
|
||||
|
||||
Each thread in the system is tracked by the `thread_mgr` module. It
|
||||
maintains a registry of threads, and which tenant or timeline they are
|
||||
operating on. This is used for safe shutdown of a tenant, or the whole
|
||||
system.
|
||||
The pageserver uses Tokio for handling concurrency. Everything runs in
|
||||
Tokio tasks, although some parts are written in blocking style and use
|
||||
spawn_blocking().
|
||||
|
||||
Each Tokio task is tracked by the `task_mgr` module. It maintains a
|
||||
registry of tasks, and which tenant or timeline they are operating
|
||||
on.
|
||||
|
||||
### Handling shutdown
|
||||
|
||||
When a tenant or timeline is deleted, we need to shut down all threads
|
||||
operating on it, before deleting the data on disk. A thread registered
|
||||
in the thread registry can check if it has been requested to shut down,
|
||||
by calling `is_shutdown_requested()`. For async operations, there's also
|
||||
a `shudown_watcher()` async task that can be used to wake up on shutdown.
|
||||
When a tenant or timeline is deleted, we need to shut down all tasks
|
||||
operating on it, before deleting the data on disk. There's a function,
|
||||
`shutdown_tasks`, to request all tasks of a particular tenant or
|
||||
timeline to shutdown. It will also wait for them to finish.
|
||||
|
||||
A task registered in the task registry can check if it has been
|
||||
requested to shut down, by calling `is_shutdown_requested()`. There's
|
||||
also a `shudown_watcher()` Future that can be used with `tokio::select!`
|
||||
or similar, to wake up on shutdown.
|
||||
|
||||
|
||||
### Sync vs async
|
||||
|
||||
The primary programming model in the page server is synchronous,
|
||||
blocking code. However, there are some places where async code is
|
||||
used. Be very careful when mixing sync and async code.
|
||||
|
||||
Async is primarily used to wait for incoming data on network
|
||||
connections. For example, all WAL receivers have a shared thread pool,
|
||||
with one async Task for each connection. Once a piece of WAL has been
|
||||
received from the network, the thread calls the blocking functions in
|
||||
We use async to wait for incoming data on network connections, and to
|
||||
perform other long-running operations. For example, each WAL receiver
|
||||
connection is handled by a tokio Task. Once a piece of WAL has been
|
||||
received from the network, the task calls the blocking functions in
|
||||
the Repository to process the WAL.
|
||||
|
||||
The core storage code in `layered_repository/` is synchronous, with
|
||||
blocking locks and I/O calls. The current model is that we consider
|
||||
disk I/Os to be short enough that we perform them while running in a
|
||||
Tokio task. If that becomes a problem, we should use `spawn_blocking`
|
||||
before entering the synchronous parts of the code, or switch to using
|
||||
tokio I/O functions.
|
||||
|
||||
Be very careful when mixing sync and async code!
|
||||
|
||||
@@ -70,7 +70,7 @@ two options.
|
||||
|
||||
...start sending WAL conservatively since the horizon (1.1), and truncate
|
||||
obsolete part of WAL only when recovery is finished, i.e. epochStartLsn (4) is
|
||||
reached, i.e. 2.3 transferred -- that's what https://github.com/zenithdb/zenith/pull/505 proposes.
|
||||
reached, i.e. 2.3 transferred -- that's what https://github.com/neondatabase/neon/pull/505 proposes.
|
||||
|
||||
Then the following is possible:
|
||||
|
||||
|
||||
413
docs/rfcs/017-timeline-data-management.md
Normal file
@@ -0,0 +1,413 @@
|
||||
# Name
|
||||
|
||||
Tenant and timeline data management in pageserver
|
||||
|
||||
## Summary
|
||||
|
||||
This RFC attempts to describe timeline-related data management as it's done now in pageserver, highlight current complexities caused by this and propose a set of changes to mitigate them.
|
||||
|
||||
The main goal is to prepare for future [on-demand layer downloads](https://github.com/neondatabase/neon/issues/2029), yet timeline data is one of the core primitive of pageserver, so a number of other RFCs are affected either.
|
||||
Due to that, this document won't have a single implementation, rather requiring a set of code changes to achieve the final state.
|
||||
|
||||
RFC considers the repository at the `main` branch, commit [`28243d68e60ffc7e69f158522f589f7d2e09186d`](https://github.com/neondatabase/neon/tree/28243d68e60ffc7e69f158522f589f7d2e09186d) on the time of writing.
|
||||
|
||||
## Motivation
|
||||
|
||||
In recent discussions, it became more clear that timeline-related code becomes harder to change: it consists of multiple disjoint modules, each requiring a synchronization to access.
|
||||
The lower the code is, the complex the sync gets since many concurrent processes are involved and require orchestration to keep the data consistent.
|
||||
As the number of modules and isolated data grows per timeline, more questions and corner cases arise:
|
||||
|
||||
- https://github.com/neondatabase/neon/issues/1559
|
||||
right now it's not straightened out what to do when the synchronization task fails for too many times: every separate module's data has to be treated differently.
|
||||
|
||||
- https://github.com/neondatabase/neon/issues/1751
|
||||
GC and compaction file activities are not well known outside their tasks code, causing race bugs
|
||||
|
||||
- https://github.com/neondatabase/neon/issues/2003
|
||||
Even the tenant management gets affected: we have to alter its state based on timeline state, yet the data for making the decision is separated and the synchronisation logic has bugs
|
||||
|
||||
- more issues were brought in discussions, but apparently they were too specific to the code to mention them in the issues.
|
||||
For instance, `tenant_mgr` itself is a static object that we can not mock anyhow, which reduces our capabilities to test the data synchronization logic.
|
||||
In fact, we have zero Rust tests that cover the case of synchronizing more than one module's data.
|
||||
|
||||
On demand layer downloads would require us to dynamically manage the layer files, which we almost not doing at all on the module level, resulting in the most of their APIs dealing with timelines, rather than the layer files.
|
||||
The disjoint data that would require data synchronization with possibly a chain of lock acquisitions, some async and some sync, and it would be hard to unit test it with the current code state.
|
||||
|
||||
Neither this helps to easy start the on-demand download epic, nor it's easy to add more timeline-related code on top, whatever the task is.
|
||||
We have to develop a vision on a number of topics before progressing safely:
|
||||
|
||||
- timeline and tenant data structure and how should we access it
|
||||
- sync and async worlds and in what way that should evolve
|
||||
- unit tests for the complex logic
|
||||
|
||||
This RFC aims to provide a general overview of the existing situation and propose ways to improve it.
|
||||
The changes proposed are quite big and no single PR is expected to do the adjustments, they should gradually be done during the on-demand download work later.
|
||||
|
||||
## What is a timeline and its data
|
||||
|
||||
First, we need to define what data we want to manage per timeline.
|
||||
Currently, the data every timeline operates is:
|
||||
|
||||
- a set of layer files, on the FS
|
||||
|
||||
Never updated files, created after pageserver's checkpoints and compaction runs, can be removed from the local FS due to compaction, gc or timeline deletion.
|
||||
|
||||
- a set of layer files, on the remote storage
|
||||
|
||||
Identically named and placed in tenant subdirectories files on the remote storage (S3), copied by a special background sync thread
|
||||
|
||||
- a `metadata` file, on the FS
|
||||
|
||||
Updated after every checkpoint with the never `disk_consistent_lsn` and `latest_gc_cutoff_lsn` values. Used to quickly restore timeline's basic metadata on pageserver restart.
|
||||
Also contains data about the ancestor, if the timeline was branched off another timeline.
|
||||
|
||||
- an `index_part.json` file, on the remote storage
|
||||
|
||||
Contains `metadata` file contents and a list of layer files, available in the current S3 "directory" for the timeline.
|
||||
Used to avoid potentially slow and expensive `S3 list` command, updated by the remotes storage sync thread after every operation with the remote layer files.
|
||||
|
||||
- LayerMap and PageCache, in memory
|
||||
|
||||
Dynamic, used to store and retrieve the page data to users.
|
||||
|
||||
- timeline info, in memory
|
||||
|
||||
LSNs, walreceiver data, `RemoteTimelineIndex` and other data to share via HTTP API and internal processes.
|
||||
|
||||
- metrics data, in memory
|
||||
|
||||
Data to push or provide to Prometheus, Opentelemetry, etc.
|
||||
|
||||
Besides the data, every timeline currently needs an etcd connection to receive WAL events and connect to safekeepers.
|
||||
|
||||
Timeline could be an ancestor to another one, forming a dependency tree, which is implicit right now: every time relations are looked up in place, based on the corresponding `TimelineMetadata` struct contents.
|
||||
Yet, there's knowledge on a tenant as a group of timelines, belonging to a single user which is used in GC and compaction tasks, run on every tenant.
|
||||
`tenant_mgr` manages tenant creation and its task startup, along with the remote storage sync for timeline layers.
|
||||
|
||||
Last file being managed per-tenant is the tenant config file, created and updated on the local FS to hold tenant-specific configuration between restarts.
|
||||
It's not yet anyhow synchronized with the remote storage, so only exists on the local FS.
|
||||
|
||||
### How the data is stored
|
||||
|
||||
We have multiple places where timeline data is stored:
|
||||
|
||||
- `tenant_mgr` [holds](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L43) a static `static ref TENANTS: RwLock<HashMap<ZTenantId, Tenant>>` with the `Tenant` having the `local_timelines: HashMap<ZTimelineId, Arc<DatadirTimelineImpl>>` inside
|
||||
|
||||
- same `Tenant` above has actually two references to timelines: another via its `repo: Arc<RepositoryImpl>` with `pub type RepositoryImpl = LayeredRepository;` that [holds](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/layered_repository.rs#L178) `Mutex<HashMap<ZTimelineId, LayeredTimelineEntry>>`
|
||||
|
||||
- `RemoteTimelineIndex` [contains](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/storage_sync/index.rs#L84) the metadata about timelines on the remote storage (S3) for sync reasons and possible HTTP API queries
|
||||
|
||||
- `walreceiver` [stores](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/walreceiver.rs#L60) the metadata for possible HTTP API queries and its [internal state](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/walreceiver/connection_manager.rs#L245) with a reference to the timeline, its current connections and etcd subscription (if any)
|
||||
|
||||
- `PageCache` contains timeline-related data, and is created globally for the whole pageserver
|
||||
|
||||
- implicitly, we also have files on local FS, that contain timeline state. We operate on those files and for some operations (GC, compaction) yet we don't anyhow synchronize the access to the files per se: there are more high-level locks, ensuring only one of a group of operations is running at a time.
|
||||
|
||||
On practice though, `LayerMap` and layer files are tightly coupled together: current low-level code requires a timeline to be loaded into the memory to work with it, and the code removes the layer files after removing the entry from the `LayerMap` first.
|
||||
|
||||
Based on this, a high-level pageserver's module diagram with data and entities could be:
|
||||
|
||||

|
||||
|
||||
A few comments on the diagram:
|
||||
|
||||
- the diagram does not show all the data and replaces a few newtypes and type aliases (for example, completely ignores "unloaded" timelines due to reasons described below)
|
||||
|
||||
It aims to show main data and means of synchronizing it.
|
||||
|
||||
- modules tend to isolate their data inside and provide access to it via API
|
||||
|
||||
Due to multitenancy, that results in a common pattern for storing both tenant and timeline data: `RwLock` or `Mutex` around the `HashMap<Id, Data>`, gc and compaction tasks also use the same lock pattern to ensure no concurrent runs are happening.
|
||||
|
||||
- part of the modules is asynchronous, while the other is not, that complicates the data access
|
||||
|
||||
Currently, anything that's not related to tasks (walreceiver, storage sync, GC, compaction) is blocking.
|
||||
|
||||
Async tasks that try to access the data in the sync world, have to call `std::sync::Mutex::lock` method, which blocks the thread the callee async task runs on, also blocking other async tasks running in the same thread. Methods of `std::sync::RwLock` have the same issues, forcing async tasks either to block or spawn another, "blocking" task on a separate thread.
|
||||
|
||||
Sync tasks that try to access the data in the async world, cannot use `.await` hence have to have some `Runtime` doing those calls for them. [`tokio::sync::Mutex`](https://docs.rs/tokio/1.19.2/tokio/sync/struct.Mutex.html#method.blocking_lock) and [`tokio::sync::RwLock`](https://docs.rs/tokio/1.19.2/tokio/sync/struct.RwLock.html#method.blocking_read) provide an API to simplify such calls. Similarly, both `std::sync` and `tokio::sync` have channels that are able to communicate into one direction without blocking and requiring `.await` calls, hence can be used to connect both worlds without locking.
|
||||
|
||||
Some modules are in transition, started as async "blocking" tasks and being fully synchronous in their entire code below the start. Current idea is to transfer them to the async further, but it's not yet done.
|
||||
|
||||
- locks are used in two different ways:
|
||||
|
||||
- `RwLock<HashMap<..>>` ones to hold the shared data and ensure its atomic updates
|
||||
- `Mutex<()>` for synchronizing the tasks, used to implicitly order the data access
|
||||
|
||||
The "shared data" locks of the first kind are mainly accessed briefly to either look up or alter the data, yet there are a few notable exceptions, such as
|
||||
`latest_gc_cutoff_lsn: RwLock<Lsn>` that is explicitly held in a few places to prevent GC thread from progressing. Those are covered later in the data access diagrams.
|
||||
|
||||
- some synchronizations are not yet implemented
|
||||
|
||||
E.g. asynchronous storage sync module does not synchronize with almost synchronous GC and compaction tasks when the layer files are uploaded to the remote storage.
|
||||
That occasionally results in the files being deleted before the storage upload task is run for this layer, but due to the incremental nature of the layer files, we can handle such situations without issues.
|
||||
|
||||
- `LayeredRepository` covers lots of responsibilities: GC and compaction task synchronisation, timeline access (`local_timelines` in `Tenant` is not used directly before the timeline from the repository is accessed), layer flushing to FS, layer sync to remote storage scheduling, etc.
|
||||
|
||||
### How is this data accessed?
|
||||
|
||||
There are multiple ways the data is accessed, from different sources:
|
||||
|
||||
1. [HTTP requests](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/http/routes.rs)
|
||||
|
||||
High-level CRUD API for managing tenants, timelines and getting data about them.
|
||||
Current API list (modified for readability):
|
||||
|
||||
```rust
|
||||
.get("/v1/status", status_handler) // pageserver status
|
||||
.get("/v1/tenant", tenant_list_handler)
|
||||
.post("/v1/tenant", tenant_create_handler) // can create "empty" timelines or branch off the existing ones
|
||||
.get("/v1/tenant/:tenant_id", tenant_status) // the only tenant public metadata
|
||||
.put("/v1/tenant/config", tenant_config_handler) // tenant config data and local file manager
|
||||
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
|
||||
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
|
||||
.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler) // download entire tenant from the remote storage and load its timelines memory
|
||||
.post("/v1/tenant/:tenant_id/detach", tenant_detach_handler) // delete all tenant timelines from memory, remote corresponding storage and local FS files
|
||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler)
|
||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler)
|
||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver", wal_receiver_get_handler) // get walreceiver stats metadata
|
||||
```
|
||||
|
||||
Overall, neither HTTP operation goes below `LayeredRepository` level and does not interact with layers: instead, they manage tenant and timeline entities, their configuration and metadata.
|
||||
|
||||
`GET` data is small (relative to layer files contents), updated via brief `.write()/.lock()` calls and read via copying/cloning the data to release the lock soon.
|
||||
It does not mean that the operations themselves are short, e.g. `tenant_attach_handler` downloads multiple files from the remote storage which might take time, yet the final data is inserted in memory via one brief write under the lock.
|
||||
|
||||
Non-`GET` operations mostly follow the same rule, with two differences:
|
||||
|
||||
- `tenant_detach_handler` has to wait for its background tasks to stop before shutting down, which requires more work with locks
|
||||
- `timeline_create_handler` currently requires GC to be paused before branching the timeline, which requires orchestrating too.
|
||||
This is the only HTTP operation, able to load the timeline into memory: rest of the operations are reading the metadata or, as in `tenant_attach_handler`, schedule a deferred task to download timeline and load it into memory.
|
||||
|
||||
"Timeline data synchronization" section below describes both complex cases in more details.
|
||||
|
||||
2. [libpq requests](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/page_service.rs)
|
||||
|
||||
Is the main interface of pageserver, intended to handle libpq (and similar) requests.
|
||||
Operates on `LayeredTimeline` and, lower, `LayerMap` modules; all timelines accessed during the operation are loaded into memory immediately (if not loaded already), operations bail on timeline load errors.
|
||||
|
||||
- `pagestream`
|
||||
|
||||
Page requests: `get_rel_exists`, `get_rel_size`, `get_page_at_lsn`, `get_db_size`
|
||||
|
||||
Main API points, intended to be used by `compute` to show the data to the user. All require requests to be made at certain Lsn, if this Lsn is not available in the memory, request processing is paused until that happens or bails after a timeout.
|
||||
|
||||
- `basebackup` and `fullbackup`
|
||||
|
||||
Options to generate postgres-compatible backup archives.
|
||||
|
||||
- `import basebackup`
|
||||
|
||||
- `import wal`
|
||||
|
||||
Import the `pg_wal` section of the basebackup archive.
|
||||
|
||||
- `get_last_record_rlsn`, `get_lsn_by_timestamp`
|
||||
|
||||
"Metadata" retrieval methods, that still requires internal knowledge about layers.
|
||||
|
||||
- `set`, `fallpoints`, `show`
|
||||
|
||||
Utility methods to support various edge cases or help with debugging/testing.
|
||||
|
||||
- `do_gc`, `compact`, `checkpoint`
|
||||
|
||||
Manual triggers for corresponding tenant tasks (GC, compaction) and inmemory layer flushing on disk (checkpointing), with upload task scheduling as a follow-up.
|
||||
|
||||
Apart from loading into memory, every timeline layer has to be accessed using specific set of locking primitives, especially if a write operations happens: otherwise, GC or compaction might spoil the data. User API is implicitly affected by this synchronization during branching, when a GC has to be orchestrated properly before the new timeline could be branched off the existing one.
|
||||
See "Timeline data synchronization" section for the united synchronization diagram on the topic.
|
||||
|
||||
3. internal access
|
||||
|
||||
Entities within pageserver that update files on local FS and remote storage, metadata in memory; has to use internal data for those operations.
|
||||
Places that access internal, lower data are also required to have the corresponding timeline successfully loaded into memory and accessed with corresponding synchronization.
|
||||
|
||||
If ancestors' data is accessed via its child branch, it means more than one timeline has to be loaded into memory entirely and more locking primitives usage involved.
|
||||
Right now, all ancestors are resolved in-place: every place that has to check timeline's ancestor has to lock the timelines map, check if one is loaded into the memory, load it there or bail if it's not present, and get the information required and so on.
|
||||
|
||||
- periodic GC and compaction tasks
|
||||
|
||||
Alter metadata (GC info), in-memory data (layer relations, page caches, etc.) and layer files on disk.
|
||||
Same as its libpq counterparts, needs full synchronization with the low level layer management code.
|
||||
|
||||
- storage sync task
|
||||
|
||||
Alters metadata (`RemoteTimelineIndex`), layer files on remote storage (upload, delete) and local FS (download) and in-memory data (registers downloaded timelines in the repository).
|
||||
Currently, does not know anything about layer files contents, rather focusing on the file structure and metadata file updates: due to the fact that the layer files cannot be updated (only created or deleted), storage sync is able to back up the files to the remote storage without further low-level synchronizations: only when the timeline is downloaded, a load operation is needed to run, possibly pausing GC and compaction tasks.
|
||||
|
||||
- walreceiver and walingest task
|
||||
|
||||
Per timeline, subscribes for etcd events from safekeeper and eventually spawns a walreceiver connection task to receive WAL from a safekeeper node.
|
||||
Fills memory with data, eventually triggering a checkpoint task that creates a new layer file in the local FS and schedules a remote storage sync upload task.
|
||||
During WAL receiving, also updates a separate in-memory data structure with the walreceiver stats, used later via HTTP API.
|
||||
|
||||
Layer updates require low-level set of sync primitives used to preserve the data consistency.
|
||||
|
||||
- checkpoint (layer freeze) task
|
||||
|
||||
Periodic, short-lived tasks to generate a new layer file in the FS. Requires low level synchronization in the end, when the layer is being registered after creating and has additional mode to ensure only one concurrent compaction happens at a time.
|
||||
|
||||
### Timeline data synchronization
|
||||
|
||||
Here's a high-level timeline data access diagram, considering the synchronization locks, based on the state diagram above.
|
||||
|
||||
For brevity, diagrams do not show `RwLock<HashMap<..>>` data accesses, considering them almost instant to happen.
|
||||
`RwLock<LayerMap>` is close to be an exception to the previous rule, since it's taken in a multiple places to ensure all layers are inserted correctly.
|
||||
Yet the only long operation in the current code is a `.write()` lock on the map during its creation, while all other lock usages tend to be short in the current code.
|
||||
Note though, that due to current "working with loaded timeline only", prevailing amount of the locks taken on the struct are `.write()` locks, not the `.read()` ones.
|
||||
To simplify the diagrams, these accesses are now considered "fast" data access, not the synchronization attempts.
|
||||
|
||||
`write_lock` synchronization diagram:
|
||||
|
||||

|
||||
|
||||
Comments:
|
||||
|
||||
- `write_lock: Mutex<()>` ensures that all timeline data being written into **in-memory layers** is done without races, one concurrent write at a time
|
||||
- `layer_flush_lock: Mutex<()>` and layer flushing seems to be slightly bloated with various ways to create a layer on disk and write it in memory
|
||||
The lock itself seem to repeat `write_lock` purpose when it touches in-memory layers, and also to limit the on-disk layer creations.
|
||||
Yet the latter is not really done consistently, since remote storage sync manages to download and register the new layers without touching the locks
|
||||
- `freeze_inmem_layer(true)` that touches both `write_lock` and `layer_flush_lock` seems not very aligned with the rest of the locks to those primitives; it also now restricts the layer creation concurrency even more, yet there are various `freeze_inmem_layer(false)` that are ignoring those restrictions at the same time
|
||||
|
||||

|
||||
|
||||
Comments:
|
||||
|
||||
- `partitioning: Mutex<(KeyPartitioning, Lsn)>` lock is a data sync lock that's not used to synchronize the tasks (all other such kinds were considered "almost instant" and omitted on the diagram), yet is very similar to what `write_lock` and `layer_flush_lock` do: it ensures the timeline in-memory data is up-to-date with the layer files state on disk, which is what `LayerMap` is for.
|
||||
|
||||
- there are multiple locks that do similar task management operations:
|
||||
- `gc_cs: Mutex<()>` and `latest_gc_cutoff_lsn: RwLock<Lsn>` ensures that branching and gc are not run concurrently
|
||||
- `layer_removal_cs: Mutex<()>` lock ensure gc, compaction and timeline deletion via HTTP API do not run concurrently
|
||||
- `file_lock: RwLock<()>` is used as a semaphore, to ensure "all" gc and compaction tasks are shut down and do not start
|
||||
Yet that lock does take only gc and compaction from internal loops: libpq call is not cancelled and waited upon.
|
||||
|
||||
Those operations do not seem to belong to a timeline. Moreover, some of those could be eliminated entirely due to duplication of their tasks.
|
||||
|
||||
## Proposed implementation
|
||||
|
||||
### How to structure timeline data access better
|
||||
|
||||
- adjust tenant state handling
|
||||
|
||||
Current [`TenantState`](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L108) [changes](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L317) mainly indicates whether GC and compaction tasks are running or not; another state, `Broken` shows only in case any timeline does not load during startup.
|
||||
|
||||
We could start both GC and compaction tasks at the time the tenant is created and adjust the tasks to throttle/sleep on timeline absence and wake up when the first one is added.
|
||||
The latter becomes more important on download on demand, since we won't have the entire timeline in reach to verify its correctness. Moreover, if any network connection happens, the timeline could fail temporarily and entire tenant should be marked as broken due to that.
|
||||
|
||||
Since nothing verifies the `TenantState` via HTTP API currently, it makes sense to remove the whole state entirely and don't write the code to synchronize its changes.
|
||||
Instead, we could indicate internal issues for every timeline and have a better API to "stop" timeline processing without deleting its data, making our API less restrictive.
|
||||
|
||||
- remove the "unloaded" status for the timeline
|
||||
|
||||
Current approach to timeline management [assumes](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/layered_repository.rs#L486-L493)
|
||||
|
||||
```rust
|
||||
#[derive(Clone)]
|
||||
enum LayeredTimelineEntry {
|
||||
Loaded(Arc<LayeredTimeline>),
|
||||
Unloaded {
|
||||
id: ZTimelineId,
|
||||
metadata: TimelineMetadata,
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
supposes that timelines have to be in `Unloaded` state.
|
||||
|
||||
The difference between both variants is whether its layer map was loaded from disk and kept in memory (Loaded) or not (Unloaded).
|
||||
The idea behind such separation was to lazy load timelines in memory with all their layers only after its first access and potentially unload them later.
|
||||
|
||||
Yet now there's no public API methods, that deal with unloaded timelines' layers: all of them either bail when such timeline is worked on, or load it into memory and continue working.
|
||||
Moreover, every timeline in the local FS is loaded on pageserver startup now, so only two places where `Unloaded` variant is used are branching and timeline attach, with both loading the timeline into memory before the end of the operation.
|
||||
Even if that loading into memory bails for some reason, next GC or compaction task periodic run would load such timeline into memory.
|
||||
There are a few timeline methods that return timeline metadata without loading its layers, but such metadata also comes from the `metadata` FS file, not the layer files (so no page info could be retrieved without loading the entire layer map first).
|
||||
|
||||
With the layer on-demand download, it's not feasible anymore to wait for the entire layer map to be loaded into the memory, since it might not even be available on the local FS when requested: `LayerMap` needs to be changed to contain metadata to retrieve the missing layers and handle partially present on the local FS timeline state.
|
||||
|
||||
To accommodate to that and move away from the redundant status, a timeline should always be "loaded" with its metadata read from the disk and its layer map prepared to be downloaded when requested, per layer.
|
||||
|
||||
Layers in the layer map, on the other hand, could be in various state: loaded, unloaded, downloading, downloading failed, etc. and their state has to be handled instead, if we want to support on-demand download in the future.
|
||||
|
||||
This way, tenants and timelines could always try to serve requests and do their internal tasks periodically, trying to recover.
|
||||
|
||||
- scale down the remote storage sync to per layer file, not per timeline as now
|
||||
|
||||
Due to the reasons from the previous bullet, current remote storage model needs its timeline download approach to be changed.
|
||||
Right now, a timeline is marked as "ready" only after all its layers on the remote storage are downloaded on the local storage.
|
||||
With the on-demand download approach, only remote storage timeline metadata should be downloaded from S3, leaving the rest of the layers ready for download if/when it's requested.
|
||||
|
||||
Note: while the remote storage sync should operate per layer, it should stay global for all tenants, to better manage S3 limits and sync queue priorities.
|
||||
Yet the only place using remote storage should be the layer map.
|
||||
|
||||
- encapsulate `tenant_mgr` logic into a regular Rust struct, unite with part of the `Repository` and anything else needed to manage the timeline data in a single place and to test it independently
|
||||
|
||||
[`Repository`](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/repository.rs#L187) trait gets closer to `tenant_mgr` in terms of functionality: there are two background task-related functions, that are run on all timelines of a tenant: `gc_iteration` (it does allow running on a single timeline, but GC task runs it on all timelines) and `compaction_iteration` that are related to service tasks, not the data storage; and the metadata management functions, also not really related to the timeline contents.
|
||||
|
||||
`tenant_mgr` proxies some of the `Repository` calls, yet both service tasks use `tenant_mgr` to access the data they need, creating a circular dependency between their APIs.
|
||||
To avoid excessive synchronization between components, taking multiple locks for that and static state, we can organize the data access and updates in one place.
|
||||
One potential benefit Rust gets from this is the ability to track and manage timeline resources, if all the related data is located in one place.
|
||||
|
||||
- move `RemoteStorage` usage from `LayeredRepository` into `LayerMap`, as the rest of the layer-based entities (layer files, etc.)
|
||||
|
||||
Layer == file in our model, since pageserver always either tries to load the LayerMap from disk for the timeline not in memory, or assumes the file contents matches its memory.
|
||||
`LayeredRepository` is one of the most loaded objects currently and not everything from it deserves unification with the `tenant_mgr`.
|
||||
In particular, layer files need to be better prepared for future download on demand functionality, where every layer could be dynamically loaded and unloaded from memory and local FS.
|
||||
Current amount of locks and sync-async separation would make it hard to implement truly dynamic (un)loading; moreover, we would need retries with backoffs, since the unloaded layer files are most probably not available on the local FS either and network is not always reliable.
|
||||
|
||||
One of the solutions to the issue is already being developed for the remote storage sync: [SyncQueue](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/storage_sync.rs#L463)
|
||||
The queue is able to batch CRUD layer operations (both for local and remote FS contexts) and reorder them to increase the sync speed.
|
||||
Similar approach could be generalized for all layer modifications, including in-memory ones such as GC or compaction: this way, we could manage all layer modifications and reads in one place with lesser locks and tests that are closer to unit tests.
|
||||
|
||||
- change the approach to locking synchronization
|
||||
|
||||
A number of locks in the timeline seem to be used to coordinate gc, compaction tasks and related processes.
|
||||
It should be done in a task manager or other place, external to the timeline.
|
||||
|
||||
Timeline contents still needs to be synchronized, considering the task work, so fields like `latest_gc_cutoff_lsn: RwLock<Lsn>` are expected to stay for that purpose, but general amount of locks should be reduced.
|
||||
|
||||
### Putting it all together
|
||||
|
||||
If the proposal bullets applied to the diagrams above, the state could be represented as:
|
||||
|
||||

|
||||
|
||||
The reorders aim to put all tasks into separated modules, with strictly defined interfaces and as less knowledge about other components, as possible.
|
||||
This way, all timeline data is now in the `data_storage`, including the GC, walreceiver, `RemoteTimelineIndex`, `LayerMap`, etc. with some API to get the data in the way,
|
||||
more convenient for the data sync system inside.
|
||||
So far, it seems that a few maps with `Arc<RwLock<SeparateData>>` with actual data operations added inside each `SeparateData` struct, if needed.
|
||||
|
||||
`page_cache` is proposed to placed into the same `data_storage` since it contains tenant timelines' data: this way, all metadata and data is in the same struct, simplifying things with Rust's borrow checker and allowing us to share internals between data modules and later might simplify timeline in-memory size tracking.
|
||||
|
||||
`task_manager` is related to data storage and manages all tenant and timeline tasks, manages shared resources (runtimes, thread pools, etcd connection, etc.) and synchronizes tasks.
|
||||
All locks such as `gc_cs` belong to this module tree, as primitives inherently related to the task synchronization.
|
||||
Tasks have to access timelines and their metadata, but should do that through `data_storage` API and similar.
|
||||
|
||||
`task_manager` should (re)start, stop and track all tasks that are run in it, selecting an appropriate runtime depending on a task kind (we have async/sync task separation, CPU and IO bound tasks separation, ...)
|
||||
Some locks such as `layer_removal_cs` one are not needed, if the only component that starts the tasks ensures they don't run concurrently.
|
||||
|
||||
`LayeredTimeline` is still split into two parts, more high-level with whatever primitives needed to sync its state, and the actual state storage with `LayerMap` and other low level entities.
|
||||
Only `LayerMap` knows what storage it's layer files are taken from (inmem, local FS, etc.), and it's responsible for synchronizing the layers when needed, as also reacting to sync events, successful or not.
|
||||
|
||||
Last but not least, `tenant config file` has to be backed into a remote storage, as tenant-specific information for all timelines.
|
||||
Tenant and timelines have volatile information that's now partially mixed with constant information (e.g. fields in `metadata` file), that model should be better split and handled, in case we want to properly support its backups and synchronization.
|
||||
|
||||

|
||||
|
||||
There's still a need to keep inmemory layer buffer synchronized during layer freezing, yet that could happen on a layer level, not on a timeline level, as `write_lock` used to be, so we could lower the sync primitives one layer deeper, preparing us for download on demand feature, where multiple layers could be concurrently streamed and written from various data sources.
|
||||
|
||||
Flushing the frozen layer requires creating a new layer on disk and further remote storage upload, so `LayerMap` has to get those flushed bytes and queue them later: no need to block in the timeline itself for anything again, rather locking on the layer level, if needed.
|
||||
|
||||

|
||||
|
||||
Lock diagrams legend:
|
||||
|
||||

|
||||
|
||||
After the frozen layers are flushed, something has to ensure that the layer structure is intact, so a repartitioning lock is needed still, and could also guard the layer map structure changes, since both are needed either way.
|
||||
This locking belongs to the `LowLevelLayeredTimeline` from the proposed data structure diagram, as the place with all such data being held.
|
||||
|
||||
Similarly, branching is still required to be done after certain Lsn in our current model, but this needs only one lock to synchronize and that could be the `gc_cs: Mutex<()>` lock.
|
||||
It raises the question of where this lock has to be placed, it's the only place that requires pausing a GC task during external, HTTP request handling.
|
||||
The right place for the lock seems to be the `task_manager` that could manage GC in more fine-grained way to accommodate the incoming branching request.
|
||||
|
||||
There's no explicit lock sync between GC, compaction or other mutually exclusive tasks: it is a job of the `task_manager` to ensure those are not run concurrently.
|
||||
@@ -15,7 +15,7 @@ The stateless compute node that performs validation is separate from the storage
|
||||
|
||||
Limit the maximum size of a PostgreSQL instance to limit free tier users (and other tiers in the future).
|
||||
First of all, this is needed to control our free tier production costs.
|
||||
Another reason to limit resources is risk management — we haven't (fully) tested and optimized zenith for big clusters,
|
||||
Another reason to limit resources is risk management — we haven't (fully) tested and optimized neon for big clusters,
|
||||
so we don't want to give users access to the functionality that we don't think is ready.
|
||||
|
||||
## Components
|
||||
@@ -43,20 +43,20 @@ Then this size should be reported to compute node.
|
||||
|
||||
`current_timeline_size` value is included in the walreceiver's custom feedback message: `ReplicationFeedback.`
|
||||
|
||||
(PR about protocol changes https://github.com/zenithdb/zenith/pull/1037).
|
||||
(PR about protocol changes https://github.com/neondatabase/neon/pull/1037).
|
||||
|
||||
This message is received by the safekeeper and propagated to compute node as a part of `AppendResponse`.
|
||||
|
||||
Finally, when compute node receives the `current_timeline_size` from safekeeper (or from pageserver directly), it updates the global variable.
|
||||
|
||||
And then every zenith_extend() operation checks if limit is reached `(current_timeline_size > neon.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so.
|
||||
And then every neon_extend() operation checks if limit is reached `(current_timeline_size > neon.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so.
|
||||
(see Postgres error codes [https://www.postgresql.org/docs/devel/errcodes-appendix.html](https://www.postgresql.org/docs/devel/errcodes-appendix.html))
|
||||
|
||||
TODO:
|
||||
We can allow autovacuum processes to bypass this check, simply checking `IsAutoVacuumWorkerProcess()`.
|
||||
It would be nice to allow manual VACUUM and VACUUM FULL to bypass the check, but it's uneasy to distinguish these operations at the low level.
|
||||
See issues https://github.com/neondatabase/neon/issues/1245
|
||||
https://github.com/zenithdb/zenith/issues/1445
|
||||
https://github.com/neondatabase/neon/issues/1445
|
||||
|
||||
TODO:
|
||||
We should warn users if the limit is soon to be reached.
|
||||
|
||||
|
After Width: | Height: | Size: 15 KiB |
|
After Width: | Height: | Size: 62 KiB |
|
After Width: | Height: | Size: 33 KiB |
|
After Width: | Height: | Size: 34 KiB |
|
After Width: | Height: | Size: 70 KiB |
|
After Width: | Height: | Size: 49 KiB |
|
After Width: | Height: | Size: 33 KiB |
@@ -10,7 +10,7 @@ Intended to be used in integration tests and in CLI tools for local installation
|
||||
|
||||
`/docs`:
|
||||
|
||||
Documentation of the Zenith features and concepts.
|
||||
Documentation of the Neon features and concepts.
|
||||
Now it is mostly dev documentation.
|
||||
|
||||
`/monitoring`:
|
||||
@@ -19,7 +19,7 @@ TODO
|
||||
|
||||
`/pageserver`:
|
||||
|
||||
Zenith storage service.
|
||||
Neon storage service.
|
||||
The pageserver has a few different duties:
|
||||
|
||||
- Store and manage the data.
|
||||
@@ -54,7 +54,7 @@ PostgreSQL extension that contains functions needed for testing and debugging.
|
||||
|
||||
`/safekeeper`:
|
||||
|
||||
The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
|
||||
The neon WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
|
||||
It acts as a holding area and redistribution center for recently generated WAL.
|
||||
|
||||
For more detailed info, see [walservice.md](./walservice.md)
|
||||
@@ -64,11 +64,6 @@ The workspace_hack crate exists only to pin down some dependencies.
|
||||
|
||||
We use [cargo-hakari](https://crates.io/crates/cargo-hakari) for automation.
|
||||
|
||||
`/zenith`
|
||||
|
||||
Main entry point for the 'zenith' CLI utility.
|
||||
TODO: Doesn't it belong to control_plane?
|
||||
|
||||
`/libs`:
|
||||
Unites granular neon helper crates under the hood.
|
||||
|
||||
@@ -134,3 +129,52 @@ Also consider:
|
||||
To add new package or change an existing one you can use `poetry add` or `poetry update` or edit `pyproject.toml` manually. Do not forget to run `poetry lock` in the latter case.
|
||||
|
||||
More details are available in poetry's [documentation](https://python-poetry.org/docs/).
|
||||
|
||||
## Configuring IDEs
|
||||
Neon consists of three projects in different languages which use different project models.
|
||||
|
||||
* A bunch of Rust crates, all available from the root `Cargo.toml`.
|
||||
* Integration tests in Python in the `test_runner` directory. Some stand-alone Python scripts exist as well.
|
||||
* Postgres and our Postgres extensions in C built with Makefiles under `vendor/postgres` and `pgxn`.
|
||||
|
||||
### CLion
|
||||
You can use CLion with the [Rust plugin](https://plugins.jetbrains.com/plugin/8182-rust) to develop Neon. It should pick up Rust and Python projects whenever you open Neon's repository as a project. We have not tried setting up a debugger, though.
|
||||
|
||||
C code requires some extra care, as it's built via Make, not CMake. Some of our developers have successfully used [compilation database](https://www.jetbrains.com/help/clion/compilation-database.html#compdb_generate) for CLion. It is a JSON file which lists all C source files and corresponding compilation keys. CLion can use it instead of `CMakeLists.txt`. To set up a project with a compilation database:
|
||||
|
||||
1. Clone the Neon repository and install all dependencies, including Python. Do not open it with CLion just yet.
|
||||
2. Run the following commands in the repository's root:
|
||||
```bash
|
||||
# Install a `compiledb` tool which can parse make's output and generate the compilation database.
|
||||
poetry add -D compiledb
|
||||
# Clean the build tree so we can rebuild from scratch.
|
||||
# Unfortunately, our and Postgres Makefiles do not work well with either --dry-run or --assume-new,
|
||||
# so we don't know a way to generate the compilation database without recompiling everything,
|
||||
# see https://github.com/neondatabase/neon/issues/2378#issuecomment-1241421325
|
||||
make distclean
|
||||
# Rebuild the Postgres parts from scratch and save the compilation commands to the compilation database.
|
||||
# You can alter the -j parameter to your liking.
|
||||
# Note that we only build for a specific version of Postgres. The extension code is shared, but headers are
|
||||
# different, so we set up CLion to only use a specific version of the headers.
|
||||
make -j$(nproc) --print-directory postgres-v15 neon-pg-ext-v15 | poetry run compiledb --verbose --no-build
|
||||
# Uninstall the tool
|
||||
poetry remove -D compiledb
|
||||
# Make sure the compile_commands.json file is not committed.
|
||||
echo /compile_commands.json >>.git/info/exclude
|
||||
```
|
||||
3. Open CLion, click "Open File or Project" and choose the generated `compile_commands.json` file to be opened "as a project". You cannot add a compilation database into an existing CLion project, you have to create a new one. _Do not_ open the directory as a project, open the file.
|
||||
4. The newly created project should start indexing Postgres source code in C, as well as the C standard library. You may have to [configure the C compiler for the compilation database](https://www.jetbrains.com/help/clion/compilation-database.html#compdb_toolchain).
|
||||
5. Open the `Cargo.toml` file in an editor in the same project. CLion should pick up the hint and start indexing Rust code.
|
||||
6. Now you have a CLion project which knows about C files, Rust files. It should pick up Python files automatically as well.
|
||||
7. Set up correct code indentation in CLion's settings: Editor > Code Style > C/C++, choose the "Project" scheme on the top, and tick the "Use tab character" on the "Tabs and Indents" tab. Ensure that "Tab size" is 4.
|
||||
|
||||
You can also enable Cargo Clippy diagnostics and enable Rustfmt instead of built-in code formatter.
|
||||
|
||||
Whenever you change layout of C files, you may need to regenerate the compilation database. No need to re-create the CLion project, changes should be picked up automatically.
|
||||
|
||||
Known issues (fixes and suggestions are welcome):
|
||||
|
||||
* Test results may be hard to read in CLion, both for unit tests in Rust and integration tests in Python. Use command line to run them instead.
|
||||
* CLion does not support non-local Python interpreters, unlike PyCharm. E.g. if you use WSL, CLion does not see `poetry` and installed dependencies. Python support is limited.
|
||||
* Cargo Clippy diagnostics in CLion may take a lot of resources.
|
||||
* `poetry add -D` updates some packages and changes `poetry.lock` drastically even when followed by `poetry remove -D`. Feel free to `git checkout poetry.lock` and `./scripts/pysync` to revert these changes.
|
||||
|
||||
@@ -11,7 +11,7 @@ use std::{fmt::Display, str::FromStr};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::{Captures, Regex};
|
||||
use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId};
|
||||
use utils::id::{NodeId, TenantId, TenantTimelineId};
|
||||
|
||||
/// The subscription kind to the timeline updates from safekeeper.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
@@ -30,13 +30,13 @@ pub enum SubscriptionKind {
|
||||
/// Get every update in etcd.
|
||||
All,
|
||||
/// Get etcd updates for any timeiline of a certain tenant, affected by any operation from any node kind.
|
||||
TenantTimelines(ZTenantId),
|
||||
TenantTimelines(TenantId),
|
||||
/// Get etcd updates for a certain timeline of a tenant, affected by any operation from any node kind.
|
||||
Timeline(ZTenantTimelineId),
|
||||
Timeline(TenantTimelineId),
|
||||
/// Get etcd timeline updates, specific to a certain node kind.
|
||||
Node(ZTenantTimelineId, NodeKind),
|
||||
Node(TenantTimelineId, NodeKind),
|
||||
/// Get etcd timeline updates for a certain operation on specific nodes.
|
||||
Operation(ZTenantTimelineId, NodeKind, OperationKind),
|
||||
Operation(TenantTimelineId, NodeKind, OperationKind),
|
||||
}
|
||||
|
||||
/// All kinds of nodes, able to write into etcd.
|
||||
@@ -67,7 +67,7 @@ static SUBSCRIPTION_FULL_KEY_REGEX: Lazy<Regex> = Lazy::new(|| {
|
||||
/// No other etcd keys are considered during system's work.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct SubscriptionFullKey {
|
||||
pub id: ZTenantTimelineId,
|
||||
pub id: TenantTimelineId,
|
||||
pub node_kind: NodeKind,
|
||||
pub operation: OperationKind,
|
||||
pub node_id: NodeId,
|
||||
@@ -83,7 +83,7 @@ impl SubscriptionKey {
|
||||
}
|
||||
|
||||
/// Subscribes to a given timeline info updates from safekeepers.
|
||||
pub fn sk_timeline_info(cluster_prefix: String, timeline: ZTenantTimelineId) -> Self {
|
||||
pub fn sk_timeline_info(cluster_prefix: String, timeline: TenantTimelineId) -> Self {
|
||||
Self {
|
||||
cluster_prefix,
|
||||
kind: SubscriptionKind::Operation(
|
||||
@@ -97,7 +97,7 @@ impl SubscriptionKey {
|
||||
/// Subscribes to all timeine updates during specific operations, running on the corresponding nodes.
|
||||
pub fn operation(
|
||||
cluster_prefix: String,
|
||||
timeline: ZTenantTimelineId,
|
||||
timeline: TenantTimelineId,
|
||||
node_kind: NodeKind,
|
||||
operation: OperationKind,
|
||||
) -> Self {
|
||||
@@ -175,7 +175,7 @@ impl FromStr for SubscriptionFullKey {
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
id: ZTenantTimelineId::new(
|
||||
id: TenantTimelineId::new(
|
||||
parse_capture(&key_captures, 1)?,
|
||||
parse_capture(&key_captures, 2)?,
|
||||
),
|
||||
@@ -247,7 +247,7 @@ impl FromStr for SkOperationKind {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use utils::zid::ZTimelineId;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -256,9 +256,9 @@ mod tests {
|
||||
let prefix = "neon";
|
||||
let node_kind = NodeKind::Safekeeper;
|
||||
let operation_kind = OperationKind::Safekeeper(SkOperationKind::WalBackup);
|
||||
let tenant_id = ZTenantId::generate();
|
||||
let timeline_id = ZTimelineId::generate();
|
||||
let id = ZTenantTimelineId::new(tenant_id, timeline_id);
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let id = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
let node_id = NodeId(1);
|
||||
|
||||
let timeline_subscription_keys = [
|
||||
|
||||
@@ -21,7 +21,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
env_logger = "0.9"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
wal_craft = { path = "wal_craft" }
|
||||
|
||||
[build-dependencies]
|
||||
|
||||
@@ -11,6 +11,6 @@ clap = "3.0"
|
||||
env_logger = "0.9"
|
||||
log = "0.4"
|
||||
once_cell = "1.13.0"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres_ffi = { path = "../" }
|
||||
tempfile = "3.2"
|
||||
|
||||
@@ -344,6 +344,8 @@ impl Debug for S3Config {
|
||||
}
|
||||
}
|
||||
|
||||
/// Adds a suffix to the file(directory) name, either appending the suffux to the end of its extension,
|
||||
/// or if there's no extension, creates one and puts a suffix there.
|
||||
pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str) -> PathBuf {
|
||||
let new_extension = match original_path
|
||||
.as_ref()
|
||||
@@ -468,6 +470,11 @@ mod tests {
|
||||
&path_with_suffix_extension(&p, ".temp").to_string_lossy(),
|
||||
"/foo/bar.baz..temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar/dir/");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, ".temp").to_string_lossy(),
|
||||
"/foo/bar/dir..temp"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -21,6 +21,8 @@ use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectId}
|
||||
|
||||
use super::{strip_path_prefix, RemoteStorage, StorageMetadata};
|
||||
|
||||
const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
|
||||
/// Convert a Path in the remote storage into a RemoteObjectId
|
||||
fn remote_object_id_from_path(path: &Path) -> anyhow::Result<RemoteObjectId> {
|
||||
Ok(RemoteObjectId(
|
||||
@@ -143,7 +145,8 @@ impl RemoteStorage for LocalFs {
|
||||
// We need this dance with sort of durable rename (without fsyncs)
|
||||
// to prevent partial uploads. This was really hit when pageserver shutdown
|
||||
// cancelled the upload and partial file was left on the fs
|
||||
let temp_file_path = path_with_suffix_extension(&target_file_path, "temp");
|
||||
let temp_file_path =
|
||||
path_with_suffix_extension(&target_file_path, LOCAL_FS_TEMP_FILE_SUFFIX);
|
||||
let mut destination = io::BufWriter::new(
|
||||
fs::OpenOptions::new()
|
||||
.write(true)
|
||||
|
||||
@@ -4,18 +4,20 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
async-trait = "0.1"
|
||||
anyhow = "1.0"
|
||||
bincode = "1.3"
|
||||
bytes = "1.0.1"
|
||||
hyper = { version = "0.14.7", features = ["full"] }
|
||||
pin-project-lite = "0.2.7"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
routerify = "3"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
thiserror = "1.0"
|
||||
tokio = { version = "1.17", features = ["macros"]}
|
||||
tokio-rustls = "0.23"
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
nix = "0.23.0"
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#![allow(unused)]
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use utils::zid;
|
||||
use utils::id;
|
||||
|
||||
pub fn bench_zid_stringify(c: &mut Criterion) {
|
||||
// Can only use public methods.
|
||||
let ztl = zid::ZTenantTimelineId::generate();
|
||||
let ztl = id::TenantTimelineId::generate();
|
||||
|
||||
c.bench_function("zid.to_string", |b| {
|
||||
b.iter(|| {
|
||||
|
||||
@@ -14,7 +14,7 @@ use jsonwebtoken::{
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
use crate::zid::ZTenantId;
|
||||
use crate::id::TenantId;
|
||||
|
||||
const JWT_ALGORITHM: Algorithm = Algorithm::RS256;
|
||||
|
||||
@@ -30,23 +30,23 @@ pub enum Scope {
|
||||
pub struct Claims {
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub tenant_id: Option<ZTenantId>,
|
||||
pub tenant_id: Option<TenantId>,
|
||||
pub scope: Scope,
|
||||
}
|
||||
|
||||
impl Claims {
|
||||
pub fn new(tenant_id: Option<ZTenantId>, scope: Scope) -> Self {
|
||||
pub fn new(tenant_id: Option<TenantId>, scope: Scope) -> Self {
|
||||
Self { tenant_id, scope }
|
||||
}
|
||||
}
|
||||
|
||||
pub fn check_permission(claims: &Claims, tenantid: Option<ZTenantId>) -> Result<()> {
|
||||
match (&claims.scope, tenantid) {
|
||||
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
|
||||
match (&claims.scope, tenant_id) {
|
||||
(Scope::Tenant, None) => {
|
||||
bail!("Attempt to access management api with tenant scope. Permission denied")
|
||||
}
|
||||
(Scope::Tenant, Some(tenantid)) => {
|
||||
if claims.tenant_id.unwrap() != tenantid {
|
||||
(Scope::Tenant, Some(tenant_id)) => {
|
||||
if claims.tenant_id.unwrap() != tenant_id {
|
||||
bail!("Tenant id mismatch. Permission denied")
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::auth::{self, Claims, JwtAuth};
|
||||
use crate::http::error;
|
||||
use crate::zid::ZTenantId;
|
||||
use crate::id::TenantId;
|
||||
use anyhow::anyhow;
|
||||
use hyper::header::AUTHORIZATION;
|
||||
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
|
||||
@@ -137,9 +137,9 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||
})
|
||||
}
|
||||
|
||||
pub fn check_permission(req: &Request<Body>, tenantid: Option<ZTenantId>) -> Result<(), ApiError> {
|
||||
pub fn check_permission(req: &Request<Body>, tenant_id: Option<TenantId>) -> Result<(), ApiError> {
|
||||
match req.context::<Claims>() {
|
||||
Some(claims) => Ok(auth::check_permission(&claims, tenantid)
|
||||
Some(claims) => Ok(auth::check_permission(&claims, tenant_id)
|
||||
.map_err(|err| ApiError::Forbidden(err.to_string()))?),
|
||||
None => Ok(()), // claims is None because auth is disabled
|
||||
}
|
||||
|
||||
@@ -3,6 +3,6 @@ pub mod error;
|
||||
pub mod json;
|
||||
pub mod request;
|
||||
|
||||
/// Current fast way to apply simple http routing in various Zenith binaries.
|
||||
/// Current fast way to apply simple http routing in various Neon binaries.
|
||||
/// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed.
|
||||
pub use routerify::{ext::RequestExt, RouterBuilder, RouterService};
|
||||
|
||||
@@ -4,7 +4,7 @@ use hex::FromHex;
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Zenith ID is a 128-bit random ID.
|
||||
/// Neon ID is a 128-bit random ID.
|
||||
/// Used to represent various identifiers. Provides handy utility methods and impls.
|
||||
///
|
||||
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
|
||||
@@ -13,13 +13,13 @@ use serde::{Deserialize, Serialize};
|
||||
/// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`.
|
||||
/// Check the `serde_with::serde_as` documentation for options for more complex types.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
|
||||
struct ZId([u8; 16]);
|
||||
struct Id([u8; 16]);
|
||||
|
||||
impl ZId {
|
||||
pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> ZId {
|
||||
impl Id {
|
||||
pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> Id {
|
||||
let mut arr = [0u8; 16];
|
||||
buf.copy_to_slice(&mut arr);
|
||||
ZId::from(arr)
|
||||
Id::from(arr)
|
||||
}
|
||||
|
||||
pub fn as_arr(&self) -> [u8; 16] {
|
||||
@@ -29,7 +29,7 @@ impl ZId {
|
||||
pub fn generate() -> Self {
|
||||
let mut tli_buf = [0u8; 16];
|
||||
rand::thread_rng().fill(&mut tli_buf);
|
||||
ZId::from(tli_buf)
|
||||
Id::from(tli_buf)
|
||||
}
|
||||
|
||||
fn hex_encode(&self) -> String {
|
||||
@@ -44,54 +44,54 @@ impl ZId {
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for ZId {
|
||||
impl FromStr for Id {
|
||||
type Err = hex::FromHexError;
|
||||
|
||||
fn from_str(s: &str) -> Result<ZId, Self::Err> {
|
||||
fn from_str(s: &str) -> Result<Id, Self::Err> {
|
||||
Self::from_hex(s)
|
||||
}
|
||||
}
|
||||
|
||||
// this is needed for pretty serialization and deserialization of ZId's using serde integration with hex crate
|
||||
impl FromHex for ZId {
|
||||
// this is needed for pretty serialization and deserialization of Id's using serde integration with hex crate
|
||||
impl FromHex for Id {
|
||||
type Error = hex::FromHexError;
|
||||
|
||||
fn from_hex<T: AsRef<[u8]>>(hex: T) -> Result<Self, Self::Error> {
|
||||
let mut buf: [u8; 16] = [0u8; 16];
|
||||
hex::decode_to_slice(hex, &mut buf)?;
|
||||
Ok(ZId(buf))
|
||||
Ok(Id(buf))
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for ZId {
|
||||
impl AsRef<[u8]> for Id {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl From<[u8; 16]> for ZId {
|
||||
impl From<[u8; 16]> for Id {
|
||||
fn from(b: [u8; 16]) -> Self {
|
||||
ZId(b)
|
||||
Id(b)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ZId {
|
||||
impl fmt::Display for Id {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.write_str(&self.hex_encode())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for ZId {
|
||||
impl fmt::Debug for Id {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.write_str(&self.hex_encode())
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! zid_newtype {
|
||||
macro_rules! id_newtype {
|
||||
($t:ident) => {
|
||||
impl $t {
|
||||
pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> $t {
|
||||
$t(ZId::get_from_buf(buf))
|
||||
$t(Id::get_from_buf(buf))
|
||||
}
|
||||
|
||||
pub fn as_arr(&self) -> [u8; 16] {
|
||||
@@ -99,11 +99,11 @@ macro_rules! zid_newtype {
|
||||
}
|
||||
|
||||
pub fn generate() -> Self {
|
||||
$t(ZId::generate())
|
||||
$t(Id::generate())
|
||||
}
|
||||
|
||||
pub const fn from_array(b: [u8; 16]) -> Self {
|
||||
$t(ZId(b))
|
||||
$t(Id(b))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -111,14 +111,14 @@ macro_rules! zid_newtype {
|
||||
type Err = hex::FromHexError;
|
||||
|
||||
fn from_str(s: &str) -> Result<$t, Self::Err> {
|
||||
let value = ZId::from_str(s)?;
|
||||
let value = Id::from_str(s)?;
|
||||
Ok($t(value))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<[u8; 16]> for $t {
|
||||
fn from(b: [u8; 16]) -> Self {
|
||||
$t(ZId::from(b))
|
||||
$t(Id::from(b))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -126,7 +126,7 @@ macro_rules! zid_newtype {
|
||||
type Error = hex::FromHexError;
|
||||
|
||||
fn from_hex<T: AsRef<[u8]>>(hex: T) -> Result<Self, Self::Error> {
|
||||
Ok($t(ZId::from_hex(hex)?))
|
||||
Ok($t(Id::from_hex(hex)?))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -150,7 +150,7 @@ macro_rules! zid_newtype {
|
||||
};
|
||||
}
|
||||
|
||||
/// Zenith timeline IDs are different from PostgreSQL timeline
|
||||
/// Neon timeline IDs are different from PostgreSQL timeline
|
||||
/// IDs. They serve a similar purpose though: they differentiate
|
||||
/// between different "histories" of the same cluster. However,
|
||||
/// PostgreSQL timeline IDs are a bit cumbersome, because they are only
|
||||
@@ -158,7 +158,7 @@ macro_rules! zid_newtype {
|
||||
/// timeline history. Those limitations mean that we cannot generate a
|
||||
/// new PostgreSQL timeline ID by just generating a random number. And
|
||||
/// that in turn is problematic for the "pull/push" workflow, where you
|
||||
/// have a local copy of a zenith repository, and you periodically sync
|
||||
/// have a local copy of a Neon repository, and you periodically sync
|
||||
/// the local changes with a remote server. When you work "detached"
|
||||
/// from the remote server, you cannot create a PostgreSQL timeline ID
|
||||
/// that's guaranteed to be different from all existing timelines in
|
||||
@@ -168,55 +168,55 @@ macro_rules! zid_newtype {
|
||||
/// branches? If they pick the same one, and later try to push the
|
||||
/// branches to the same remote server, they will get mixed up.
|
||||
///
|
||||
/// To avoid those issues, Zenith has its own concept of timelines that
|
||||
/// To avoid those issues, Neon has its own concept of timelines that
|
||||
/// is separate from PostgreSQL timelines, and doesn't have those
|
||||
/// limitations. A zenith timeline is identified by a 128-bit ID, which
|
||||
/// limitations. A Neon timeline is identified by a 128-bit ID, which
|
||||
/// is usually printed out as a hex string.
|
||||
///
|
||||
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
|
||||
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
|
||||
/// See [`ZId`] for alternative ways to serialize it.
|
||||
/// See [`Id`] for alternative ways to serialize it.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
|
||||
pub struct ZTimelineId(ZId);
|
||||
pub struct TimelineId(Id);
|
||||
|
||||
zid_newtype!(ZTimelineId);
|
||||
id_newtype!(TimelineId);
|
||||
|
||||
/// Zenith Tenant Id represents identifiar of a particular tenant.
|
||||
/// Neon Tenant Id represents identifiar of a particular tenant.
|
||||
/// Is used for distinguishing requests and data belonging to different users.
|
||||
///
|
||||
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
|
||||
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
|
||||
/// See [`ZId`] for alternative ways to serialize it.
|
||||
/// See [`Id`] for alternative ways to serialize it.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
|
||||
pub struct ZTenantId(ZId);
|
||||
pub struct TenantId(Id);
|
||||
|
||||
zid_newtype!(ZTenantId);
|
||||
id_newtype!(TenantId);
|
||||
|
||||
// A pair uniquely identifying Zenith instance.
|
||||
// A pair uniquely identifying Neon instance.
|
||||
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct ZTenantTimelineId {
|
||||
pub tenant_id: ZTenantId,
|
||||
pub timeline_id: ZTimelineId,
|
||||
pub struct TenantTimelineId {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
}
|
||||
|
||||
impl ZTenantTimelineId {
|
||||
pub fn new(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Self {
|
||||
ZTenantTimelineId {
|
||||
impl TenantTimelineId {
|
||||
pub fn new(tenant_id: TenantId, timeline_id: TimelineId) -> Self {
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn generate() -> Self {
|
||||
Self::new(ZTenantId::generate(), ZTimelineId::generate())
|
||||
Self::new(TenantId::generate(), TimelineId::generate())
|
||||
}
|
||||
|
||||
pub fn empty() -> Self {
|
||||
Self::new(ZTenantId::from([0u8; 16]), ZTimelineId::from([0u8; 16]))
|
||||
Self::new(TenantId::from([0u8; 16]), TimelineId::from([0u8; 16]))
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ZTenantTimelineId {
|
||||
impl fmt::Display for TenantTimelineId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "{}/{}", self.tenant_id, self.timeline_id)
|
||||
}
|
||||
@@ -14,11 +14,9 @@ pub mod simple_rcu;
|
||||
/// append only ordered map implemented with a Vec
|
||||
pub mod vec_map;
|
||||
|
||||
// Async version of SeqWait. Currently unused.
|
||||
// pub mod seqwait_async;
|
||||
|
||||
pub mod bin_ser;
|
||||
pub mod postgres_backend;
|
||||
pub mod postgres_backend_async;
|
||||
pub mod pq_proto;
|
||||
|
||||
// dealing with connstring parsing and handy access to it's parts
|
||||
@@ -31,7 +29,7 @@ pub mod crashsafe_dir;
|
||||
pub mod auth;
|
||||
|
||||
// utility functions and helper traits for unified unique id generation/serialization etc.
|
||||
pub mod zid;
|
||||
pub mod id;
|
||||
// http endpoint utils
|
||||
pub mod http;
|
||||
|
||||
|
||||
@@ -63,7 +63,7 @@ pub enum AuthType {
|
||||
Trust,
|
||||
MD5,
|
||||
// This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT
|
||||
ZenithJWT,
|
||||
NeonJWT,
|
||||
}
|
||||
|
||||
impl FromStr for AuthType {
|
||||
@@ -73,8 +73,8 @@ impl FromStr for AuthType {
|
||||
match s {
|
||||
"Trust" => Ok(Self::Trust),
|
||||
"MD5" => Ok(Self::MD5),
|
||||
"ZenithJWT" => Ok(Self::ZenithJWT),
|
||||
_ => bail!("invalid value \"{}\" for auth type", s),
|
||||
"NeonJWT" => Ok(Self::NeonJWT),
|
||||
_ => bail!("invalid value \"{s}\" for auth type"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -84,7 +84,7 @@ impl fmt::Display for AuthType {
|
||||
f.write_str(match self {
|
||||
AuthType::Trust => "Trust",
|
||||
AuthType::MD5 => "MD5",
|
||||
AuthType::ZenithJWT => "ZenithJWT",
|
||||
AuthType::NeonJWT => "NeonJWT",
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -376,7 +376,7 @@ impl PostgresBackend {
|
||||
))?;
|
||||
self.state = ProtoState::Authentication;
|
||||
}
|
||||
AuthType::ZenithJWT => {
|
||||
AuthType::NeonJWT => {
|
||||
self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
|
||||
self.state = ProtoState::Authentication;
|
||||
}
|
||||
@@ -403,7 +403,7 @@ impl PostgresBackend {
|
||||
bail!("auth failed: {}", e);
|
||||
}
|
||||
}
|
||||
AuthType::ZenithJWT => {
|
||||
AuthType::NeonJWT => {
|
||||
let (_, jwt_response) = m.split_last().context("protocol violation")?;
|
||||
|
||||
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
|
||||
|
||||
485
libs/utils/src/postgres_backend_async.rs
Normal file
@@ -0,0 +1,485 @@
|
||||
//! Server-side asynchronous Postgres connection, as limited as we need.
|
||||
//! To use, create PostgresBackend and run() it, passing the Handler
|
||||
//! implementation determining how to process the queries. Currently its API
|
||||
//! is rather narrow, but we can extend it once required.
|
||||
|
||||
use crate::postgres_backend::AuthType;
|
||||
use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use rand::Rng;
|
||||
use std::future::Future;
|
||||
use std::net::SocketAddr;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::task::Poll;
|
||||
use tracing::{debug, error, trace};
|
||||
|
||||
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
|
||||
use tokio_rustls::TlsAcceptor;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
pub trait Handler {
|
||||
/// Handle single query.
|
||||
/// postgres_backend will issue ReadyForQuery after calling this (this
|
||||
/// might be not what we want after CopyData streaming, but currently we don't
|
||||
/// care).
|
||||
async fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>;
|
||||
|
||||
/// Called on startup packet receival, allows to process params.
|
||||
///
|
||||
/// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
|
||||
/// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
|
||||
/// to override whole init logic in implementations.
|
||||
fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check auth md5
|
||||
fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> {
|
||||
bail!("MD5 auth failed")
|
||||
}
|
||||
|
||||
/// Check auth jwt
|
||||
fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
|
||||
bail!("JWT auth failed")
|
||||
}
|
||||
}
|
||||
|
||||
/// PostgresBackend protocol state.
|
||||
/// XXX: The order of the constructors matters.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)]
|
||||
pub enum ProtoState {
|
||||
Initialization,
|
||||
Encrypted,
|
||||
Authentication,
|
||||
Established,
|
||||
Closed,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum ProcessMsgResult {
|
||||
Continue,
|
||||
Break,
|
||||
}
|
||||
|
||||
/// Always-writeable sock_split stream.
|
||||
/// May not be readable. See [`PostgresBackend::take_stream_in`]
|
||||
pub enum Stream {
|
||||
Unencrypted(tokio::net::TcpStream),
|
||||
Tls(Box<tokio_rustls::server::TlsStream<tokio::net::TcpStream>>),
|
||||
Broken,
|
||||
}
|
||||
|
||||
impl AsyncWrite for Stream {
|
||||
fn poll_write(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<Result<usize, std::io::Error>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf),
|
||||
Self::Broken => unreachable!(),
|
||||
}
|
||||
}
|
||||
fn poll_flush(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_flush(cx),
|
||||
Self::Broken => unreachable!(),
|
||||
}
|
||||
}
|
||||
fn poll_shutdown(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx),
|
||||
Self::Broken => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl AsyncRead for Stream {
|
||||
fn poll_read(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &mut tokio::io::ReadBuf<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf),
|
||||
Self::Broken => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PostgresBackend {
|
||||
stream: Stream,
|
||||
// Output buffer. c.f. BeMessage::write why we are using BytesMut here.
|
||||
buf_out: BytesMut,
|
||||
|
||||
pub state: ProtoState,
|
||||
|
||||
md5_salt: [u8; 4],
|
||||
auth_type: AuthType,
|
||||
|
||||
peer_addr: SocketAddr,
|
||||
pub tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
}
|
||||
|
||||
pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
|
||||
let mut query_string = query_string.to_vec();
|
||||
if let Some(ch) = query_string.last() {
|
||||
if *ch == 0 {
|
||||
query_string.pop();
|
||||
}
|
||||
}
|
||||
query_string
|
||||
}
|
||||
|
||||
// Cast a byte slice to a string slice, dropping null terminator if there's one.
|
||||
fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
|
||||
let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
|
||||
std::str::from_utf8(without_null).map_err(|e| e.into())
|
||||
}
|
||||
|
||||
impl PostgresBackend {
|
||||
pub fn new(
|
||||
socket: tokio::net::TcpStream,
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
) -> std::io::Result<Self> {
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
|
||||
Ok(Self {
|
||||
stream: Stream::Unencrypted(socket),
|
||||
buf_out: BytesMut::with_capacity(10 * 1024),
|
||||
state: ProtoState::Initialization,
|
||||
md5_salt: [0u8; 4],
|
||||
auth_type,
|
||||
tls_config,
|
||||
peer_addr,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_peer_addr(&self) -> &SocketAddr {
|
||||
&self.peer_addr
|
||||
}
|
||||
|
||||
/// Read full message or return None if connection is closed.
|
||||
pub async fn read_message(&mut self) -> Result<Option<FeMessage>> {
|
||||
use ProtoState::*;
|
||||
match self.state {
|
||||
Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await,
|
||||
Authentication | Established => FeMessage::read_fut(&mut self.stream).await,
|
||||
Closed => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// Flush output buffer into the socket.
|
||||
pub async fn flush(&mut self) -> std::io::Result<&mut Self> {
|
||||
self.stream.write_all(&self.buf_out).await?;
|
||||
self.buf_out.clear();
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
/// Write message into internal output buffer.
|
||||
pub fn write_message(&mut self, message: &BeMessage<'_>) -> Result<&mut Self, std::io::Error> {
|
||||
BeMessage::write(&mut self.buf_out, message)?;
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
// Wrapper for run_message_loop() that shuts down socket when we are done
|
||||
pub async fn run<F, S>(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()>
|
||||
where
|
||||
F: Fn() -> S,
|
||||
S: Future,
|
||||
{
|
||||
let ret = self.run_message_loop(handler, shutdown_watcher).await;
|
||||
let _ = self.stream.shutdown();
|
||||
ret
|
||||
}
|
||||
|
||||
async fn run_message_loop<F, S>(
|
||||
&mut self,
|
||||
handler: &mut impl Handler,
|
||||
shutdown_watcher: F,
|
||||
) -> Result<()>
|
||||
where
|
||||
F: Fn() -> S,
|
||||
S: Future,
|
||||
{
|
||||
trace!("postgres backend to {:?} started", self.peer_addr);
|
||||
|
||||
tokio::select!(
|
||||
biased;
|
||||
|
||||
_ = shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
tracing::info!("shutdown request received during handshake");
|
||||
return Ok(())
|
||||
},
|
||||
|
||||
result = async {
|
||||
while self.state < ProtoState::Established {
|
||||
if let Some(msg) = self.read_message().await? {
|
||||
trace!("got message {msg:?} during handshake");
|
||||
|
||||
match self.process_handshake_message(handler, msg).await? {
|
||||
ProcessMsgResult::Continue => {
|
||||
self.flush().await?;
|
||||
continue;
|
||||
}
|
||||
ProcessMsgResult::Break => {
|
||||
trace!("postgres backend to {:?} exited during handshake", self.peer_addr);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
trace!("postgres backend to {:?} exited during handshake", self.peer_addr);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
Ok::<(), anyhow::Error>(())
|
||||
} => {
|
||||
// Handshake complete.
|
||||
result?;
|
||||
}
|
||||
);
|
||||
|
||||
// Authentication completed
|
||||
let mut query_string = Bytes::new();
|
||||
while let Some(msg) = tokio::select!(
|
||||
biased;
|
||||
_ = shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
tracing::info!("shutdown request received in run_message_loop");
|
||||
Ok(None)
|
||||
},
|
||||
msg = self.read_message() => { msg },
|
||||
)? {
|
||||
trace!("got message {:?}", msg);
|
||||
|
||||
let result = self.process_message(handler, msg, &mut query_string).await;
|
||||
self.flush().await?;
|
||||
match result? {
|
||||
ProcessMsgResult::Continue => {
|
||||
self.flush().await?;
|
||||
continue;
|
||||
}
|
||||
ProcessMsgResult::Break => break,
|
||||
}
|
||||
}
|
||||
|
||||
trace!("postgres backend to {:?} exited", self.peer_addr);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn start_tls(&mut self) -> anyhow::Result<()> {
|
||||
if let Stream::Unencrypted(plain_stream) =
|
||||
std::mem::replace(&mut self.stream, Stream::Broken)
|
||||
{
|
||||
let acceptor = TlsAcceptor::from(self.tls_config.clone().unwrap());
|
||||
let tls_stream = acceptor.accept(plain_stream).await?;
|
||||
|
||||
self.stream = Stream::Tls(Box::new(tls_stream));
|
||||
return Ok(());
|
||||
};
|
||||
bail!("TLS already started");
|
||||
}
|
||||
|
||||
async fn process_handshake_message(
|
||||
&mut self,
|
||||
handler: &mut impl Handler,
|
||||
msg: FeMessage,
|
||||
) -> Result<ProcessMsgResult> {
|
||||
assert!(self.state < ProtoState::Established);
|
||||
let have_tls = self.tls_config.is_some();
|
||||
match msg {
|
||||
FeMessage::StartupPacket(m) => {
|
||||
trace!("got startup message {m:?}");
|
||||
|
||||
match m {
|
||||
FeStartupPacket::SslRequest => {
|
||||
debug!("SSL requested");
|
||||
|
||||
self.write_message(&BeMessage::EncryptionResponse(have_tls))?;
|
||||
if have_tls {
|
||||
self.start_tls().await?;
|
||||
self.state = ProtoState::Encrypted;
|
||||
}
|
||||
}
|
||||
FeStartupPacket::GssEncRequest => {
|
||||
debug!("GSS requested");
|
||||
self.write_message(&BeMessage::EncryptionResponse(false))?;
|
||||
}
|
||||
FeStartupPacket::StartupMessage { .. } => {
|
||||
if have_tls && !matches!(self.state, ProtoState::Encrypted) {
|
||||
self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?;
|
||||
bail!("client did not connect with TLS");
|
||||
}
|
||||
|
||||
// NB: startup() may change self.auth_type -- we are using that in proxy code
|
||||
// to bypass auth for new users.
|
||||
handler.startup(self, &m)?;
|
||||
|
||||
match self.auth_type {
|
||||
AuthType::Trust => {
|
||||
self.write_message(&BeMessage::AuthenticationOk)?
|
||||
.write_message(&BeParameterStatusMessage::encoding())?
|
||||
// The async python driver requires a valid server_version
|
||||
.write_message(&BeMessage::ParameterStatus(
|
||||
BeParameterStatusMessage::ServerVersion("14.1"),
|
||||
))?
|
||||
.write_message(&BeMessage::ReadyForQuery)?;
|
||||
self.state = ProtoState::Established;
|
||||
}
|
||||
AuthType::MD5 => {
|
||||
rand::thread_rng().fill(&mut self.md5_salt);
|
||||
self.write_message(&BeMessage::AuthenticationMD5Password(
|
||||
self.md5_salt,
|
||||
))?;
|
||||
self.state = ProtoState::Authentication;
|
||||
}
|
||||
AuthType::NeonJWT => {
|
||||
self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
|
||||
self.state = ProtoState::Authentication;
|
||||
}
|
||||
}
|
||||
}
|
||||
FeStartupPacket::CancelRequest { .. } => {
|
||||
self.state = ProtoState::Closed;
|
||||
return Ok(ProcessMsgResult::Break);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
FeMessage::PasswordMessage(m) => {
|
||||
trace!("got password message '{:?}'", m);
|
||||
|
||||
assert!(self.state == ProtoState::Authentication);
|
||||
|
||||
match self.auth_type {
|
||||
AuthType::Trust => unreachable!(),
|
||||
AuthType::MD5 => {
|
||||
let (_, md5_response) = m.split_last().context("protocol violation")?;
|
||||
|
||||
if let Err(e) = handler.check_auth_md5(self, md5_response) {
|
||||
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
|
||||
bail!("auth failed: {}", e);
|
||||
}
|
||||
}
|
||||
AuthType::NeonJWT => {
|
||||
let (_, jwt_response) = m.split_last().context("protocol violation")?;
|
||||
|
||||
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
|
||||
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
|
||||
bail!("auth failed: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
self.write_message(&BeMessage::AuthenticationOk)?
|
||||
.write_message(&BeParameterStatusMessage::encoding())?
|
||||
.write_message(&BeMessage::ReadyForQuery)?;
|
||||
self.state = ProtoState::Established;
|
||||
}
|
||||
|
||||
_ => {
|
||||
self.state = ProtoState::Closed;
|
||||
return Ok(ProcessMsgResult::Break);
|
||||
}
|
||||
}
|
||||
Ok(ProcessMsgResult::Continue)
|
||||
}
|
||||
|
||||
async fn process_message(
|
||||
&mut self,
|
||||
handler: &mut impl Handler,
|
||||
msg: FeMessage,
|
||||
unnamed_query_string: &mut Bytes,
|
||||
) -> Result<ProcessMsgResult> {
|
||||
// Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
|
||||
// TODO: change that to proper top-level match of protocol state with separate message handling for each state
|
||||
assert!(self.state == ProtoState::Established);
|
||||
|
||||
match msg {
|
||||
FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => {
|
||||
bail!("protocol violation");
|
||||
}
|
||||
|
||||
FeMessage::Query(body) => {
|
||||
// remove null terminator
|
||||
let query_string = cstr_to_str(&body)?;
|
||||
|
||||
trace!("got query {:?}", query_string);
|
||||
// xxx distinguish fatal and recoverable errors?
|
||||
if let Err(e) = handler.process_query(self, query_string).await {
|
||||
// ":?" uses the alternate formatting style, which makes anyhow display the
|
||||
// full cause of the error, not just the top-level context + its trace.
|
||||
// We don't want to send that in the ErrorResponse though,
|
||||
// because it's not relevant to the compute node logs.
|
||||
error!("query handler for '{}' failed: {:?}", query_string, e);
|
||||
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
|
||||
// TODO: untangle convoluted control flow
|
||||
if e.to_string().contains("failed to run") {
|
||||
return Ok(ProcessMsgResult::Break);
|
||||
}
|
||||
}
|
||||
self.write_message(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
|
||||
FeMessage::Parse(m) => {
|
||||
*unnamed_query_string = m.query_string;
|
||||
self.write_message(&BeMessage::ParseComplete)?;
|
||||
}
|
||||
|
||||
FeMessage::Describe(_) => {
|
||||
self.write_message(&BeMessage::ParameterDescription)?
|
||||
.write_message(&BeMessage::NoData)?;
|
||||
}
|
||||
|
||||
FeMessage::Bind(_) => {
|
||||
self.write_message(&BeMessage::BindComplete)?;
|
||||
}
|
||||
|
||||
FeMessage::Close(_) => {
|
||||
self.write_message(&BeMessage::CloseComplete)?;
|
||||
}
|
||||
|
||||
FeMessage::Execute(_) => {
|
||||
let query_string = cstr_to_str(unnamed_query_string)?;
|
||||
trace!("got execute {:?}", query_string);
|
||||
// xxx distinguish fatal and recoverable errors?
|
||||
if let Err(e) = handler.process_query(self, query_string).await {
|
||||
error!("query handler for '{}' failed: {:?}", query_string, e);
|
||||
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
|
||||
}
|
||||
// NOTE there is no ReadyForQuery message. This handler is used
|
||||
// for basebackup and it uses CopyOut which doesn't require
|
||||
// ReadyForQuery message and backend just switches back to
|
||||
// processing mode after sending CopyDone or ErrorResponse.
|
||||
}
|
||||
|
||||
FeMessage::Sync => {
|
||||
self.write_message(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
|
||||
FeMessage::Terminate => {
|
||||
return Ok(ProcessMsgResult::Break);
|
||||
}
|
||||
|
||||
// We prefer explicit pattern matching to wildcards, because
|
||||
// this helps us spot the places where new variants are missing
|
||||
FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
|
||||
bail!("unexpected message type: {:?}", msg);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ProcessMsgResult::Continue)
|
||||
}
|
||||
}
|
||||
@@ -4,9 +4,10 @@ use std::cmp::{Eq, Ordering, PartialOrd};
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fmt::Debug;
|
||||
use std::mem;
|
||||
use std::sync::mpsc::{channel, Receiver, Sender};
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::watch::{channel, Receiver, Sender};
|
||||
use tokio::time::timeout;
|
||||
|
||||
/// An error happened while waiting for a number
|
||||
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
|
||||
@@ -141,10 +142,10 @@ where
|
||||
///
|
||||
/// This call won't complete until someone has called `advance`
|
||||
/// with a number greater than or equal to the one we're waiting for.
|
||||
pub fn wait_for(&self, num: V) -> Result<(), SeqWaitError> {
|
||||
pub async fn wait_for(&self, num: V) -> Result<(), SeqWaitError> {
|
||||
match self.queue_for_wait(num) {
|
||||
Ok(None) => Ok(()),
|
||||
Ok(Some(rx)) => rx.recv().map_err(|_| SeqWaitError::Shutdown),
|
||||
Ok(Some(mut rx)) => rx.changed().await.map_err(|_| SeqWaitError::Shutdown),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
@@ -156,13 +157,18 @@ where
|
||||
///
|
||||
/// If that hasn't happened after the specified timeout duration,
|
||||
/// [`SeqWaitError::Timeout`] will be returned.
|
||||
pub fn wait_for_timeout(&self, num: V, timeout_duration: Duration) -> Result<(), SeqWaitError> {
|
||||
pub async fn wait_for_timeout(
|
||||
&self,
|
||||
num: V,
|
||||
timeout_duration: Duration,
|
||||
) -> Result<(), SeqWaitError> {
|
||||
match self.queue_for_wait(num) {
|
||||
Ok(None) => Ok(()),
|
||||
Ok(Some(rx)) => rx.recv_timeout(timeout_duration).map_err(|e| match e {
|
||||
std::sync::mpsc::RecvTimeoutError::Timeout => SeqWaitError::Timeout,
|
||||
std::sync::mpsc::RecvTimeoutError::Disconnected => SeqWaitError::Shutdown,
|
||||
}),
|
||||
Ok(Some(mut rx)) => match timeout(timeout_duration, rx.changed()).await {
|
||||
Ok(Ok(())) => Ok(()),
|
||||
Ok(Err(_)) => Err(SeqWaitError::Shutdown),
|
||||
Err(_) => Err(SeqWaitError::Timeout),
|
||||
},
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
@@ -179,7 +185,7 @@ where
|
||||
}
|
||||
|
||||
// Create a new channel.
|
||||
let (tx, rx) = channel();
|
||||
let (tx, rx) = channel(());
|
||||
internal.waiters.push(Waiter {
|
||||
wake_num: num,
|
||||
wake_channel: tx,
|
||||
@@ -235,7 +241,6 @@ mod tests {
|
||||
use super::*;
|
||||
use std::sync::Arc;
|
||||
use std::thread::sleep;
|
||||
use std::thread::spawn;
|
||||
use std::time::Duration;
|
||||
|
||||
impl MonotonicCounter<i32> for i32 {
|
||||
@@ -248,25 +253,25 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn seqwait() {
|
||||
#[tokio::test]
|
||||
async fn seqwait() {
|
||||
let seq = Arc::new(SeqWait::new(0));
|
||||
let seq2 = Arc::clone(&seq);
|
||||
let seq3 = Arc::clone(&seq);
|
||||
spawn(move || {
|
||||
seq2.wait_for(42).expect("wait_for 42");
|
||||
tokio::task::spawn(async move {
|
||||
seq2.wait_for(42).await.expect("wait_for 42");
|
||||
let old = seq2.advance(100);
|
||||
assert_eq!(old, 99);
|
||||
seq2.wait_for(999).expect_err("no 999");
|
||||
seq2.wait_for(999).await.expect_err("no 999");
|
||||
});
|
||||
spawn(move || {
|
||||
seq3.wait_for(42).expect("wait_for 42");
|
||||
seq3.wait_for(0).expect("wait_for 0");
|
||||
tokio::task::spawn(async move {
|
||||
seq3.wait_for(42).await.expect("wait_for 42");
|
||||
seq3.wait_for(0).await.expect("wait_for 0");
|
||||
});
|
||||
sleep(Duration::from_secs(1));
|
||||
let old = seq.advance(99);
|
||||
assert_eq!(old, 0);
|
||||
seq.wait_for(100).expect("wait_for 100");
|
||||
seq.wait_for(100).await.expect("wait_for 100");
|
||||
|
||||
// Calling advance with a smaller value is a no-op
|
||||
assert_eq!(seq.advance(98), 100);
|
||||
@@ -275,16 +280,16 @@ mod tests {
|
||||
seq.shutdown();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn seqwait_timeout() {
|
||||
#[tokio::test]
|
||||
async fn seqwait_timeout() {
|
||||
let seq = Arc::new(SeqWait::new(0));
|
||||
let seq2 = Arc::clone(&seq);
|
||||
spawn(move || {
|
||||
tokio::task::spawn(async move {
|
||||
let timeout = Duration::from_millis(1);
|
||||
let res = seq2.wait_for_timeout(42, timeout);
|
||||
let res = seq2.wait_for_timeout(42, timeout).await;
|
||||
assert_eq!(res, Err(SeqWaitError::Timeout));
|
||||
});
|
||||
sleep(Duration::from_secs(1));
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
// This will attempt to wake, but nothing will happen
|
||||
// because the waiter already dropped its Receiver.
|
||||
let old = seq.advance(99);
|
||||
|
||||
@@ -1,224 +0,0 @@
|
||||
//!
|
||||
//! Async version of 'seqwait.rs'
|
||||
//!
|
||||
//! NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs.
|
||||
//!
|
||||
|
||||
#![warn(missing_docs)]
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt::Debug;
|
||||
use std::mem;
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::watch::{channel, Receiver, Sender};
|
||||
use tokio::time::timeout;
|
||||
|
||||
/// An error happened while waiting for a number
|
||||
#[derive(Debug, PartialEq, thiserror::Error)]
|
||||
#[error("SeqWaitError")]
|
||||
pub enum SeqWaitError {
|
||||
/// The wait timeout was reached
|
||||
Timeout,
|
||||
/// [`SeqWait::shutdown`] was called
|
||||
Shutdown,
|
||||
}
|
||||
|
||||
/// Internal components of a `SeqWait`
|
||||
struct SeqWaitInt<T>
|
||||
where
|
||||
T: Ord,
|
||||
{
|
||||
waiters: BTreeMap<T, (Sender<()>, Receiver<()>)>,
|
||||
current: T,
|
||||
shutdown: bool,
|
||||
}
|
||||
|
||||
/// A tool for waiting on a sequence number
|
||||
///
|
||||
/// This provides a way to await the arrival of a number.
|
||||
/// As soon as the number arrives by another caller calling
|
||||
/// [`advance`], then the waiter will be woken up.
|
||||
///
|
||||
/// This implementation takes a blocking Mutex on both [`wait_for`]
|
||||
/// and [`advance`], meaning there may be unexpected executor blocking
|
||||
/// due to thread scheduling unfairness. There are probably better
|
||||
/// implementations, but we can probably live with this for now.
|
||||
///
|
||||
/// [`wait_for`]: SeqWait::wait_for
|
||||
/// [`advance`]: SeqWait::advance
|
||||
///
|
||||
pub struct SeqWait<T>
|
||||
where
|
||||
T: Ord,
|
||||
{
|
||||
internal: Mutex<SeqWaitInt<T>>,
|
||||
}
|
||||
|
||||
impl<T> SeqWait<T>
|
||||
where
|
||||
T: Ord + Debug + Copy,
|
||||
{
|
||||
/// Create a new `SeqWait`, initialized to a particular number
|
||||
pub fn new(starting_num: T) -> Self {
|
||||
let internal = SeqWaitInt {
|
||||
waiters: BTreeMap::new(),
|
||||
current: starting_num,
|
||||
shutdown: false,
|
||||
};
|
||||
SeqWait {
|
||||
internal: Mutex::new(internal),
|
||||
}
|
||||
}
|
||||
|
||||
/// Shut down a `SeqWait`, causing all waiters (present and
|
||||
/// future) to return an error.
|
||||
pub fn shutdown(&self) {
|
||||
let waiters = {
|
||||
// Prevent new waiters; wake all those that exist.
|
||||
// Wake everyone with an error.
|
||||
let mut internal = self.internal.lock().unwrap();
|
||||
|
||||
// This will steal the entire waiters map.
|
||||
// When we drop it all waiters will be woken.
|
||||
mem::take(&mut internal.waiters)
|
||||
|
||||
// Drop the lock as we exit this scope.
|
||||
};
|
||||
|
||||
// When we drop the waiters list, each Receiver will
|
||||
// be woken with an error.
|
||||
// This drop doesn't need to be explicit; it's done
|
||||
// here to make it easier to read the code and understand
|
||||
// the order of events.
|
||||
drop(waiters);
|
||||
}
|
||||
|
||||
/// Wait for a number to arrive
|
||||
///
|
||||
/// This call won't complete until someone has called `advance`
|
||||
/// with a number greater than or equal to the one we're waiting for.
|
||||
pub async fn wait_for(&self, num: T) -> Result<(), SeqWaitError> {
|
||||
let mut rx = {
|
||||
let mut internal = self.internal.lock().unwrap();
|
||||
if internal.current >= num {
|
||||
return Ok(());
|
||||
}
|
||||
if internal.shutdown {
|
||||
return Err(SeqWaitError::Shutdown);
|
||||
}
|
||||
|
||||
// If we already have a channel for waiting on this number, reuse it.
|
||||
if let Some((_, rx)) = internal.waiters.get_mut(&num) {
|
||||
// an Err from changed() means the sender was dropped.
|
||||
rx.clone()
|
||||
} else {
|
||||
// Create a new channel.
|
||||
let (tx, rx) = channel(());
|
||||
internal.waiters.insert(num, (tx, rx.clone()));
|
||||
rx
|
||||
}
|
||||
// Drop the lock as we exit this scope.
|
||||
};
|
||||
rx.changed().await.map_err(|_| SeqWaitError::Shutdown)
|
||||
}
|
||||
|
||||
/// Wait for a number to arrive
|
||||
///
|
||||
/// This call won't complete until someone has called `advance`
|
||||
/// with a number greater than or equal to the one we're waiting for.
|
||||
///
|
||||
/// If that hasn't happened after the specified timeout duration,
|
||||
/// [`SeqWaitError::Timeout`] will be returned.
|
||||
pub async fn wait_for_timeout(
|
||||
&self,
|
||||
num: T,
|
||||
timeout_duration: Duration,
|
||||
) -> Result<(), SeqWaitError> {
|
||||
timeout(timeout_duration, self.wait_for(num))
|
||||
.await
|
||||
.unwrap_or(Err(SeqWaitError::Timeout))
|
||||
}
|
||||
|
||||
/// Announce a new number has arrived
|
||||
///
|
||||
/// All waiters at this value or below will be woken.
|
||||
///
|
||||
/// `advance` will panic if you send it a lower number than
|
||||
/// a previous call.
|
||||
pub fn advance(&self, num: T) {
|
||||
let wake_these = {
|
||||
let mut internal = self.internal.lock().unwrap();
|
||||
|
||||
if internal.current > num {
|
||||
panic!(
|
||||
"tried to advance backwards, from {:?} to {:?}",
|
||||
internal.current, num
|
||||
);
|
||||
}
|
||||
internal.current = num;
|
||||
|
||||
// split_off will give me all the high-numbered waiters,
|
||||
// so split and then swap. Everything at or above `num`
|
||||
// stays.
|
||||
let mut split = internal.waiters.split_off(&num);
|
||||
std::mem::swap(&mut split, &mut internal.waiters);
|
||||
|
||||
// `split_at` didn't get the value at `num`; if it's
|
||||
// there take that too.
|
||||
if let Some(sleeper) = internal.waiters.remove(&num) {
|
||||
split.insert(num, sleeper);
|
||||
}
|
||||
|
||||
split
|
||||
};
|
||||
|
||||
for (_wake_num, (tx, _rx)) in wake_these {
|
||||
// This can fail if there are no receivers.
|
||||
// We don't care; discard the error.
|
||||
let _ = tx.send(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::sync::Arc;
|
||||
use tokio::time::{sleep, Duration};
|
||||
|
||||
#[tokio::test]
|
||||
async fn seqwait() {
|
||||
let seq = Arc::new(SeqWait::new(0));
|
||||
let seq2 = Arc::clone(&seq);
|
||||
let seq3 = Arc::clone(&seq);
|
||||
tokio::spawn(async move {
|
||||
seq2.wait_for(42).await.expect("wait_for 42");
|
||||
seq2.advance(100);
|
||||
seq2.wait_for(999).await.expect_err("no 999");
|
||||
});
|
||||
tokio::spawn(async move {
|
||||
seq3.wait_for(42).await.expect("wait_for 42");
|
||||
seq3.wait_for(0).await.expect("wait_for 0");
|
||||
});
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
seq.advance(99);
|
||||
seq.wait_for(100).await.expect("wait_for 100");
|
||||
seq.shutdown();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn seqwait_timeout() {
|
||||
let seq = Arc::new(SeqWait::new(0));
|
||||
let seq2 = Arc::clone(&seq);
|
||||
tokio::spawn(async move {
|
||||
let timeout = Duration::from_millis(1);
|
||||
let res = seq2.wait_for_timeout(42, timeout).await;
|
||||
assert_eq!(res, Err(SeqWaitError::Timeout));
|
||||
});
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
// This will attempt to wake, but nothing will happen
|
||||
// because the waiter already dropped its Receiver.
|
||||
seq.advance(99);
|
||||
}
|
||||
}
|
||||
@@ -12,6 +12,8 @@ profiling = ["pprof"]
|
||||
failpoints = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
@@ -24,10 +26,11 @@ itertools = "0.10.3"
|
||||
clap = "3.0"
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
crc32c = "0.6.0"
|
||||
thiserror = "1.0"
|
||||
@@ -43,7 +46,7 @@ pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallcl
|
||||
toml_edit = { version = "0.13", features = ["easy"] }
|
||||
scopeguard = "1.1.0"
|
||||
const_format = "0.2.21"
|
||||
tracing = "0.1.27"
|
||||
tracing = "0.1.36"
|
||||
signal-hook = "0.3.10"
|
||||
url = "2"
|
||||
nix = "0.23"
|
||||
|
||||
@@ -22,8 +22,8 @@ use std::time::SystemTime;
|
||||
use tar::{Builder, EntryType, Header};
|
||||
use tracing::*;
|
||||
|
||||
use crate::layered_repository::Timeline;
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::tenant::Timeline;
|
||||
|
||||
use postgres_ffi::v14::pg_constants;
|
||||
use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName};
|
||||
@@ -81,9 +81,8 @@ where
|
||||
// an old LSN and it doesn't have any WAL of its own yet. We will set
|
||||
// prev_lsn to Lsn(0) if we cannot provide the correct value.
|
||||
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
|
||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||
info!("waiting for {}", req_lsn);
|
||||
timeline.wait_lsn(req_lsn)?;
|
||||
// Backup was requested at a particular LSN. The caller should've
|
||||
// already checked that it's a valid LSN.
|
||||
|
||||
// If the requested point is the end of the timeline, we can
|
||||
// provide prev_lsn. (get_last_record_rlsn() might return it as
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
//! A handy tool for debugging, that's all.
|
||||
use anyhow::Result;
|
||||
use clap::{App, Arg};
|
||||
use pageserver::layered_repository::dump_layerfile_from_path;
|
||||
use pageserver::page_cache;
|
||||
use pageserver::tenant::dump_layerfile_from_path;
|
||||
use pageserver::virtual_file;
|
||||
use std::path::PathBuf;
|
||||
use utils::project_git_version;
|
||||
@@ -12,7 +12,7 @@ use utils::project_git_version;
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith dump_layerfile utility")
|
||||
let arg_matches = App::new("Neon dump_layerfile utility")
|
||||
.about("Dump contents of one layer file, for debugging")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
|
||||
@@ -4,7 +4,7 @@ use remote_storage::GenericRemoteStorage;
|
||||
use std::{env, ops::ControlFlow, path::Path, str::FromStr};
|
||||
use tracing::*;
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
|
||||
use clap::{App, Arg};
|
||||
use daemonize::Daemonize;
|
||||
@@ -12,13 +12,15 @@ use daemonize::Daemonize;
|
||||
use fail::FailScenario;
|
||||
use pageserver::{
|
||||
config::{defaults::*, PageServerConf},
|
||||
http, page_cache, page_service, profiling, tenant_mgr, thread_mgr,
|
||||
thread_mgr::ThreadKind,
|
||||
virtual_file, LOG_FILE_NAME,
|
||||
http, page_cache, page_service, profiling, task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::{
|
||||
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
|
||||
},
|
||||
tenant_mgr, virtual_file, LOG_FILE_NAME,
|
||||
};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
http::endpoint,
|
||||
logging,
|
||||
postgres_backend::AuthType,
|
||||
project_git_version,
|
||||
@@ -38,7 +40,7 @@ fn version() -> String {
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let arg_matches = App::new("Zenith page server")
|
||||
let arg_matches = App::new("Neon page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.version(&*version())
|
||||
.arg(
|
||||
@@ -180,7 +182,7 @@ fn initialize_config(
|
||||
cfg_file_path.display()
|
||||
);
|
||||
} else {
|
||||
// We're initializing the repo, so there's no config file yet
|
||||
// We're initializing the tenant, so there's no config file yet
|
||||
(
|
||||
DEFAULT_CONFIG_FILE
|
||||
.parse::<toml_edit::Document>()
|
||||
@@ -286,12 +288,12 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
// start profiler (if enabled)
|
||||
let profiler_guard = profiling::init_profiler(conf);
|
||||
|
||||
pageserver::tenant_tasks::init_tenant_task_pool()?;
|
||||
WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_etcd_client(conf))?;
|
||||
|
||||
// initialize authentication for incoming connections
|
||||
let auth = match &conf.auth_type {
|
||||
AuthType::Trust | AuthType::MD5 => None,
|
||||
AuthType::ZenithJWT => {
|
||||
AuthType::NeonJWT => {
|
||||
// unwrap is ok because check is performed when creating config, so path is set and file exists
|
||||
let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
|
||||
Some(JwtAuth::from_key_path(key_path)?.into())
|
||||
@@ -307,35 +309,54 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
})
|
||||
.transpose()
|
||||
.context("Failed to init generic remote storage")?;
|
||||
let remote_index = {
|
||||
let _rt_guard = BACKGROUND_RUNTIME.enter();
|
||||
tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())?
|
||||
};
|
||||
|
||||
let remote_index = tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())?;
|
||||
|
||||
// Spawn a new thread for the http endpoint
|
||||
// Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME.
|
||||
// bind before launching separate thread so the error reported before startup exits
|
||||
let auth_cloned = auth.clone();
|
||||
thread_mgr::spawn(
|
||||
ThreadKind::HttpEndpointListener,
|
||||
None,
|
||||
None,
|
||||
"http_endpoint_thread",
|
||||
true,
|
||||
move || {
|
||||
let router = http::make_router(conf, auth_cloned, remote_index, remote_storage)?;
|
||||
endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher())
|
||||
},
|
||||
)?;
|
||||
|
||||
// Spawn a thread to listen for libpq connections. It will spawn further threads
|
||||
// Create a Service from the router above to handle incoming requests.
|
||||
{
|
||||
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
|
||||
|
||||
let router = http::make_router(conf, auth.clone(), remote_index, remote_storage)?;
|
||||
let service =
|
||||
utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?
|
||||
.serve(service)
|
||||
.with_graceful_shutdown(task_mgr::shutdown_watcher());
|
||||
|
||||
task_mgr::spawn(
|
||||
MGMT_REQUEST_RUNTIME.handle(),
|
||||
TaskKind::HttpEndpointListener,
|
||||
None,
|
||||
None,
|
||||
"http endpoint listener",
|
||||
true,
|
||||
async {
|
||||
server.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// Spawn a task to listen for libpq connections. It will spawn further tasks
|
||||
// for each connection.
|
||||
thread_mgr::spawn(
|
||||
ThreadKind::LibpqEndpointListener,
|
||||
task_mgr::spawn(
|
||||
COMPUTE_REQUEST_RUNTIME.handle(),
|
||||
TaskKind::LibpqEndpointListener,
|
||||
None,
|
||||
None,
|
||||
"libpq endpoint thread",
|
||||
"libpq endpoint listener",
|
||||
true,
|
||||
move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type),
|
||||
)?;
|
||||
async move {
|
||||
page_service::libpq_listener_main(conf, auth, pageserver_listener, conf.auth_type).await
|
||||
},
|
||||
);
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
signals.handle(|signal| match signal {
|
||||
Signal::Quit => {
|
||||
info!(
|
||||
@@ -352,7 +373,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
signal.name()
|
||||
);
|
||||
profiling::exit_profiler(conf, &profiler_guard);
|
||||
pageserver::shutdown_pageserver(0);
|
||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
|
||||
unreachable!()
|
||||
}
|
||||
})
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
//! A handy tool for debugging, that's all.
|
||||
use anyhow::Result;
|
||||
use clap::{App, Arg};
|
||||
use pageserver::layered_repository::metadata::TimelineMetadata;
|
||||
use pageserver::tenant::metadata::TimelineMetadata;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use utils::{lsn::Lsn, project_git_version};
|
||||
@@ -11,7 +11,7 @@ use utils::{lsn::Lsn, project_git_version};
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith update metadata utility")
|
||||
let arg_matches = App::new("Neon update metadata utility")
|
||||
.about("Dump or update metadata file")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
|
||||
@@ -15,11 +15,11 @@ use toml_edit;
|
||||
use toml_edit::{Document, Item};
|
||||
use url::Url;
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
postgres_backend::AuthType,
|
||||
zid::{NodeId, ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::layered_repository::TIMELINES_SEGMENT_NAME;
|
||||
use crate::tenant::TIMELINES_SEGMENT_NAME;
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
|
||||
pub mod defaults {
|
||||
@@ -342,16 +342,16 @@ impl PageServerConf {
|
||||
self.workdir.join("tenants")
|
||||
}
|
||||
|
||||
pub fn tenant_path(&self, tenantid: &ZTenantId) -> PathBuf {
|
||||
self.tenants_path().join(tenantid.to_string())
|
||||
pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||
self.tenants_path().join(tenant_id.to_string())
|
||||
}
|
||||
|
||||
pub fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf {
|
||||
self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME)
|
||||
pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||
self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
|
||||
}
|
||||
|
||||
pub fn timeline_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
|
||||
self.timelines_path(tenantid).join(timelineid.to_string())
|
||||
pub fn timeline_path(&self, timeline_id: &TimelineId, tenant_id: &TenantId) -> PathBuf {
|
||||
self.timelines_path(tenant_id).join(timeline_id.to_string())
|
||||
}
|
||||
|
||||
pub fn traces_path(&self) -> PathBuf {
|
||||
@@ -435,7 +435,7 @@ impl PageServerConf {
|
||||
|
||||
let mut conf = builder.build().context("invalid config")?;
|
||||
|
||||
if conf.auth_type == AuthType::ZenithJWT {
|
||||
if conf.auth_type == AuthType::NeonJWT {
|
||||
let auth_validation_public_key_path = conf
|
||||
.auth_validation_public_key_path
|
||||
.get_or_insert_with(|| workdir.join("auth_public_key.pem"));
|
||||
|
||||
@@ -3,22 +3,21 @@ use std::num::NonZeroU64;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
zid::{NodeId, ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
// These enums are used in the API response fields.
|
||||
use crate::tenant_mgr::TenantState;
|
||||
use crate::tenant::TenantState;
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TimelineCreateRequest {
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub new_timeline_id: Option<ZTimelineId>,
|
||||
pub new_timeline_id: Option<TimelineId>,
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_timeline_id: Option<ZTimelineId>,
|
||||
pub ancestor_timeline_id: Option<TimelineId>,
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_start_lsn: Option<Lsn>,
|
||||
@@ -29,7 +28,7 @@ pub struct TimelineCreateRequest {
|
||||
pub struct TenantCreateRequest {
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub new_tenant_id: Option<ZTenantId>,
|
||||
pub new_tenant_id: Option<TenantId>,
|
||||
pub checkpoint_distance: Option<u64>,
|
||||
pub checkpoint_timeout: Option<String>,
|
||||
pub compaction_target_size: Option<u64>,
|
||||
@@ -48,7 +47,7 @@ pub struct TenantCreateRequest {
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId);
|
||||
pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId);
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct StatusResponse {
|
||||
@@ -56,7 +55,7 @@ pub struct StatusResponse {
|
||||
}
|
||||
|
||||
impl TenantCreateRequest {
|
||||
pub fn new(new_tenant_id: Option<ZTenantId>) -> TenantCreateRequest {
|
||||
pub fn new(new_tenant_id: Option<TenantId>) -> TenantCreateRequest {
|
||||
TenantCreateRequest {
|
||||
new_tenant_id,
|
||||
..Default::default()
|
||||
@@ -67,7 +66,7 @@ impl TenantCreateRequest {
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantConfigRequest {
|
||||
pub tenant_id: ZTenantId,
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub checkpoint_distance: Option<u64>,
|
||||
@@ -85,7 +84,7 @@ pub struct TenantConfigRequest {
|
||||
}
|
||||
|
||||
impl TenantConfigRequest {
|
||||
pub fn new(tenant_id: ZTenantId) -> TenantConfigRequest {
|
||||
pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
|
||||
TenantConfigRequest {
|
||||
tenant_id,
|
||||
checkpoint_distance: None,
|
||||
@@ -108,8 +107,8 @@ impl TenantConfigRequest {
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct TenantInfo {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: ZTenantId,
|
||||
pub state: Option<TenantState>,
|
||||
pub id: TenantId,
|
||||
pub state: TenantState,
|
||||
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
|
||||
pub has_in_progress_downloads: Option<bool>,
|
||||
}
|
||||
@@ -118,7 +117,7 @@ pub struct TenantInfo {
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct LocalTimelineInfo {
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_timeline_id: Option<ZTimelineId>,
|
||||
pub ancestor_timeline_id: Option<TimelineId>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_lsn: Option<Lsn>,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
@@ -156,9 +155,9 @@ pub struct RemoteTimelineInfo {
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineInfo {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: ZTenantId,
|
||||
pub tenant_id: TenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub timeline_id: ZTimelineId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub local: Option<LocalTimelineInfo>,
|
||||
pub remote: Option<RemoteTimelineInfo>,
|
||||
}
|
||||
|
||||
@@ -489,11 +489,18 @@ components:
|
||||
type: object
|
||||
required:
|
||||
- id
|
||||
- state
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
state:
|
||||
type: string
|
||||
oneOf:
|
||||
- type: string
|
||||
- type: object
|
||||
properties:
|
||||
background_jobs_running:
|
||||
type: boolean
|
||||
|
||||
current_physical_size:
|
||||
type: integer
|
||||
has_in_progress_downloads:
|
||||
@@ -573,7 +580,6 @@ components:
|
||||
required:
|
||||
- last_record_lsn
|
||||
- disk_consistent_lsn
|
||||
- timeline_state
|
||||
properties:
|
||||
last_record_lsn:
|
||||
type: string
|
||||
@@ -581,8 +587,6 @@ components:
|
||||
disk_consistent_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
timeline_state:
|
||||
type: string
|
||||
ancestor_timeline_id:
|
||||
type: string
|
||||
format: hex
|
||||
|
||||
@@ -11,9 +11,9 @@ use super::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest,
|
||||
};
|
||||
use crate::layered_repository::Timeline;
|
||||
use crate::storage_sync;
|
||||
use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
|
||||
use crate::tenant::{TenantState, Timeline};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::{config::PageServerConf, tenant_mgr, timelines};
|
||||
use utils::{
|
||||
@@ -25,8 +25,8 @@ use utils::{
|
||||
request::parse_request_param,
|
||||
RequestExt, RouterBuilder,
|
||||
},
|
||||
id::{TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||
};
|
||||
|
||||
struct State {
|
||||
@@ -128,16 +128,15 @@ fn local_timeline_info_from_timeline(
|
||||
}
|
||||
|
||||
fn list_local_timelines(
|
||||
tenant_id: ZTenantId,
|
||||
tenant_id: TenantId,
|
||||
include_non_incremental_logical_size: bool,
|
||||
include_non_incremental_physical_size: bool,
|
||||
) -> Result<Vec<(ZTimelineId, LocalTimelineInfo)>> {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
|
||||
.with_context(|| format!("Failed to get repo for tenant {tenant_id}"))?;
|
||||
let repo_timelines = repo.list_timelines();
|
||||
) -> Result<Vec<(TimelineId, LocalTimelineInfo)>> {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
|
||||
let timelines = tenant.list_timelines();
|
||||
|
||||
let mut local_timeline_info = Vec::with_capacity(repo_timelines.len());
|
||||
for (timeline_id, repository_timeline) in repo_timelines {
|
||||
let mut local_timeline_info = Vec::with_capacity(timelines.len());
|
||||
for (timeline_id, repository_timeline) in timelines {
|
||||
local_timeline_info.push((
|
||||
timeline_id,
|
||||
local_timeline_info_from_timeline(
|
||||
@@ -157,20 +156,18 @@ async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiErr
|
||||
}
|
||||
|
||||
async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let request_data: TimelineCreateRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let new_timeline_info = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn).entered();
|
||||
|
||||
let new_timeline_info = async {
|
||||
match timelines::create_timeline(
|
||||
get_config(&request),
|
||||
tenant_id,
|
||||
request_data.new_timeline_id.map(ZTimelineId::from),
|
||||
request_data.ancestor_timeline_id.map(ZTimelineId::from),
|
||||
request_data.new_timeline_id.map(TimelineId::from),
|
||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||
request_data.ancestor_start_lsn,
|
||||
) {
|
||||
).await {
|
||||
Ok(Some(new_timeline)) => {
|
||||
// Created. Construct a TimelineInfo for it.
|
||||
let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)?;
|
||||
@@ -184,9 +181,10 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
Ok(None) => Ok(None), // timeline already exists
|
||||
Err(err) => Err(err),
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
}
|
||||
.instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn))
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
|
||||
Ok(match new_timeline_info {
|
||||
Some(info) => json_response(StatusCode::CREATED, info)?,
|
||||
@@ -195,35 +193,43 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
}
|
||||
|
||||
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let include_non_incremental_logical_size =
|
||||
query_param_present(&request, "include-non-incremental-logical-size");
|
||||
let include_non_incremental_physical_size =
|
||||
query_param_present(&request, "include-non-incremental-physical-size");
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let local_timeline_infos = tokio::task::spawn_blocking(move || {
|
||||
let timelines = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
|
||||
list_local_timelines(
|
||||
tenant_id,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
)
|
||||
Ok::<_, anyhow::Error>(tenant_mgr::get_tenant(tenant_id, true)?.list_timelines())
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
let mut response_data = Vec::with_capacity(local_timeline_infos.len());
|
||||
for (timeline_id, local_timeline_info) in local_timeline_infos {
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
for (timeline_id, timeline) in timelines {
|
||||
let local = match local_timeline_info_from_timeline(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
) {
|
||||
Ok(local) => Some(local),
|
||||
Err(e) => {
|
||||
error!("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}");
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
response_data.push(TimelineInfo {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
local: Some(local_timeline_info),
|
||||
local,
|
||||
remote: get_state(&request)
|
||||
.remote_index
|
||||
.read()
|
||||
.await
|
||||
.timeline_entry(&ZTenantTimelineId {
|
||||
.timeline_entry(&TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
@@ -251,8 +257,8 @@ fn query_param_present(request: &Request<Body>, param: &str) -> bool {
|
||||
}
|
||||
|
||||
async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let include_non_incremental_logical_size =
|
||||
query_param_present(&request, "include-non-incremental-logical-size");
|
||||
let include_non_incremental_physical_size =
|
||||
@@ -260,33 +266,30 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let (local_timeline_info, remote_timeline_info) = async {
|
||||
// any error here will render local timeline as None
|
||||
// XXX .in_current_span does not attach messages in spawn_blocking future to current future's span
|
||||
let local_timeline_info = tokio::task::spawn_blocking(move || {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
let local_timeline = {
|
||||
repo.get_timeline(timeline_id)
|
||||
.as_ref()
|
||||
.map(|timeline| {
|
||||
local_timeline_info_from_timeline(
|
||||
timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
)
|
||||
})
|
||||
.transpose()?
|
||||
};
|
||||
Ok::<_, anyhow::Error>(local_timeline)
|
||||
let timeline = tokio::task::spawn_blocking(move || {
|
||||
tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id)
|
||||
})
|
||||
.await
|
||||
.ok()
|
||||
.and_then(|r| r.ok())
|
||||
.flatten();
|
||||
.map_err(ApiError::from_err)?;
|
||||
|
||||
let local_timeline_info = match timeline.and_then(|timeline| {
|
||||
local_timeline_info_from_timeline(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
)
|
||||
}) {
|
||||
Ok(local_info) => Some(local_info),
|
||||
Err(e) => {
|
||||
error!("Failed to get local timeline info: {e:#}");
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let remote_timeline_info = {
|
||||
let remote_index_read = get_state(&request).remote_index.read().await;
|
||||
remote_index_read
|
||||
.timeline_entry(&ZTenantTimelineId {
|
||||
.timeline_entry(&TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
@@ -295,36 +298,37 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
awaits_download: remote_entry.awaits_download,
|
||||
})
|
||||
};
|
||||
(local_timeline_info, remote_timeline_info)
|
||||
Ok::<_, anyhow::Error>((local_timeline_info, remote_timeline_info))
|
||||
}
|
||||
.instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await;
|
||||
.await?;
|
||||
|
||||
if local_timeline_info.is_none() && remote_timeline_info.is_none() {
|
||||
return Err(ApiError::NotFound(format!(
|
||||
Err(ApiError::NotFound(format!(
|
||||
"Timeline {tenant_id}/{timeline_id} is not found neither locally nor remotely"
|
||||
)));
|
||||
)))
|
||||
} else {
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
TimelineInfo {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
local: local_timeline_info,
|
||||
remote: remote_timeline_info,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
let timeline_info = TimelineInfo {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
local: local_timeline_info,
|
||||
remote: remote_timeline_info,
|
||||
};
|
||||
|
||||
json_response(StatusCode::OK, timeline_info)
|
||||
}
|
||||
|
||||
// TODO makes sense to provide tenant config right away the same way as it handled in tenant_create
|
||||
async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
info!("Handling tenant attach {}", tenant_id);
|
||||
info!("Handling tenant attach {tenant_id}");
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
if tenant_mgr::get_tenant_state(tenant_id).is_some() {
|
||||
if tenant_mgr::get_tenant(tenant_id, false).is_ok() {
|
||||
anyhow::bail!("Tenant is already present locally")
|
||||
};
|
||||
Ok(())
|
||||
@@ -398,8 +402,8 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
/// for details see comment to `storage_sync::gather_tenant_timelines_index_parts`
|
||||
async fn gather_tenant_timelines_index_parts(
|
||||
state: &State,
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<Option<Vec<(ZTimelineId, RemoteTimeline)>>> {
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<Option<Vec<(TimelineId, RemoteTimeline)>>> {
|
||||
let index_parts = match state.remote_storage.as_ref() {
|
||||
Some(storage) => {
|
||||
storage_sync::gather_tenant_timelines_index_parts(state.conf, storage, tenant_id).await
|
||||
@@ -421,20 +425,18 @@ async fn gather_tenant_timelines_index_parts(
|
||||
}
|
||||
|
||||
async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("timeline_delete", tenant = %tenant_id).entered();
|
||||
tenant_mgr::delete_timeline(tenant_id, timeline_id)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
tenant_mgr::delete_timeline(tenant_id, timeline_id)
|
||||
.instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
|
||||
let mut remote_index = state.remote_index.write().await;
|
||||
remote_index.remove_timeline_entry(ZTenantTimelineId {
|
||||
remote_index.remove_timeline_entry(TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
});
|
||||
@@ -443,17 +445,15 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
|
||||
}
|
||||
|
||||
async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_detach", tenant = %tenant_id).entered();
|
||||
tenant_mgr::detach_tenant(conf, tenant_id)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
tenant_mgr::detach_tenant(conf, tenant_id)
|
||||
.instrument(info_span!("tenant_detach", tenant = %tenant_id))
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
|
||||
let mut remote_index = state.remote_index.write().await;
|
||||
remote_index.remove_tenant_entry(&tenant_id);
|
||||
@@ -470,7 +470,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_list").entered();
|
||||
crate::tenant_mgr::list_tenants(&remote_index)
|
||||
crate::tenant_mgr::list_tenant_info(&remote_index)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
@@ -479,11 +479,11 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
}
|
||||
|
||||
async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
// if tenant is in progress of downloading it can be absent in global tenant map
|
||||
let tenant_state = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant_state(tenant_id))
|
||||
let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false))
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
|
||||
@@ -499,13 +499,25 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
false
|
||||
});
|
||||
|
||||
let tenant_state = match tenant {
|
||||
Ok(tenant) => tenant.current_state(),
|
||||
Err(e) => {
|
||||
error!("Failed to get local tenant state: {e:#}");
|
||||
if has_in_progress_downloads {
|
||||
TenantState::Paused
|
||||
} else {
|
||||
TenantState::Broken
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let current_physical_size =
|
||||
match tokio::task::spawn_blocking(move || list_local_timelines(tenant_id, false, false))
|
||||
.await
|
||||
.map_err(ApiError::from_err)?
|
||||
{
|
||||
Err(err) => {
|
||||
// Getting local timelines can fail when no local repo is on disk (e.g, when tenant data is being downloaded).
|
||||
// Getting local timelines can fail when no local tenant directory is on disk (e.g, when tenant data is being downloaded).
|
||||
// In that case, put a warning message into log and operate normally.
|
||||
warn!("Failed to get local timelines for tenant {tenant_id}: {err}");
|
||||
None
|
||||
@@ -579,14 +591,14 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
|
||||
let target_tenant_id = request_data
|
||||
.new_tenant_id
|
||||
.map(ZTenantId::from)
|
||||
.unwrap_or_else(ZTenantId::generate);
|
||||
.map(TenantId::from)
|
||||
.unwrap_or_else(TenantId::generate);
|
||||
|
||||
let new_tenant_id = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_create", tenant = ?target_tenant_id).entered();
|
||||
let conf = get_config(&request);
|
||||
|
||||
tenant_mgr::create_tenant_repository(conf, tenant_conf, target_tenant_id, remote_index)
|
||||
tenant_mgr::create_tenant(conf, tenant_conf, target_tenant_id, remote_index)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
@@ -643,7 +655,8 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_config", tenant = ?tenant_id).entered();
|
||||
|
||||
tenant_mgr::update_tenant_config(tenant_conf, tenant_id)
|
||||
let state = get_state(&request);
|
||||
tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//!
|
||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||
//! a zenith Timeline.
|
||||
//! a neon Timeline.
|
||||
//!
|
||||
use std::fs::File;
|
||||
use std::io::{Read, Seek, SeekFrom};
|
||||
@@ -11,9 +11,9 @@ use bytes::Bytes;
|
||||
use tracing::*;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::layered_repository::Timeline;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walingest::WalIngest;
|
||||
use crate::walrecord::DecodedWALRecord;
|
||||
use postgres_ffi::v14::relfile_utils::*;
|
||||
|
||||
@@ -3,7 +3,6 @@ pub mod config;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub mod keyspace;
|
||||
pub mod layered_repository;
|
||||
pub mod metrics;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
@@ -12,10 +11,11 @@ pub mod profiling;
|
||||
pub mod reltag;
|
||||
pub mod repository;
|
||||
pub mod storage_sync;
|
||||
pub mod task_mgr;
|
||||
pub mod tenant;
|
||||
pub mod tenant_config;
|
||||
pub mod tenant_mgr;
|
||||
pub mod tenant_tasks;
|
||||
pub mod thread_mgr;
|
||||
pub mod timelines;
|
||||
pub mod trace;
|
||||
pub mod virtual_file;
|
||||
@@ -24,9 +24,12 @@ pub mod walreceiver;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
|
||||
use tracing::info;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
use tracing::info;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
/// Current storage format version
|
||||
///
|
||||
@@ -50,30 +53,31 @@ pub enum CheckpointConfig {
|
||||
Forced,
|
||||
}
|
||||
|
||||
pub fn shutdown_pageserver(exit_code: i32) {
|
||||
// Shut down the libpq endpoint thread. This prevents new connections from
|
||||
pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None);
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await;
|
||||
|
||||
// Shut down any page service threads.
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None);
|
||||
// Shut down any page service tasks.
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await;
|
||||
|
||||
// Shut down all the tenants. This flushes everything to disk and kills
|
||||
// the checkpoint and GC threads.
|
||||
tenant_mgr::shutdown_all_tenants();
|
||||
// the checkpoint and GC tasks.
|
||||
tenant_mgr::shutdown_all_tenants().await;
|
||||
|
||||
// Stop syncing with remote storage.
|
||||
//
|
||||
// FIXME: Does this wait for the sync thread to finish syncing what's queued up?
|
||||
// FIXME: Does this wait for the sync tasks to finish syncing what's queued up?
|
||||
// Should it?
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::StorageSync), None, None);
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::StorageSync), None, None).await;
|
||||
|
||||
// Shut down the HTTP endpoint last, so that you can still check the server's
|
||||
// status while it's shutting down.
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::HttpEndpointListener), None, None);
|
||||
// FIXME: We should probably stop accepting commands like attach/detach earlier.
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await;
|
||||
|
||||
// There should be nothing left, but let's be sure
|
||||
thread_mgr::shutdown_threads(None, None, None);
|
||||
task_mgr::shutdown_tasks(None, None, None).await;
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(exit_code);
|
||||
@@ -101,6 +105,50 @@ fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds
|
||||
}
|
||||
}
|
||||
|
||||
/// A newtype to store arbitrary data grouped by tenant and timeline ids.
|
||||
/// One could use [`utils::zid::TenantTimelineId`] for grouping, but that would
|
||||
/// not include the cases where a certain tenant has zero timelines.
|
||||
/// This is sometimes important: a tenant could be registered during initial load from FS,
|
||||
/// even if he has no timelines on disk.
|
||||
#[derive(Debug)]
|
||||
pub struct TenantTimelineValues<T>(HashMap<TenantId, HashMap<TimelineId, T>>);
|
||||
|
||||
impl<T> TenantTimelineValues<T> {
|
||||
fn new() -> Self {
|
||||
Self(HashMap::new())
|
||||
}
|
||||
|
||||
fn with_capacity(capacity: usize) -> Self {
|
||||
Self(HashMap::with_capacity(capacity))
|
||||
}
|
||||
|
||||
/// A convenience method to map certain values and omit some of them, if needed.
|
||||
/// Tenants that won't have any timeline entries due to the filtering, will still be preserved
|
||||
/// in the structure.
|
||||
fn filter_map<F, NewT>(self, map: F) -> TenantTimelineValues<NewT>
|
||||
where
|
||||
F: Fn(T) -> Option<NewT>,
|
||||
{
|
||||
let capacity = self.0.len();
|
||||
self.0.into_iter().fold(
|
||||
TenantTimelineValues::<NewT>::with_capacity(capacity),
|
||||
|mut new_values, (tenant_id, old_values)| {
|
||||
let new_timeline_values = new_values.0.entry(tenant_id).or_default();
|
||||
for (timeline_id, old_value) in old_values {
|
||||
if let Some(new_value) = map(old_value) {
|
||||
new_timeline_values.insert(timeline_id, new_value);
|
||||
}
|
||||
}
|
||||
new_values
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// A suffix to be used during file sync from the remote storage,
|
||||
/// to ensure that we do not leave corrupted files that pretend to be layers.
|
||||
const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
|
||||
#[cfg(test)]
|
||||
mod backoff_defaults_tests {
|
||||
use super::*;
|
||||
@@ -131,3 +179,35 @@ mod backoff_defaults_tests {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::tenant::harness::TIMELINE_ID;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn tenant_timeline_value_mapping() {
|
||||
let first_tenant = TenantId::generate();
|
||||
let second_tenant = TenantId::generate();
|
||||
assert_ne!(first_tenant, second_tenant);
|
||||
|
||||
let mut initial = TenantTimelineValues::new();
|
||||
initial
|
||||
.0
|
||||
.entry(first_tenant)
|
||||
.or_default()
|
||||
.insert(TIMELINE_ID, "test_value");
|
||||
let _ = initial.0.entry(second_tenant).or_default();
|
||||
assert_eq!(initial.0.len(), 2, "Should have entries for both tenants");
|
||||
|
||||
let filtered = initial.filter_map(|_| None::<&str>).0;
|
||||
assert_eq!(
|
||||
filtered.len(),
|
||||
2,
|
||||
"Should have entries for both tenants even after filtering away all entries"
|
||||
);
|
||||
assert!(filtered.contains_key(&first_tenant));
|
||||
assert!(filtered.contains_key(&second_tenant));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ use metrics::{
|
||||
IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use utils::zid::{ZTenantId, ZTimelineId};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
/// Prometheus histogram buckets (in seconds) that capture the majority of
|
||||
/// latencies in the microsecond range but also extend far enough up to distinguish
|
||||
@@ -327,7 +327,7 @@ pub struct TimelineMetrics {
|
||||
}
|
||||
|
||||
impl TimelineMetrics {
|
||||
pub fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self {
|
||||
pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id = tenant_id.to_string();
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let reconstruct_time_histo = RECONSTRUCT_TIME
|
||||
@@ -414,6 +414,6 @@ impl Drop for TimelineMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove_tenant_metrics(tenant_id: &ZTenantId) {
|
||||
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
|
||||
let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]);
|
||||
}
|
||||
|
||||
@@ -49,12 +49,12 @@ use anyhow::Context;
|
||||
use once_cell::sync::OnceCell;
|
||||
use tracing::error;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::layered_repository::writeback_ephemeral_file;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::writeback_ephemeral_file;
|
||||
|
||||
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
|
||||
const TEST_PAGE_CACHE_SIZE: usize = 50;
|
||||
@@ -109,8 +109,8 @@ enum CacheKey {
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
struct MaterializedPageHashKey {
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
key: Key,
|
||||
}
|
||||
|
||||
@@ -308,8 +308,8 @@ impl PageCache {
|
||||
/// returned page.
|
||||
pub fn lookup_materialized_page(
|
||||
&self,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
key: &Key,
|
||||
lsn: Lsn,
|
||||
) -> Option<(Lsn, PageReadGuard)> {
|
||||
@@ -338,8 +338,8 @@ impl PageCache {
|
||||
///
|
||||
pub fn memorize_materialized_page(
|
||||
&self,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
img: &[u8],
|
||||
|
||||
@@ -7,10 +7,10 @@
|
||||
//! Clarify that)
|
||||
//!
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::layered_repository::Timeline;
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::repository::*;
|
||||
use crate::walrecord::ZenithWalRecord;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use bytes::{Buf, Bytes};
|
||||
use postgres_ffi::v14::pg_constants;
|
||||
@@ -570,7 +570,7 @@ impl<'a> DatadirModification<'a> {
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
rec: ZenithWalRecord,
|
||||
rec: NeonWalRecord,
|
||||
) -> Result<()> {
|
||||
ensure!(rel.relnode != 0, "invalid relnode");
|
||||
self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
|
||||
@@ -583,7 +583,7 @@ impl<'a> DatadirModification<'a> {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
rec: ZenithWalRecord,
|
||||
rec: NeonWalRecord,
|
||||
) -> Result<()> {
|
||||
self.put(
|
||||
slru_block_to_key(kind, segno, blknum),
|
||||
@@ -1398,16 +1398,12 @@ fn is_slru_block_key(key: Key) -> bool {
|
||||
&& key.field6 != 0xffffffff // and not SlruSegSize
|
||||
}
|
||||
|
||||
//
|
||||
//-- Tests that should work the same with any Repository/Timeline implementation.
|
||||
//
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn create_test_timeline(
|
||||
repo: &crate::layered_repository::Repository,
|
||||
timeline_id: utils::zid::ZTimelineId,
|
||||
tenant: &crate::tenant::Tenant,
|
||||
timeline_id: utils::id::TimelineId,
|
||||
) -> Result<std::sync::Arc<Timeline>> {
|
||||
let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?;
|
||||
let tline = tenant.create_empty_timeline(timeline_id, Lsn(8))?;
|
||||
let mut m = tline.begin_modification(Lsn(8));
|
||||
m.init_empty()?;
|
||||
m.commit()?;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::walrecord::ZenithWalRecord;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{bail, Result};
|
||||
use byteorder::{ByteOrder, BE};
|
||||
use bytes::Bytes;
|
||||
@@ -157,7 +157,7 @@ pub enum Value {
|
||||
/// replayed get the full value. Replaying the WAL record
|
||||
/// might need a previous version of the value (if will_init()
|
||||
/// returns false), or it may be replayed stand-alone (true).
|
||||
WalRecord(ZenithWalRecord),
|
||||
WalRecord(NeonWalRecord),
|
||||
}
|
||||
|
||||
impl Value {
|
||||
|
||||
@@ -37,7 +37,7 @@
|
||||
//! | access to this storage |
|
||||
//! +------------------------+
|
||||
//!
|
||||
//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so.
|
||||
//! First, during startup, the pageserver inits the storage sync task with the async loop, or leaves the loop uninitialised, if configured so.
|
||||
//! The loop inits the storage connection and checks the remote files stored.
|
||||
//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
|
||||
//! Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote only timelines to pageserver, so it can
|
||||
@@ -46,10 +46,10 @@
|
||||
//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata.
|
||||
//! If the storage sync loop was successfully started before, pageserver schedules the layer files and the updated metadata file for upload, every time a layer is flushed to disk.
|
||||
//! The uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either).
|
||||
//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
|
||||
//! See [`crate::tenant`] for the upload calls and the adjacent logic.
|
||||
//!
|
||||
//! Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`],
|
||||
//! submitted via [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state.
|
||||
//! Synchronization logic is able to communicate back with updated timeline sync states, submitted via [`crate::tenant_mgr::attach_local_tenants`] function.
|
||||
//! Tenant manager applies corresponding timeline updates in pageserver's in-memory state.
|
||||
//! Such submissions happen in two cases:
|
||||
//! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future
|
||||
//! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory
|
||||
@@ -68,7 +68,7 @@
|
||||
//! Pageserver maintains similar to the local file structure remotely: all layer files are uploaded with the same names under the same directory structure.
|
||||
//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexPart`], containing the list of remote files.
|
||||
//! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download.
|
||||
//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`],
|
||||
//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`TenantId`] and [`TimelineId`],
|
||||
//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrieve its shard contents, if needed, same as any layer files.
|
||||
//!
|
||||
//! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed.
|
||||
@@ -145,7 +145,6 @@ mod upload;
|
||||
|
||||
use std::{
|
||||
collections::{hash_map, HashMap, HashSet, VecDeque},
|
||||
ffi::OsStr,
|
||||
fmt::Debug,
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
ops::ControlFlow,
|
||||
@@ -159,7 +158,6 @@ use once_cell::sync::OnceCell;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::{
|
||||
fs,
|
||||
runtime::Runtime,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
use tracing::*;
|
||||
@@ -170,244 +168,57 @@ use self::{
|
||||
index::{IndexPart, RemoteTimeline, RemoteTimelineIndex},
|
||||
upload::{upload_index_part, upload_timeline_layers, UploadedTimeline},
|
||||
};
|
||||
use crate::metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD};
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
exponential_backoff,
|
||||
layered_repository::{
|
||||
ephemeral_file::is_ephemeral_file,
|
||||
metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME},
|
||||
},
|
||||
storage_sync::{self, index::RemoteIndex},
|
||||
tenant_mgr::attach_downloaded_tenants,
|
||||
thread_mgr,
|
||||
thread_mgr::ThreadKind,
|
||||
storage_sync::index::RemoteIndex,
|
||||
task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::BACKGROUND_RUNTIME,
|
||||
tenant::metadata::{metadata_path, TimelineMetadata},
|
||||
tenant_mgr::attach_local_tenants,
|
||||
};
|
||||
use crate::{
|
||||
metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD},
|
||||
TenantTimelineValues,
|
||||
};
|
||||
|
||||
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
use self::download::download_index_parts;
|
||||
pub use self::download::gather_tenant_timelines_index_parts;
|
||||
pub use self::download::TEMP_DOWNLOAD_EXTENSION;
|
||||
|
||||
static SYNC_QUEUE: OnceCell<SyncQueue> = OnceCell::new();
|
||||
|
||||
/// A timeline status to share with pageserver's sync counterpart,
|
||||
/// after comparing local and remote timeline state.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
#[derive(Clone)]
|
||||
pub enum LocalTimelineInitStatus {
|
||||
/// The timeline has every remote layer present locally.
|
||||
/// There could be some layers requiring uploading,
|
||||
/// but this does not block the timeline from any user interaction.
|
||||
LocallyComplete,
|
||||
LocallyComplete(TimelineMetadata),
|
||||
/// A timeline has some files remotely, that are not present locally and need downloading.
|
||||
/// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only,
|
||||
/// so the data needs to be downloaded first before the timeline can be used.
|
||||
NeedsSync,
|
||||
}
|
||||
|
||||
type LocalTimelineInitStatuses = HashMap<ZTenantId, HashMap<ZTimelineId, LocalTimelineInitStatus>>;
|
||||
impl std::fmt::Debug for LocalTimelineInitStatus {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::LocallyComplete(_) => write!(f, "LocallyComplete"),
|
||||
Self::NeedsSync => write!(f, "NeedsSync"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization.
|
||||
/// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still,
|
||||
/// to simplify the received code.
|
||||
pub struct SyncStartupData {
|
||||
pub remote_index: RemoteIndex,
|
||||
pub local_timeline_init_statuses: LocalTimelineInitStatuses,
|
||||
}
|
||||
|
||||
/// Based on the config, initiates the remote storage connection and starts a separate thread
|
||||
/// that ensures that pageserver and the remote storage are in sync with each other.
|
||||
/// If no external configuration connection given, no thread or storage initialization is done.
|
||||
/// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states.
|
||||
pub fn start_local_timeline_sync(
|
||||
config: &'static PageServerConf,
|
||||
storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<SyncStartupData> {
|
||||
let local_timeline_files = local_tenant_timeline_files(config)
|
||||
.context("Failed to collect local tenant timeline files")?;
|
||||
|
||||
match storage.zip(config.remote_storage_config.as_ref()) {
|
||||
Some((storage, storage_config)) => storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
local_timeline_files,
|
||||
storage,
|
||||
storage_config.max_concurrent_syncs,
|
||||
storage_config.max_sync_errors,
|
||||
)
|
||||
.context("Failed to spawn the storage sync thread"),
|
||||
None => {
|
||||
info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
|
||||
let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new();
|
||||
for (
|
||||
ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
_,
|
||||
) in local_timeline_files
|
||||
{
|
||||
local_timeline_init_statuses
|
||||
.entry(tenant_id)
|
||||
.or_default()
|
||||
.insert(timeline_id, LocalTimelineInitStatus::LocallyComplete);
|
||||
}
|
||||
Ok(SyncStartupData {
|
||||
local_timeline_init_statuses,
|
||||
remote_index: RemoteIndex::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn local_tenant_timeline_files(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>> {
|
||||
let mut local_tenant_timeline_files = HashMap::new();
|
||||
let tenants_dir = config.tenants_path();
|
||||
for tenants_dir_entry in std::fs::read_dir(&tenants_dir)
|
||||
.with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
|
||||
{
|
||||
match &tenants_dir_entry {
|
||||
Ok(tenants_dir_entry) => {
|
||||
match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) {
|
||||
Ok(collected_files) => {
|
||||
local_tenant_timeline_files.extend(collected_files.into_iter())
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
|
||||
tenants_dir.display(),
|
||||
tenants_dir_entry,
|
||||
e
|
||||
),
|
||||
}
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
|
||||
tenants_dir_entry,
|
||||
tenants_dir.display(),
|
||||
e
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(local_tenant_timeline_files)
|
||||
}
|
||||
|
||||
fn collect_timelines_for_tenant(
|
||||
config: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>> {
|
||||
let mut timelines = HashMap::new();
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<ZTenantId>()
|
||||
.context("Could not parse tenant id out of the tenant dir name")?;
|
||||
let timelines_dir = config.timelines_path(&tenant_id);
|
||||
|
||||
for timelines_dir_entry in std::fs::read_dir(&timelines_dir).with_context(|| {
|
||||
format!(
|
||||
"Failed to list timelines dir entry for tenant {}",
|
||||
tenant_id
|
||||
)
|
||||
})? {
|
||||
match timelines_dir_entry {
|
||||
Ok(timelines_dir_entry) => {
|
||||
let timeline_path = timelines_dir_entry.path();
|
||||
match collect_timeline_files(&timeline_path) {
|
||||
Ok((timeline_id, metadata, timeline_files)) => {
|
||||
timelines.insert(
|
||||
ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
(metadata, timeline_files),
|
||||
);
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to process timeline dir contents at '{}', reason: {:?}",
|
||||
timeline_path.display(),
|
||||
e
|
||||
),
|
||||
}
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to list timelines for entry tenant {}, reason: {:?}",
|
||||
tenant_id, e
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(timelines)
|
||||
}
|
||||
|
||||
// discover timeline files and extract timeline metadata
|
||||
// NOTE: ephemeral files are excluded from the list
|
||||
fn collect_timeline_files(
|
||||
timeline_dir: &Path,
|
||||
) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet<PathBuf>)> {
|
||||
let mut timeline_files = HashSet::new();
|
||||
let mut timeline_metadata_path = None;
|
||||
|
||||
let timeline_id = timeline_dir
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<ZTimelineId>()
|
||||
.context("Could not parse timeline id out of the timeline dir name")?;
|
||||
let timeline_dir_entries =
|
||||
std::fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
|
||||
for entry in timeline_dir_entries {
|
||||
let entry_path = entry.context("Failed to list timeline dir entry")?.path();
|
||||
if entry_path.is_file() {
|
||||
if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) {
|
||||
timeline_metadata_path = Some(entry_path);
|
||||
} else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) {
|
||||
debug!("skipping ephemeral file {}", entry_path.display());
|
||||
continue;
|
||||
} else if entry_path.extension().and_then(OsStr::to_str)
|
||||
== Some(TEMP_DOWNLOAD_EXTENSION)
|
||||
{
|
||||
info!("removing temp download file at {}", entry_path.display());
|
||||
std::fs::remove_file(&entry_path).with_context(|| {
|
||||
format!(
|
||||
"failed to remove temp download file at {}",
|
||||
entry_path.display()
|
||||
)
|
||||
})?;
|
||||
} else if entry_path.extension().and_then(OsStr::to_str) == Some("temp") {
|
||||
info!("removing temp layer file at {}", entry_path.display());
|
||||
std::fs::remove_file(&entry_path).with_context(|| {
|
||||
format!(
|
||||
"failed to remove temp layer file at {}",
|
||||
entry_path.display()
|
||||
)
|
||||
})?;
|
||||
} else {
|
||||
timeline_files.insert(entry_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed
|
||||
// then attach is lost. There would be no retries for that,
|
||||
// initial collect will fail because there is no metadata.
|
||||
// We either need to start download if we see empty dir after restart or attach caller should
|
||||
// be aware of that and retry attach if awaits_download for timeline switched from true to false
|
||||
// but timelinne didn't appear locally.
|
||||
// Check what happens with remote index in that case.
|
||||
let timeline_metadata_path = match timeline_metadata_path {
|
||||
Some(path) => path,
|
||||
None => bail!("No metadata file found in the timeline directory"),
|
||||
};
|
||||
let metadata = TimelineMetadata::from_bytes(
|
||||
&std::fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?,
|
||||
)
|
||||
.context("Failed to parse timeline metadata file bytes")?;
|
||||
|
||||
Ok((timeline_id, metadata, timeline_files))
|
||||
pub local_timeline_init_statuses: TenantTimelineValues<LocalTimelineInitStatus>,
|
||||
}
|
||||
|
||||
/// Global queue of sync tasks.
|
||||
@@ -416,7 +227,7 @@ fn collect_timeline_files(
|
||||
struct SyncQueue {
|
||||
max_timelines_per_batch: NonZeroUsize,
|
||||
|
||||
queue: Mutex<VecDeque<(ZTenantTimelineId, SyncTask)>>,
|
||||
queue: Mutex<VecDeque<(TenantTimelineId, SyncTask)>>,
|
||||
condvar: Condvar,
|
||||
}
|
||||
|
||||
@@ -430,7 +241,7 @@ impl SyncQueue {
|
||||
}
|
||||
|
||||
/// Queue a new task
|
||||
fn push(&self, sync_id: ZTenantTimelineId, new_task: SyncTask) {
|
||||
fn push(&self, sync_id: TenantTimelineId, new_task: SyncTask) {
|
||||
let mut q = self.queue.lock().unwrap();
|
||||
|
||||
q.push_back((sync_id, new_task));
|
||||
@@ -443,7 +254,7 @@ impl SyncQueue {
|
||||
/// A timeline has to care to not to delete certain layers from the remote storage before the corresponding uploads happen.
|
||||
/// Other than that, due to "immutable" nature of the layers, the order of their deletion/uploading/downloading does not matter.
|
||||
/// Hence, we merge the layers together into single task per timeline and run those concurrently (with the deletion happening only after successful uploading).
|
||||
fn next_task_batch(&self) -> (HashMap<ZTenantTimelineId, SyncTaskBatch>, usize) {
|
||||
fn next_task_batch(&self) -> (HashMap<TenantTimelineId, SyncTaskBatch>, usize) {
|
||||
// Wait for the first task in blocking fashion
|
||||
let mut q = self.queue.lock().unwrap();
|
||||
while q.is_empty() {
|
||||
@@ -453,7 +264,7 @@ impl SyncQueue {
|
||||
.unwrap()
|
||||
.0;
|
||||
|
||||
if thread_mgr::is_shutdown_requested() {
|
||||
if task_mgr::is_shutdown_requested() {
|
||||
return (HashMap::new(), q.len());
|
||||
}
|
||||
}
|
||||
@@ -677,8 +488,8 @@ struct LayersDeletion {
|
||||
///
|
||||
/// Ensure that the loop is started otherwise the task is never processed.
|
||||
pub fn schedule_layer_upload(
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
layers_to_upload: HashSet<PathBuf>,
|
||||
metadata: Option<TimelineMetadata>,
|
||||
) {
|
||||
@@ -690,7 +501,7 @@ pub fn schedule_layer_upload(
|
||||
}
|
||||
};
|
||||
sync_queue.push(
|
||||
ZTenantTimelineId {
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
@@ -708,8 +519,8 @@ pub fn schedule_layer_upload(
|
||||
///
|
||||
/// Ensure that the loop is started otherwise the task is never processed.
|
||||
pub fn schedule_layer_delete(
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
layers_to_delete: HashSet<PathBuf>,
|
||||
) {
|
||||
let sync_queue = match SYNC_QUEUE.get() {
|
||||
@@ -720,7 +531,7 @@ pub fn schedule_layer_delete(
|
||||
}
|
||||
};
|
||||
sync_queue.push(
|
||||
ZTenantTimelineId {
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
@@ -740,7 +551,7 @@ pub fn schedule_layer_delete(
|
||||
/// On any failure, the task gets retried, omitting already downloaded layers.
|
||||
///
|
||||
/// Ensure that the loop is started otherwise the task is never processed.
|
||||
pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) {
|
||||
pub fn schedule_layer_download(tenant_id: TenantId, timeline_id: TimelineId) {
|
||||
debug!("Scheduling layer download for tenant {tenant_id}, timeline {timeline_id}");
|
||||
let sync_queue = match SYNC_QUEUE.get() {
|
||||
Some(queue) => queue,
|
||||
@@ -750,7 +561,7 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) {
|
||||
}
|
||||
};
|
||||
sync_queue.push(
|
||||
ZTenantTimelineId {
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
@@ -763,9 +574,9 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) {
|
||||
|
||||
/// Launch a thread to perform remote storage sync tasks.
|
||||
/// See module docs for loop step description.
|
||||
pub(super) fn spawn_storage_sync_thread(
|
||||
pub fn spawn_storage_sync_task(
|
||||
conf: &'static PageServerConf,
|
||||
local_timeline_files: HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>,
|
||||
local_timeline_files: TenantTimelineValues<(TimelineMetadata, HashSet<PathBuf>)>,
|
||||
storage: GenericRemoteStorage,
|
||||
max_concurrent_timelines_sync: NonZeroUsize,
|
||||
max_sync_errors: NonZeroU32,
|
||||
@@ -779,51 +590,69 @@ pub(super) fn spawn_storage_sync_thread(
|
||||
None => bail!("Could not get sync queue during the sync loop step, aborting"),
|
||||
};
|
||||
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.context("Failed to create storage sync runtime")?;
|
||||
// TODO we are able to "attach" empty tenants, but not doing it now since it might require big wait time:
|
||||
// * we need to list every timeline for tenant on S3, that might be a costly operation
|
||||
// * we need to download every timeline for the tenant, to activate it in memory
|
||||
//
|
||||
// When on-demand download gets merged, we're able to do this fast by storing timeline metadata only.
|
||||
let mut empty_tenants = TenantTimelineValues::<LocalTimelineInitStatus>::new();
|
||||
let mut keys_for_index_part_downloads = HashSet::new();
|
||||
let mut timelines_to_sync = HashMap::new();
|
||||
|
||||
let applicable_index_parts = runtime.block_on(download_index_parts(
|
||||
for (tenant_id, timeline_data) in local_timeline_files.0 {
|
||||
if timeline_data.is_empty() {
|
||||
let _ = empty_tenants.0.entry(tenant_id).or_default();
|
||||
} else {
|
||||
for (timeline_id, timeline_data) in timeline_data {
|
||||
let id = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
keys_for_index_part_downloads.insert(id);
|
||||
timelines_to_sync.insert(id, timeline_data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let applicable_index_parts = BACKGROUND_RUNTIME.block_on(download_index_parts(
|
||||
conf,
|
||||
&storage,
|
||||
local_timeline_files.keys().copied().collect(),
|
||||
keys_for_index_part_downloads,
|
||||
));
|
||||
|
||||
let remote_index = RemoteIndex::from_parts(conf, applicable_index_parts)?;
|
||||
|
||||
let local_timeline_init_statuses = schedule_first_sync_tasks(
|
||||
&mut runtime.block_on(remote_index.write()),
|
||||
let mut local_timeline_init_statuses = schedule_first_sync_tasks(
|
||||
&mut BACKGROUND_RUNTIME.block_on(remote_index.write()),
|
||||
sync_queue,
|
||||
local_timeline_files,
|
||||
timelines_to_sync,
|
||||
);
|
||||
local_timeline_init_statuses
|
||||
.0
|
||||
.extend(empty_tenants.0.into_iter());
|
||||
|
||||
let remote_index_clone = remote_index.clone();
|
||||
thread_mgr::spawn(
|
||||
ThreadKind::StorageSync,
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::StorageSync,
|
||||
None,
|
||||
None,
|
||||
"Remote storage sync thread",
|
||||
"Remote storage sync task",
|
||||
false,
|
||||
move || {
|
||||
async move {
|
||||
storage_sync_loop(
|
||||
runtime,
|
||||
conf,
|
||||
(storage, remote_index_clone, sync_queue),
|
||||
max_sync_errors,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
.context("Failed to spawn remote storage sync thread")?;
|
||||
);
|
||||
Ok(SyncStartupData {
|
||||
remote_index,
|
||||
local_timeline_init_statuses,
|
||||
})
|
||||
}
|
||||
|
||||
fn storage_sync_loop(
|
||||
runtime: Runtime,
|
||||
async fn storage_sync_loop(
|
||||
conf: &'static PageServerConf,
|
||||
(storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue),
|
||||
max_sync_errors: NonZeroU32,
|
||||
@@ -834,7 +663,7 @@ fn storage_sync_loop(
|
||||
|
||||
let (batched_tasks, remaining_queue_length) = sync_queue.next_task_batch();
|
||||
|
||||
if thread_mgr::is_shutdown_requested() {
|
||||
if task_mgr::is_shutdown_requested() {
|
||||
info!("Shutdown requested, stopping");
|
||||
break;
|
||||
}
|
||||
@@ -848,20 +677,19 @@ fn storage_sync_loop(
|
||||
}
|
||||
|
||||
// Concurrently perform all the tasks in the batch
|
||||
let loop_step = runtime.block_on(async {
|
||||
tokio::select! {
|
||||
step = process_batches(
|
||||
conf,
|
||||
max_sync_errors,
|
||||
loop_storage,
|
||||
&index,
|
||||
batched_tasks,
|
||||
sync_queue,
|
||||
)
|
||||
.instrument(info_span!("storage_sync_loop_step")) => ControlFlow::Continue(step),
|
||||
_ = thread_mgr::shutdown_watcher() => ControlFlow::Break(()),
|
||||
}
|
||||
});
|
||||
let loop_step = tokio::select! {
|
||||
step = process_batches(
|
||||
conf,
|
||||
max_sync_errors,
|
||||
loop_storage,
|
||||
&index,
|
||||
batched_tasks,
|
||||
sync_queue,
|
||||
)
|
||||
.instrument(info_span!("storage_sync_loop_step")) => ControlFlow::Continue(step)
|
||||
,
|
||||
_ = task_mgr::shutdown_watcher() => ControlFlow::Break(()),
|
||||
};
|
||||
|
||||
match loop_step {
|
||||
ControlFlow::Continue(updated_tenants) => {
|
||||
@@ -872,11 +700,8 @@ fn storage_sync_loop(
|
||||
"Sync loop step completed, {} new tenant state update(s)",
|
||||
updated_tenants.len()
|
||||
);
|
||||
let mut timelines_to_attach: HashMap<
|
||||
ZTenantId,
|
||||
Vec<(ZTimelineId, TimelineMetadata)>,
|
||||
> = HashMap::new();
|
||||
let index_accessor = runtime.block_on(index.read());
|
||||
let mut timelines_to_attach = TenantTimelineValues::new();
|
||||
let index_accessor = index.read().await;
|
||||
for tenant_id in updated_tenants {
|
||||
let tenant_entry = match index_accessor.tenant_entry(&tenant_id) {
|
||||
Some(tenant_entry) => tenant_entry,
|
||||
@@ -889,19 +714,19 @@ fn storage_sync_loop(
|
||||
};
|
||||
|
||||
if tenant_entry.has_in_progress_downloads() {
|
||||
info!("Tenant {tenant_id} has pending timeline downloads, skipping repository registration");
|
||||
info!("Tenant {tenant_id} has pending timeline downloads, skipping tenant registration");
|
||||
continue;
|
||||
} else {
|
||||
info!(
|
||||
"Tenant {tenant_id} download completed. Picking to register in repository"
|
||||
"Tenant {tenant_id} download completed. Picking to register in tenant"
|
||||
);
|
||||
// Here we assume that if tenant has no in-progress downloads that
|
||||
// means that it is the last completed timeline download that triggered
|
||||
// sync status update. So we look at the index for available timelines
|
||||
// and register them all at once in a repository for download
|
||||
// to be submitted in a single operation to repository
|
||||
// and register them all at once in a tenant for download
|
||||
// to be submitted in a single operation to tenant
|
||||
// so it can apply them at once to internal timeline map.
|
||||
timelines_to_attach.insert(
|
||||
timelines_to_attach.0.insert(
|
||||
tenant_id,
|
||||
tenant_entry
|
||||
.iter()
|
||||
@@ -912,7 +737,7 @@ fn storage_sync_loop(
|
||||
}
|
||||
drop(index_accessor);
|
||||
// Batch timeline download registration to ensure that the external registration code won't block any running tasks before.
|
||||
attach_downloaded_tenants(conf, &index, timelines_to_attach);
|
||||
attach_local_tenants(conf, &index, timelines_to_attach);
|
||||
}
|
||||
}
|
||||
ControlFlow::Break(()) => {
|
||||
@@ -941,9 +766,9 @@ async fn process_batches(
|
||||
max_sync_errors: NonZeroU32,
|
||||
storage: GenericRemoteStorage,
|
||||
index: &RemoteIndex,
|
||||
batched_tasks: HashMap<ZTenantTimelineId, SyncTaskBatch>,
|
||||
batched_tasks: HashMap<TenantTimelineId, SyncTaskBatch>,
|
||||
sync_queue: &SyncQueue,
|
||||
) -> HashSet<ZTenantId> {
|
||||
) -> HashSet<TenantId> {
|
||||
let mut sync_results = batched_tasks
|
||||
.into_iter()
|
||||
.map(|(sync_id, batch)| {
|
||||
@@ -983,7 +808,7 @@ async fn process_sync_task_batch(
|
||||
conf: &'static PageServerConf,
|
||||
(storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue),
|
||||
max_sync_errors: NonZeroU32,
|
||||
sync_id: ZTenantTimelineId,
|
||||
sync_id: TenantTimelineId,
|
||||
batch: SyncTaskBatch,
|
||||
) -> DownloadStatus {
|
||||
let sync_start = Instant::now();
|
||||
@@ -1124,7 +949,7 @@ async fn download_timeline_data(
|
||||
conf: &'static PageServerConf,
|
||||
(storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue),
|
||||
current_remote_timeline: Option<&RemoteTimeline>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
sync_id: TenantTimelineId,
|
||||
new_download_data: SyncData<LayersDownload>,
|
||||
sync_start: Instant,
|
||||
task_name: &str,
|
||||
@@ -1174,7 +999,7 @@ async fn download_timeline_data(
|
||||
|
||||
async fn update_local_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
sync_id: ZTenantTimelineId,
|
||||
sync_id: TenantTimelineId,
|
||||
remote_timeline: Option<&RemoteTimeline>,
|
||||
) -> anyhow::Result<()> {
|
||||
let remote_metadata = match remote_timeline {
|
||||
@@ -1206,18 +1031,12 @@ async fn update_local_metadata(
|
||||
info!("Updating local timeline metadata from remote timeline: local disk_consistent_lsn={local_lsn:?}, remote disk_consistent_lsn={remote_lsn}");
|
||||
// clone because spawn_blocking requires static lifetime
|
||||
let cloned_metadata = remote_metadata.to_owned();
|
||||
let ZTenantTimelineId {
|
||||
let TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = sync_id;
|
||||
tokio::task::spawn_blocking(move || {
|
||||
crate::layered_repository::save_metadata(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
&cloned_metadata,
|
||||
true,
|
||||
)
|
||||
crate::tenant::save_metadata(conf, timeline_id, tenant_id, &cloned_metadata, true)
|
||||
})
|
||||
.await
|
||||
.with_context(|| {
|
||||
@@ -1242,7 +1061,7 @@ async fn update_local_metadata(
|
||||
async fn delete_timeline_data(
|
||||
conf: &'static PageServerConf,
|
||||
(storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue),
|
||||
sync_id: ZTenantTimelineId,
|
||||
sync_id: TenantTimelineId,
|
||||
mut new_delete_data: SyncData<LayersDeletion>,
|
||||
sync_start: Instant,
|
||||
task_name: &str,
|
||||
@@ -1285,7 +1104,7 @@ async fn upload_timeline_data(
|
||||
conf: &'static PageServerConf,
|
||||
(storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue),
|
||||
current_remote_timeline: Option<&RemoteTimeline>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
sync_id: TenantTimelineId,
|
||||
new_upload_data: SyncData<LayersUpload>,
|
||||
sync_start: Instant,
|
||||
task_name: &str,
|
||||
@@ -1344,7 +1163,7 @@ async fn update_remote_data(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
index: &RemoteIndex,
|
||||
sync_id: ZTenantTimelineId,
|
||||
sync_id: TenantTimelineId,
|
||||
update: RemoteDataUpdate<'_>,
|
||||
) -> anyhow::Result<()> {
|
||||
let updated_remote_timeline = {
|
||||
@@ -1442,12 +1261,11 @@ async fn validate_task_retries(
|
||||
fn schedule_first_sync_tasks(
|
||||
index: &mut RemoteTimelineIndex,
|
||||
sync_queue: &SyncQueue,
|
||||
local_timeline_files: HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>,
|
||||
) -> LocalTimelineInitStatuses {
|
||||
let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new();
|
||||
local_timeline_files: HashMap<TenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>,
|
||||
) -> TenantTimelineValues<LocalTimelineInitStatus> {
|
||||
let mut local_timeline_init_statuses = TenantTimelineValues::new();
|
||||
|
||||
let mut new_sync_tasks =
|
||||
VecDeque::with_capacity(local_timeline_files.len().max(local_timeline_files.len()));
|
||||
let mut new_sync_tasks = VecDeque::with_capacity(local_timeline_files.len());
|
||||
|
||||
for (sync_id, (local_metadata, local_files)) in local_timeline_files {
|
||||
match index.timeline_entry_mut(&sync_id) {
|
||||
@@ -1459,18 +1277,27 @@ fn schedule_first_sync_tasks(
|
||||
local_files,
|
||||
remote_timeline,
|
||||
);
|
||||
let was_there = local_timeline_init_statuses
|
||||
match local_timeline_init_statuses
|
||||
.0
|
||||
.entry(sync_id.tenant_id)
|
||||
.or_default()
|
||||
.insert(sync_id.timeline_id, timeline_status);
|
||||
|
||||
if was_there.is_some() {
|
||||
// defensive check
|
||||
warn!(
|
||||
"Overwriting timeline init sync status. Status {timeline_status:?}, timeline {}",
|
||||
sync_id.timeline_id
|
||||
);
|
||||
.entry(sync_id.timeline_id)
|
||||
{
|
||||
hash_map::Entry::Occupied(mut o) => {
|
||||
{
|
||||
// defensive check
|
||||
warn!(
|
||||
"Overwriting timeline init sync status. Status {timeline_status:?}, timeline {}",
|
||||
sync_id.timeline_id
|
||||
);
|
||||
}
|
||||
o.insert(timeline_status);
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(timeline_status);
|
||||
}
|
||||
}
|
||||
|
||||
remote_timeline.awaits_download = awaits_download;
|
||||
}
|
||||
None => {
|
||||
@@ -1481,15 +1308,16 @@ fn schedule_first_sync_tasks(
|
||||
SyncTask::upload(LayersUpload {
|
||||
layers_to_upload: local_files,
|
||||
uploaded_layers: HashSet::new(),
|
||||
metadata: Some(local_metadata),
|
||||
metadata: Some(local_metadata.clone()),
|
||||
}),
|
||||
));
|
||||
local_timeline_init_statuses
|
||||
.0
|
||||
.entry(sync_id.tenant_id)
|
||||
.or_default()
|
||||
.insert(
|
||||
sync_id.timeline_id,
|
||||
LocalTimelineInitStatus::LocallyComplete,
|
||||
LocalTimelineInitStatus::LocallyComplete(local_metadata),
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1503,8 +1331,8 @@ fn schedule_first_sync_tasks(
|
||||
|
||||
/// bool in return value stands for awaits_download
|
||||
fn compare_local_and_remote_timeline(
|
||||
new_sync_tasks: &mut VecDeque<(ZTenantTimelineId, SyncTask)>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
new_sync_tasks: &mut VecDeque<(TenantTimelineId, SyncTask)>,
|
||||
sync_id: TenantTimelineId,
|
||||
local_metadata: TimelineMetadata,
|
||||
local_files: HashSet<PathBuf>,
|
||||
remote_entry: &RemoteTimeline,
|
||||
@@ -1523,7 +1351,10 @@ fn compare_local_and_remote_timeline(
|
||||
// we do not need to manipulate with remote consistent lsn here
|
||||
// because it will be updated when sync will be completed
|
||||
} else {
|
||||
(LocalTimelineInitStatus::LocallyComplete, false)
|
||||
(
|
||||
LocalTimelineInitStatus::LocallyComplete(local_metadata.clone()),
|
||||
false,
|
||||
)
|
||||
};
|
||||
|
||||
let layers_to_upload = local_files
|
||||
@@ -1546,7 +1377,7 @@ fn compare_local_and_remote_timeline(
|
||||
}
|
||||
|
||||
fn register_sync_status(
|
||||
sync_id: ZTenantTimelineId,
|
||||
sync_id: TenantTimelineId,
|
||||
sync_start: Instant,
|
||||
sync_name: &str,
|
||||
sync_status: Option<bool>,
|
||||
@@ -1572,13 +1403,13 @@ fn register_sync_status(
|
||||
mod test_utils {
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::layered_repository::repo_harness::RepoHarness;
|
||||
use crate::tenant::harness::TenantHarness;
|
||||
|
||||
use super::*;
|
||||
|
||||
pub(super) async fn create_local_timeline(
|
||||
harness: &RepoHarness<'_>,
|
||||
timeline_id: ZTimelineId,
|
||||
harness: &TenantHarness<'_>,
|
||||
timeline_id: TimelineId,
|
||||
filenames: &[&str],
|
||||
metadata: TimelineMetadata,
|
||||
) -> anyhow::Result<LayersUpload> {
|
||||
@@ -1617,14 +1448,14 @@ mod test_utils {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::test_utils::dummy_metadata;
|
||||
use crate::layered_repository::repo_harness::TIMELINE_ID;
|
||||
use crate::tenant::harness::TIMELINE_ID;
|
||||
use hex_literal::hex;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::*;
|
||||
|
||||
const TEST_SYNC_ID: ZTenantTimelineId = ZTenantTimelineId {
|
||||
tenant_id: ZTenantId::from_array(hex!("11223344556677881122334455667788")),
|
||||
const TEST_SYNC_ID: TenantTimelineId = TenantTimelineId {
|
||||
tenant_id: TenantId::from_array(hex!("11223344556677881122334455667788")),
|
||||
timeline_id: TIMELINE_ID,
|
||||
};
|
||||
|
||||
@@ -1633,12 +1464,12 @@ mod tests {
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
assert_eq!(sync_queue.len(), 0);
|
||||
|
||||
let sync_id_2 = ZTenantTimelineId {
|
||||
tenant_id: ZTenantId::from_array(hex!("22223344556677881122334455667788")),
|
||||
let sync_id_2 = TenantTimelineId {
|
||||
tenant_id: TenantId::from_array(hex!("22223344556677881122334455667788")),
|
||||
timeline_id: TIMELINE_ID,
|
||||
};
|
||||
let sync_id_3 = ZTenantTimelineId {
|
||||
tenant_id: ZTenantId::from_array(hex!("33223344556677881122334455667788")),
|
||||
let sync_id_3 = TenantTimelineId {
|
||||
tenant_id: TenantId::from_array(hex!("33223344556677881122334455667788")),
|
||||
timeline_id: TIMELINE_ID,
|
||||
};
|
||||
assert!(sync_id_2 != TEST_SYNC_ID);
|
||||
@@ -1760,8 +1591,8 @@ mod tests {
|
||||
layers_to_skip: HashSet::from([PathBuf::from("sk4")]),
|
||||
};
|
||||
|
||||
let sync_id_2 = ZTenantTimelineId {
|
||||
tenant_id: ZTenantId::from_array(hex!("22223344556677881122334455667788")),
|
||||
let sync_id_2 = TenantTimelineId {
|
||||
tenant_id: TenantId::from_array(hex!("22223344556677881122334455667788")),
|
||||
timeline_id: TIMELINE_ID,
|
||||
};
|
||||
assert!(sync_id_2 != TEST_SYNC_ID);
|
||||
|
||||
@@ -8,7 +8,7 @@ use tracing::{debug, error, info};
|
||||
|
||||
use crate::storage_sync::{SyncQueue, SyncTask};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::zid::ZTenantTimelineId;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use super::{LayersDeletion, SyncData};
|
||||
|
||||
@@ -17,7 +17,7 @@ use super::{LayersDeletion, SyncData};
|
||||
pub(super) async fn delete_timeline_layers(
|
||||
storage: &GenericRemoteStorage,
|
||||
sync_queue: &SyncQueue,
|
||||
sync_id: ZTenantTimelineId,
|
||||
sync_id: TenantTimelineId,
|
||||
mut delete_data: SyncData<LayersDeletion>,
|
||||
) -> bool {
|
||||
if !delete_data.data.deletion_registered {
|
||||
@@ -112,8 +112,8 @@ mod tests {
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
layered_repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
storage_sync::test_utils::{create_local_timeline, dummy_metadata},
|
||||
tenant::harness::{TenantHarness, TIMELINE_ID},
|
||||
};
|
||||
use remote_storage::{LocalFs, RemoteStorage};
|
||||
|
||||
@@ -121,9 +121,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_timeline_negative() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("delete_timeline_negative")?;
|
||||
let harness = TenantHarness::create("delete_timeline_negative")?;
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
tempdir()?.path().to_path_buf(),
|
||||
harness.conf.workdir.clone(),
|
||||
@@ -154,10 +154,10 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_timeline() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("delete_timeline")?;
|
||||
let harness = TenantHarness::create("delete_timeline")?;
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let layer_files = ["a", "b", "c", "d"];
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
tempdir()?.path().to_path_buf(),
|
||||
|
||||
@@ -17,31 +17,30 @@ use tokio::{
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask,
|
||||
config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path,
|
||||
TEMP_FILE_SUFFIX,
|
||||
};
|
||||
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
use super::{
|
||||
index::{IndexPart, RemoteTimeline},
|
||||
LayersDownload, SyncData, SyncQueue,
|
||||
};
|
||||
|
||||
pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
|
||||
|
||||
// We collect timelines remotely available for each tenant
|
||||
// in case we failed to gather all index parts (due to an error)
|
||||
// Poisoned variant is returned.
|
||||
// When data is received succesfully without errors Present variant is used.
|
||||
pub enum TenantIndexParts {
|
||||
Poisoned {
|
||||
present: HashMap<ZTimelineId, IndexPart>,
|
||||
missing: HashSet<ZTimelineId>,
|
||||
present: HashMap<TimelineId, IndexPart>,
|
||||
missing: HashSet<TimelineId>,
|
||||
},
|
||||
Present(HashMap<ZTimelineId, IndexPart>),
|
||||
Present(HashMap<TimelineId, IndexPart>),
|
||||
}
|
||||
|
||||
impl TenantIndexParts {
|
||||
fn add_poisoned(&mut self, timeline_id: ZTimelineId) {
|
||||
fn add_poisoned(&mut self, timeline_id: TimelineId) {
|
||||
match self {
|
||||
TenantIndexParts::Poisoned { missing, .. } => {
|
||||
missing.insert(timeline_id);
|
||||
@@ -65,9 +64,9 @@ impl Default for TenantIndexParts {
|
||||
pub async fn download_index_parts(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
keys: HashSet<ZTenantTimelineId>,
|
||||
) -> HashMap<ZTenantId, TenantIndexParts> {
|
||||
let mut index_parts: HashMap<ZTenantId, TenantIndexParts> = HashMap::new();
|
||||
keys: HashSet<TenantTimelineId>,
|
||||
) -> HashMap<TenantId, TenantIndexParts> {
|
||||
let mut index_parts: HashMap<TenantId, TenantIndexParts> = HashMap::new();
|
||||
|
||||
let mut part_downloads = keys
|
||||
.into_iter()
|
||||
@@ -113,8 +112,8 @@ pub async fn download_index_parts(
|
||||
pub async fn gather_tenant_timelines_index_parts(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<HashMap<ZTimelineId, IndexPart>> {
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<HashMap<TimelineId, IndexPart>> {
|
||||
let tenant_path = conf.timelines_path(&tenant_id);
|
||||
let timeline_sync_ids = get_timeline_sync_ids(storage, &tenant_path, tenant_id)
|
||||
.await
|
||||
@@ -136,7 +135,7 @@ pub async fn gather_tenant_timelines_index_parts(
|
||||
async fn download_index_part(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
sync_id: ZTenantTimelineId,
|
||||
sync_id: TenantTimelineId,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
@@ -198,7 +197,7 @@ pub(super) async fn download_timeline_layers<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
sync_queue: &'a SyncQueue,
|
||||
remote_timeline: Option<&'a RemoteTimeline>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
sync_id: TenantTimelineId,
|
||||
mut download_data: SyncData<LayersDownload>,
|
||||
) -> DownloadedTimeline {
|
||||
let remote_timeline = match remote_timeline {
|
||||
@@ -251,7 +250,7 @@ pub(super) async fn download_timeline_layers<'a>(
|
||||
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
|
||||
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
||||
let temp_file_path =
|
||||
path_with_suffix_extension(&layer_destination_path, TEMP_DOWNLOAD_EXTENSION);
|
||||
path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX);
|
||||
|
||||
let mut destination_file =
|
||||
fs::File::create(&temp_file_path).await.with_context(|| {
|
||||
@@ -336,7 +335,7 @@ pub(super) async fn download_timeline_layers<'a>(
|
||||
}
|
||||
|
||||
// fsync timeline directory which is a parent directory for downloaded files
|
||||
let ZTenantTimelineId {
|
||||
let TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = &sync_id;
|
||||
@@ -367,8 +366,8 @@ pub(super) async fn download_timeline_layers<'a>(
|
||||
async fn get_timeline_sync_ids(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_path: &Path,
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<HashSet<ZTenantTimelineId>> {
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<HashSet<TenantTimelineId>> {
|
||||
let tenant_storage_path = storage.remote_object_id(tenant_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get tenant storage path for local path '{}'",
|
||||
@@ -396,11 +395,11 @@ async fn get_timeline_sync_ids(
|
||||
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
|
||||
})?;
|
||||
|
||||
let timeline_id: ZTimelineId = object_name.parse().with_context(|| {
|
||||
let timeline_id: TimelineId = object_name.parse().with_context(|| {
|
||||
format!("failed to parse object name into timeline id '{object_name}'")
|
||||
})?;
|
||||
|
||||
sync_ids.insert(ZTenantTimelineId {
|
||||
sync_ids.insert(TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
});
|
||||
@@ -426,21 +425,21 @@ mod tests {
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
layered_repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
storage_sync::{
|
||||
index::RelativePath,
|
||||
test_utils::{create_local_timeline, dummy_metadata},
|
||||
},
|
||||
tenant::harness::{TenantHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_timeline() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("download_timeline")?;
|
||||
let harness = TenantHarness::create("download_timeline")?;
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"];
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
tempdir()?.path().to_owned(),
|
||||
@@ -538,9 +537,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_timeline_negatives() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("download_timeline_negatives")?;
|
||||
let harness = TenantHarness::create("download_timeline_negatives")?;
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
tempdir()?.path().to_owned(),
|
||||
harness.conf.workdir.clone(),
|
||||
@@ -597,8 +596,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_download_index_part() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("test_download_index_part")?;
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let harness = TenantHarness::create("test_download_index_part")?;
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
tempdir()?.path().to_owned(),
|
||||
|
||||
@@ -15,10 +15,10 @@ use serde_with::{serde_as, DisplayFromStr};
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::log::warn;
|
||||
|
||||
use crate::{config::PageServerConf, layered_repository::metadata::TimelineMetadata};
|
||||
use crate::{config::PageServerConf, tenant::metadata::TimelineMetadata};
|
||||
use utils::{
|
||||
id::{TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||
};
|
||||
|
||||
use super::download::TenantIndexParts;
|
||||
@@ -49,7 +49,7 @@ impl RelativePath {
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct TenantEntry(HashMap<ZTimelineId, RemoteTimeline>);
|
||||
pub struct TenantEntry(HashMap<TimelineId, RemoteTimeline>);
|
||||
|
||||
impl TenantEntry {
|
||||
pub fn has_in_progress_downloads(&self) -> bool {
|
||||
@@ -59,7 +59,7 @@ impl TenantEntry {
|
||||
}
|
||||
|
||||
impl Deref for TenantEntry {
|
||||
type Target = HashMap<ZTimelineId, RemoteTimeline>;
|
||||
type Target = HashMap<TimelineId, RemoteTimeline>;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
@@ -72,8 +72,8 @@ impl DerefMut for TenantEntry {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<HashMap<ZTimelineId, RemoteTimeline>> for TenantEntry {
|
||||
fn from(inner: HashMap<ZTimelineId, RemoteTimeline>) -> Self {
|
||||
impl From<HashMap<TimelineId, RemoteTimeline>> for TenantEntry {
|
||||
fn from(inner: HashMap<TimelineId, RemoteTimeline>) -> Self {
|
||||
Self(inner)
|
||||
}
|
||||
}
|
||||
@@ -81,7 +81,7 @@ impl From<HashMap<ZTimelineId, RemoteTimeline>> for TenantEntry {
|
||||
/// An index to track tenant files that exist on the remote storage.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct RemoteTimelineIndex {
|
||||
entries: HashMap<ZTenantId, TenantEntry>,
|
||||
entries: HashMap<TenantId, TenantEntry>,
|
||||
}
|
||||
|
||||
/// A wrapper to synchronize the access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`].
|
||||
@@ -91,9 +91,9 @@ pub struct RemoteIndex(Arc<RwLock<RemoteTimelineIndex>>);
|
||||
impl RemoteIndex {
|
||||
pub fn from_parts(
|
||||
conf: &'static PageServerConf,
|
||||
index_parts: HashMap<ZTenantId, TenantIndexParts>,
|
||||
index_parts: HashMap<TenantId, TenantIndexParts>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let mut entries: HashMap<ZTenantId, TenantEntry> = HashMap::new();
|
||||
let mut entries: HashMap<TenantId, TenantEntry> = HashMap::new();
|
||||
|
||||
for (tenant_id, index_parts) in index_parts {
|
||||
match index_parts {
|
||||
@@ -136,30 +136,30 @@ impl Clone for RemoteIndex {
|
||||
impl RemoteTimelineIndex {
|
||||
pub fn timeline_entry(
|
||||
&self,
|
||||
ZTenantTimelineId {
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: &ZTenantTimelineId,
|
||||
}: &TenantTimelineId,
|
||||
) -> Option<&RemoteTimeline> {
|
||||
self.entries.get(tenant_id)?.get(timeline_id)
|
||||
}
|
||||
|
||||
pub fn timeline_entry_mut(
|
||||
&mut self,
|
||||
ZTenantTimelineId {
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: &ZTenantTimelineId,
|
||||
}: &TenantTimelineId,
|
||||
) -> Option<&mut RemoteTimeline> {
|
||||
self.entries.get_mut(tenant_id)?.get_mut(timeline_id)
|
||||
}
|
||||
|
||||
pub fn add_timeline_entry(
|
||||
&mut self,
|
||||
ZTenantTimelineId {
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: ZTenantTimelineId,
|
||||
}: TenantTimelineId,
|
||||
entry: RemoteTimeline,
|
||||
) {
|
||||
self.entries
|
||||
@@ -170,10 +170,10 @@ impl RemoteTimelineIndex {
|
||||
|
||||
pub fn remove_timeline_entry(
|
||||
&mut self,
|
||||
ZTenantTimelineId {
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: ZTenantTimelineId,
|
||||
}: TenantTimelineId,
|
||||
) -> Option<RemoteTimeline> {
|
||||
self.entries
|
||||
.entry(tenant_id)
|
||||
@@ -181,25 +181,25 @@ impl RemoteTimelineIndex {
|
||||
.remove(&timeline_id)
|
||||
}
|
||||
|
||||
pub fn tenant_entry(&self, tenant_id: &ZTenantId) -> Option<&TenantEntry> {
|
||||
pub fn tenant_entry(&self, tenant_id: &TenantId) -> Option<&TenantEntry> {
|
||||
self.entries.get(tenant_id)
|
||||
}
|
||||
|
||||
pub fn tenant_entry_mut(&mut self, tenant_id: &ZTenantId) -> Option<&mut TenantEntry> {
|
||||
pub fn tenant_entry_mut(&mut self, tenant_id: &TenantId) -> Option<&mut TenantEntry> {
|
||||
self.entries.get_mut(tenant_id)
|
||||
}
|
||||
|
||||
pub fn add_tenant_entry(&mut self, tenant_id: ZTenantId) -> &mut TenantEntry {
|
||||
pub fn add_tenant_entry(&mut self, tenant_id: TenantId) -> &mut TenantEntry {
|
||||
self.entries.entry(tenant_id).or_default()
|
||||
}
|
||||
|
||||
pub fn remove_tenant_entry(&mut self, tenant_id: &ZTenantId) -> Option<TenantEntry> {
|
||||
pub fn remove_tenant_entry(&mut self, tenant_id: &TenantId) -> Option<TenantEntry> {
|
||||
self.entries.remove(tenant_id)
|
||||
}
|
||||
|
||||
pub fn set_awaits_download(
|
||||
&mut self,
|
||||
id: &ZTenantTimelineId,
|
||||
id: &TenantTimelineId,
|
||||
awaits_download: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
self.timeline_entry_mut(id)
|
||||
@@ -340,11 +340,11 @@ mod tests {
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use super::*;
|
||||
use crate::layered_repository::repo_harness::{RepoHarness, TIMELINE_ID};
|
||||
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
|
||||
|
||||
#[test]
|
||||
fn index_part_conversion() {
|
||||
let harness = RepoHarness::create("index_part_conversion").unwrap();
|
||||
let harness = TenantHarness::create("index_part_conversion").unwrap();
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let metadata =
|
||||
TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1));
|
||||
@@ -462,7 +462,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn index_part_conversion_negatives() {
|
||||
let harness = RepoHarness::create("index_part_conversion_negatives").unwrap();
|
||||
let harness = TenantHarness::create("index_part_conversion_negatives").unwrap();
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let metadata =
|
||||
TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1));
|
||||
|
||||
@@ -8,22 +8,20 @@ use remote_storage::GenericRemoteStorage;
|
||||
use tokio::fs;
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
use utils::zid::ZTenantTimelineId;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use super::{
|
||||
index::{IndexPart, RemoteTimeline},
|
||||
LayersUpload, SyncData, SyncQueue,
|
||||
};
|
||||
use crate::metrics::NO_LAYERS_UPLOAD;
|
||||
use crate::{
|
||||
config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask,
|
||||
};
|
||||
use crate::{config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path};
|
||||
|
||||
/// Serializes and uploads the given index part data to the remote storage.
|
||||
pub(super) async fn upload_index_part(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
sync_id: ZTenantTimelineId,
|
||||
sync_id: TenantTimelineId,
|
||||
index_part: IndexPart,
|
||||
) -> anyhow::Result<()> {
|
||||
let index_part_bytes = serde_json::to_vec(&index_part)
|
||||
@@ -60,7 +58,7 @@ pub(super) async fn upload_timeline_layers<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
sync_queue: &SyncQueue,
|
||||
remote_timeline: Option<&'a RemoteTimeline>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
sync_id: TenantTimelineId,
|
||||
mut upload_data: SyncData<LayersUpload>,
|
||||
) -> UploadedTimeline {
|
||||
let upload = &mut upload_data.data;
|
||||
@@ -153,7 +151,7 @@ pub(super) async fn upload_timeline_layers<'a>(
|
||||
// We have run the upload sync task, but the file we wanted to upload is gone.
|
||||
// This is "fine" due the asynchronous nature of the sync loop: it only reacts to events and might need to
|
||||
// retry the upload tasks, if S3 or network is down: but during this time, pageserver might still operate and
|
||||
// run compaction/gc threads, removing redundant files from disk.
|
||||
// run compaction/gc tasks, removing redundant files from disk.
|
||||
// It's not good to pause GC/compaction because of those and we would rather skip such uploads.
|
||||
//
|
||||
// Yet absence of such files might also mean that the timeline metadata file was updated (GC moves the Lsn forward, for instance).
|
||||
@@ -202,20 +200,20 @@ mod tests {
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
layered_repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
storage_sync::{
|
||||
index::RelativePath,
|
||||
test_utils::{create_local_timeline, dummy_metadata},
|
||||
},
|
||||
tenant::harness::{TenantHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::{upload_index_part, *};
|
||||
|
||||
#[tokio::test]
|
||||
async fn regular_layer_upload() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("regular_layer_upload")?;
|
||||
let harness = TenantHarness::create("regular_layer_upload")?;
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let layer_files = ["a", "b"];
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
@@ -301,9 +299,9 @@ mod tests {
|
||||
// Currently, GC can run between upload retries, removing local layers scheduled for upload. Test this scenario.
|
||||
#[tokio::test]
|
||||
async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("layer_upload_after_local_fs_update")?;
|
||||
let harness = TenantHarness::create("layer_upload_after_local_fs_update")?;
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let layer_files = ["a1", "b1"];
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
@@ -396,8 +394,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_upload_index_part() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("test_upload_index_part")?;
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let harness = TenantHarness::create("test_upload_index_part")?;
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
tempdir()?.path().to_owned(),
|
||||
|
||||
463
pageserver/src/task_mgr.rs
Normal file
@@ -0,0 +1,463 @@
|
||||
//!
|
||||
//! This module provides centralized handling of tokio tasks in the Page Server.
|
||||
//!
|
||||
//! We provide a few basic facilities:
|
||||
//! - A global registry of tasks that lists what kind of tasks they are, and
|
||||
//! which tenant or timeline they are working on
|
||||
//!
|
||||
//! - The ability to request a task to shut down.
|
||||
//!
|
||||
//!
|
||||
//! # How it works?
|
||||
//!
|
||||
//! There is a global hashmap of all the tasks (`TASKS`). Whenever a new
|
||||
//! task is spawned, a PageServerTask entry is added there, and when a
|
||||
//! task dies, it removes itself from the hashmap. If you want to kill a
|
||||
//! task, you can scan the hashmap to find it.
|
||||
//!
|
||||
//! # Task shutdown
|
||||
//!
|
||||
//! To kill a task, we rely on co-operation from the victim. Each task is
|
||||
//! expected to periodically call the `is_shutdown_requested()` function, and
|
||||
//! if it returns true, exit gracefully. In addition to that, when waiting for
|
||||
//! the network or other long-running operation, you can use
|
||||
//! `shutdown_watcher()` function to get a Future that will become ready if
|
||||
//! the current task has been requested to shut down. You can use that with
|
||||
//! Tokio select!().
|
||||
//!
|
||||
//!
|
||||
//! TODO: This would be a good place to also handle panics in a somewhat sane way.
|
||||
//! Depending on what task panics, we might want to kill the whole server, or
|
||||
//! only a single tenant or timeline.
|
||||
//!
|
||||
|
||||
// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro.
|
||||
// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224.
|
||||
#![allow(clippy::declare_interior_mutable_const)]
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::future::Future;
|
||||
use std::panic::AssertUnwindSafe;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use futures::FutureExt;
|
||||
use tokio::runtime::Runtime;
|
||||
use tokio::sync::watch;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::task_local;
|
||||
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use crate::shutdown_pageserver;
|
||||
|
||||
//
|
||||
// There are four runtimes:
|
||||
//
|
||||
// Compute request runtime
|
||||
// - used to handle connections from compute nodes. Any tasks related to satisfying
|
||||
// GetPage requests, base backups, import, and other such compute node operations
|
||||
// are handled by the Compute request runtime
|
||||
// - page_service.rs
|
||||
// - this includes layer downloads from remote storage, if a layer is needed to
|
||||
// satisfy a GetPage request
|
||||
//
|
||||
// Management request runtime
|
||||
// - used to handle HTTP API requests
|
||||
//
|
||||
// WAL receiver runtime:
|
||||
// - used to handle WAL receiver connections.
|
||||
// - and to receiver updates from etcd
|
||||
//
|
||||
// Background runtime
|
||||
// - layer flushing
|
||||
// - garbage collection
|
||||
// - compaction
|
||||
// - remote storage uploads
|
||||
// - initial tenant loading
|
||||
//
|
||||
// Everything runs in a tokio task. If you spawn new tasks, spawn it using the correct
|
||||
// runtime.
|
||||
//
|
||||
// There might be situations when one task needs to wait for a task running in another
|
||||
// Runtime to finish. For example, if a background operation needs a layer from remote
|
||||
// storage, it will start to download it. If a background operation needs a remote layer,
|
||||
// and the download was already initiated by a GetPage request, the background task
|
||||
// will wait for the download - running in the Page server runtime - to finish.
|
||||
// Another example: the initial tenant loading tasks are launched in the background ops
|
||||
// runtime. If a GetPage request comes in before the load of a tenant has finished, the
|
||||
// GetPage request will wait for the tenant load to finish.
|
||||
//
|
||||
// The core Timeline code is synchronous, and uses a bunch of std Mutexes and RWLocks to
|
||||
// protect data structures. Let's keep it that way. Synchronous code is easier to debug
|
||||
// and analyze, and there's a lot of hairy, low-level, performance critical code there.
|
||||
//
|
||||
// It's nice to have different runtimes, so that you can quickly eyeball how much CPU
|
||||
// time each class of operations is taking, with 'top -H' or similar.
|
||||
//
|
||||
// It's also good to avoid hogging all threads that would be needed to process
|
||||
// other operations, if the upload tasks e.g. get blocked on locks. It shouldn't
|
||||
// happen, but still.
|
||||
//
|
||||
pub static COMPUTE_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("compute request worker")
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create compute request runtime")
|
||||
});
|
||||
|
||||
pub static MGMT_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("mgmt request worker")
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create mgmt request runtime")
|
||||
});
|
||||
|
||||
pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("walreceiver worker")
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create walreceiver runtime")
|
||||
});
|
||||
|
||||
pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("background op worker")
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create background op runtime")
|
||||
});
|
||||
|
||||
pub struct PageserverTaskId(u64);
|
||||
|
||||
/// Each task that we track is associated with a "task ID". It's just an
|
||||
/// increasing number that we assign. Note that it is different from tokio::task::Id.
|
||||
static NEXT_TASK_ID: Lazy<AtomicU64> = Lazy::new(|| AtomicU64::new(1));
|
||||
|
||||
/// Global registry of tasks
|
||||
static TASKS: Lazy<Mutex<HashMap<u64, Arc<PageServerTask>>>> =
|
||||
Lazy::new(|| Mutex::new(HashMap::new()));
|
||||
|
||||
task_local! {
|
||||
// There is a Tokio watch channel for each task, which can be used to signal the
|
||||
// task that it needs to shut down. This task local variable holds the receiving
|
||||
// end of the channel. The sender is kept in the global registry, so that anyone
|
||||
// can send the signal to request task shutdown.
|
||||
static SHUTDOWN_RX: watch::Receiver<bool>;
|
||||
|
||||
// Each task holds reference to its own PageServerTask here.
|
||||
static CURRENT_TASK: Arc<PageServerTask>;
|
||||
}
|
||||
|
||||
///
|
||||
/// There are many kinds of tasks in the system. Some are associated with a particular
|
||||
/// tenant or timeline, while others are global.
|
||||
///
|
||||
/// Note that we don't try to limit how many task of a certain kind can be running
|
||||
/// at the same time.
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub enum TaskKind {
|
||||
// libpq listener task. It just accepts connection and spawns a
|
||||
// PageRequestHandler task for each connection.
|
||||
LibpqEndpointListener,
|
||||
|
||||
// HTTP endpoint listener.
|
||||
HttpEndpointListener,
|
||||
|
||||
// Task that handles a single connection. A PageRequestHandler task
|
||||
// starts detached from any particular tenant or timeline, but it can be
|
||||
// associated with one later, after receiving a command from the client.
|
||||
PageRequestHandler,
|
||||
|
||||
// Manages the WAL receiver connection for one timeline. It subscribes to
|
||||
// events from etcd, decides which safekeeper to connect to. It spawns a
|
||||
// separate WalReceiverConnection task to handle each connection.
|
||||
WalReceiverManager,
|
||||
|
||||
// Handles a connection to a safekeeper, to stream WAL to a timeline.
|
||||
WalReceiverConnection,
|
||||
|
||||
// Garbage collection worker. One per tenant
|
||||
GarbageCollector,
|
||||
|
||||
// Compaction. One per tenant.
|
||||
Compaction,
|
||||
|
||||
// Initial logical size calculation
|
||||
InitialLogicalSizeCalculation,
|
||||
|
||||
// Task that flushes frozen in-memory layers to disk
|
||||
LayerFlushTask,
|
||||
|
||||
// Task that manages the remote upload queue
|
||||
StorageSync,
|
||||
|
||||
// task that handles the initial downloading of all tenants
|
||||
InitialLoad,
|
||||
|
||||
// task that handles attaching a tenant
|
||||
Attach,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct MutableTaskState {
|
||||
/// Tenant and timeline that this task is associated with.
|
||||
tenant_id: Option<TenantId>,
|
||||
timeline_id: Option<TimelineId>,
|
||||
|
||||
/// Handle for waiting for the task to exit. It can be None, if the
|
||||
/// the task has already exited.
|
||||
join_handle: Option<JoinHandle<()>>,
|
||||
}
|
||||
|
||||
struct PageServerTask {
|
||||
#[allow(dead_code)] // unused currently
|
||||
task_id: PageserverTaskId,
|
||||
|
||||
kind: TaskKind,
|
||||
|
||||
name: String,
|
||||
|
||||
// To request task shutdown, send 'true' to the channel to notify the task.
|
||||
shutdown_tx: watch::Sender<bool>,
|
||||
|
||||
mutable: Mutex<MutableTaskState>,
|
||||
}
|
||||
|
||||
/// Launch a new task
|
||||
/// Note: if shutdown_process_on_error is set to true failure
|
||||
/// of the task will lead to shutdown of entire process
|
||||
pub fn spawn<F>(
|
||||
runtime: &tokio::runtime::Handle,
|
||||
kind: TaskKind,
|
||||
tenant_id: Option<TenantId>,
|
||||
timeline_id: Option<TimelineId>,
|
||||
name: &str,
|
||||
shutdown_process_on_error: bool,
|
||||
future: F,
|
||||
) -> PageserverTaskId
|
||||
where
|
||||
F: Future<Output = anyhow::Result<()>> + Send + 'static,
|
||||
{
|
||||
let (shutdown_tx, shutdown_rx) = watch::channel(false);
|
||||
let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed);
|
||||
let task = Arc::new(PageServerTask {
|
||||
task_id: PageserverTaskId(task_id),
|
||||
kind,
|
||||
name: name.to_string(),
|
||||
shutdown_tx,
|
||||
mutable: Mutex::new(MutableTaskState {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
join_handle: None,
|
||||
}),
|
||||
});
|
||||
|
||||
TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));
|
||||
|
||||
let mut task_mut = task.mutable.lock().unwrap();
|
||||
|
||||
let task_name = name.to_string();
|
||||
let task_cloned = Arc::clone(&task);
|
||||
let join_handle = runtime.spawn(task_wrapper(
|
||||
task_name,
|
||||
task_id,
|
||||
task_cloned,
|
||||
shutdown_rx,
|
||||
shutdown_process_on_error,
|
||||
future,
|
||||
));
|
||||
task_mut.join_handle = Some(join_handle);
|
||||
drop(task_mut);
|
||||
|
||||
// The task is now running. Nothing more to do here
|
||||
PageserverTaskId(task_id)
|
||||
}
|
||||
|
||||
/// This wrapper function runs in a newly-spawned task. It initializes the
|
||||
/// task-local variables and calls the payload function.
|
||||
async fn task_wrapper<F>(
|
||||
task_name: String,
|
||||
task_id: u64,
|
||||
task: Arc<PageServerTask>,
|
||||
shutdown_rx: watch::Receiver<bool>,
|
||||
shutdown_process_on_error: bool,
|
||||
future: F,
|
||||
) where
|
||||
F: Future<Output = anyhow::Result<()>> + Send + 'static,
|
||||
{
|
||||
debug!("Starting task '{}'", task_name);
|
||||
|
||||
let result = SHUTDOWN_RX
|
||||
.scope(
|
||||
shutdown_rx,
|
||||
CURRENT_TASK.scope(task, {
|
||||
// We use AssertUnwindSafe here so that the payload function
|
||||
// doesn't need to be UnwindSafe. We don't do anything after the
|
||||
// unwinding that would expose us to unwind-unsafe behavior.
|
||||
AssertUnwindSafe(future).catch_unwind()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
task_finish(result, task_name, task_id, shutdown_process_on_error).await;
|
||||
}
|
||||
|
||||
async fn task_finish(
|
||||
result: std::result::Result<
|
||||
anyhow::Result<()>,
|
||||
std::boxed::Box<dyn std::any::Any + std::marker::Send>,
|
||||
>,
|
||||
task_name: String,
|
||||
task_id: u64,
|
||||
shutdown_process_on_error: bool,
|
||||
) {
|
||||
// Remove our entry from the global hashmap.
|
||||
let task = TASKS
|
||||
.lock()
|
||||
.unwrap()
|
||||
.remove(&task_id)
|
||||
.expect("no task in registry");
|
||||
|
||||
let mut shutdown_process = false;
|
||||
{
|
||||
let task_mut = task.mutable.lock().unwrap();
|
||||
|
||||
match result {
|
||||
Ok(Ok(())) => {
|
||||
debug!("Task '{}' exited normally", task_name);
|
||||
}
|
||||
Ok(Err(err)) => {
|
||||
if shutdown_process_on_error {
|
||||
error!(
|
||||
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
);
|
||||
shutdown_process = true;
|
||||
} else {
|
||||
error!(
|
||||
"Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
if shutdown_process_on_error {
|
||||
error!(
|
||||
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
);
|
||||
shutdown_process = true;
|
||||
} else {
|
||||
error!(
|
||||
"Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if shutdown_process {
|
||||
shutdown_pageserver(1).await;
|
||||
}
|
||||
}
|
||||
|
||||
// expected to be called from the task of the given id.
|
||||
pub fn associate_with(tenant_id: Option<TenantId>, timeline_id: Option<TimelineId>) {
|
||||
CURRENT_TASK.with(|ct| {
|
||||
let mut task_mut = ct.mutable.lock().unwrap();
|
||||
task_mut.tenant_id = tenant_id;
|
||||
task_mut.timeline_id = timeline_id;
|
||||
});
|
||||
}
|
||||
|
||||
/// Is there a task running that matches the criteria
|
||||
|
||||
/// Signal and wait for tasks to shut down.
|
||||
///
|
||||
///
|
||||
/// The arguments are used to select the tasks to kill. Any None arguments are
|
||||
/// ignored. For example, to shut down all WalReceiver tasks:
|
||||
///
|
||||
/// shutdown_tasks(Some(TaskKind::WalReceiver), None, None)
|
||||
///
|
||||
/// Or to shut down all tasks for given timeline:
|
||||
///
|
||||
/// shutdown_tasks(None, Some(tenant_id), Some(timeline_id))
|
||||
///
|
||||
pub async fn shutdown_tasks(
|
||||
kind: Option<TaskKind>,
|
||||
tenant_id: Option<TenantId>,
|
||||
timeline_id: Option<TimelineId>,
|
||||
) {
|
||||
let mut victim_tasks = Vec::new();
|
||||
|
||||
{
|
||||
let tasks = TASKS.lock().unwrap();
|
||||
for task in tasks.values() {
|
||||
let task_mut = task.mutable.lock().unwrap();
|
||||
if (kind.is_none() || Some(task.kind) == kind)
|
||||
&& (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
|
||||
&& (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
|
||||
{
|
||||
let _ = task.shutdown_tx.send_replace(true);
|
||||
victim_tasks.push(Arc::clone(task));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for task in victim_tasks {
|
||||
let join_handle = {
|
||||
let mut task_mut = task.mutable.lock().unwrap();
|
||||
info!("waiting for {} to shut down", task.name);
|
||||
let join_handle = task_mut.join_handle.take();
|
||||
drop(task_mut);
|
||||
join_handle
|
||||
};
|
||||
if let Some(join_handle) = join_handle {
|
||||
let _ = join_handle.await;
|
||||
} else {
|
||||
// Possibly one of:
|
||||
// * The task had not even fully started yet.
|
||||
// * It was shut down concurrently and already exited
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn current_task_kind() -> Option<TaskKind> {
|
||||
CURRENT_TASK.try_with(|ct| ct.kind).ok()
|
||||
}
|
||||
|
||||
/// A Future that can be used to check if the current task has been requested to
|
||||
/// shut down.
|
||||
pub async fn shutdown_watcher() {
|
||||
let mut shutdown_rx = SHUTDOWN_RX
|
||||
.try_with(|rx| rx.clone())
|
||||
.expect("shutdown_requested() called in an unexpected task or thread");
|
||||
|
||||
while !*shutdown_rx.borrow() {
|
||||
if shutdown_rx.changed().await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Has the current task been requested to shut down?
|
||||
pub fn is_shutdown_requested() -> bool {
|
||||
if let Ok(shutdown_rx) = SHUTDOWN_RX.try_with(|rx| rx.clone()) {
|
||||
*shutdown_rx.borrow()
|
||||
} else {
|
||||
if !cfg!(test) {
|
||||
warn!("is_shutdown_requested() called in an unexpected task or thread");
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
@@ -11,8 +11,8 @@
|
||||
//! len < 128: 0XXXXXXX
|
||||
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
|
||||
//!
|
||||
use crate::layered_repository::block_io::{BlockCursor, BlockReader};
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::tenant::block_io::{BlockCursor, BlockReader};
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
|
||||
@@ -60,7 +60,7 @@ where
|
||||
/// the underlying BlockReader. For example:
|
||||
///
|
||||
/// ```no_run
|
||||
/// # use pageserver::layered_repository::block_io::{BlockReader, FileBlockReader};
|
||||
/// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
|
||||
/// # let reader: FileBlockReader<std::fs::File> = todo!();
|
||||
/// let cursor = reader.block_cursor();
|
||||
/// let buf = cursor.read_blk(1);
|
||||
@@ -7,7 +7,7 @@
|
||||
//! must be page images or WAL records with the 'will_init' flag set, so that
|
||||
//! they can be replayed without referring to an older page version.
|
||||
//!
|
||||
//! The delta files are stored in timelines/<timelineid> directory. Currently,
|
||||
//! The delta files are stored in timelines/<timeline_id> directory. Currently,
|
||||
//! there are no subdirectories, and each delta file is named like this:
|
||||
//!
|
||||
//! <key start>-<key end>__<start LSN>-<end LSN
|
||||
@@ -24,17 +24,15 @@
|
||||
//! "values" part.
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::layered_repository::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
|
||||
use crate::layered_repository::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::page_cache::{PageReadGuard, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::walrecord;
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
@@ -50,8 +48,8 @@ use tracing::*;
|
||||
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
///
|
||||
@@ -62,12 +60,12 @@ use utils::{
|
||||
///
|
||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||
struct Summary {
|
||||
/// Magic value to identify this as a zenith delta file. Always DELTA_FILE_MAGIC.
|
||||
/// Magic value to identify this as a neon delta file. Always DELTA_FILE_MAGIC.
|
||||
magic: u16,
|
||||
format_version: u16,
|
||||
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
key_range: Range<Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
|
||||
@@ -83,8 +81,8 @@ impl From<&DeltaLayer> for Summary {
|
||||
magic: DELTA_FILE_MAGIC,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
|
||||
tenantid: layer.tenantid,
|
||||
timelineid: layer.timelineid,
|
||||
tenant_id: layer.tenant_id,
|
||||
timeline_id: layer.timeline_id,
|
||||
key_range: layer.key_range.clone(),
|
||||
lsn_range: layer.lsn_range.clone(),
|
||||
|
||||
@@ -175,8 +173,8 @@ impl DeltaKey {
|
||||
pub struct DeltaLayer {
|
||||
path_or_conf: PathOrConf,
|
||||
|
||||
pub tenantid: ZTenantId,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
|
||||
@@ -196,12 +194,12 @@ pub struct DeltaLayerInner {
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
self.timelineid
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
@@ -346,8 +344,8 @@ impl Layer for DeltaLayer {
|
||||
fn dump(&self, verbose: bool) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
|
||||
self.tenantid,
|
||||
self.timelineid,
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
@@ -421,22 +419,22 @@ impl Layer for DeltaLayer {
|
||||
impl DeltaLayer {
|
||||
fn path_for(
|
||||
path_or_conf: &PathOrConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
fname: &DeltaFileName,
|
||||
) -> PathBuf {
|
||||
match path_or_conf {
|
||||
PathOrConf::Path(path) => path.clone(),
|
||||
PathOrConf::Conf(conf) => conf
|
||||
.timeline_path(&timelineid, &tenantid)
|
||||
.timeline_path(&timeline_id, &tenant_id)
|
||||
.join(fname.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
fn temp_path_for(
|
||||
conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_start: Key,
|
||||
lsn_range: &Range<Lsn>,
|
||||
) -> PathBuf {
|
||||
@@ -446,12 +444,13 @@ impl DeltaLayer {
|
||||
.map(char::from)
|
||||
.collect();
|
||||
|
||||
conf.timeline_path(&timelineid, &tenantid).join(format!(
|
||||
"{}-XXX__{:016X}-{:016X}.{}.temp",
|
||||
conf.timeline_path(&timeline_id, &tenant_id).join(format!(
|
||||
"{}-XXX__{:016X}-{:016X}.{}.{}",
|
||||
key_start,
|
||||
u64::from(lsn_range.start),
|
||||
u64::from(lsn_range.end),
|
||||
rand_string
|
||||
rand_string,
|
||||
TEMP_FILE_SUFFIX,
|
||||
))
|
||||
}
|
||||
|
||||
@@ -536,14 +535,14 @@ impl DeltaLayer {
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
filename: &DeltaFileName,
|
||||
) -> DeltaLayer {
|
||||
DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_range: filename.key_range.clone(),
|
||||
lsn_range: filename.lsn_range.clone(),
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
@@ -569,8 +568,8 @@ impl DeltaLayer {
|
||||
|
||||
Ok(DeltaLayer {
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
timelineid: summary.timelineid,
|
||||
tenantid: summary.tenantid,
|
||||
timeline_id: summary.timeline_id,
|
||||
tenant_id: summary.tenant_id,
|
||||
key_range: summary.key_range,
|
||||
lsn_range: summary.lsn_range,
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
@@ -593,8 +592,8 @@ impl DeltaLayer {
|
||||
pub fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
@@ -614,8 +613,8 @@ impl DeltaLayer {
|
||||
pub struct DeltaLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
path: PathBuf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -631,8 +630,8 @@ impl DeltaLayerWriter {
|
||||
///
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
) -> Result<DeltaLayerWriter> {
|
||||
@@ -642,7 +641,7 @@ impl DeltaLayerWriter {
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = DeltaLayer::temp_path_for(conf, timelineid, tenantid, key_start, &lsn_range);
|
||||
let path = DeltaLayer::temp_path_for(conf, timeline_id, tenant_id, key_start, &lsn_range);
|
||||
|
||||
let mut file = VirtualFile::create(&path)?;
|
||||
// make room for the header block
|
||||
@@ -657,8 +656,8 @@ impl DeltaLayerWriter {
|
||||
Ok(DeltaLayerWriter {
|
||||
conf,
|
||||
path,
|
||||
timelineid,
|
||||
tenantid,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_start,
|
||||
lsn_range,
|
||||
tree: tree_builder,
|
||||
@@ -719,8 +718,8 @@ impl DeltaLayerWriter {
|
||||
let summary = Summary {
|
||||
magic: DELTA_FILE_MAGIC,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
tenantid: self.tenantid,
|
||||
timelineid: self.timelineid,
|
||||
tenant_id: self.tenant_id,
|
||||
timeline_id: self.timeline_id,
|
||||
key_range: self.key_start..key_end,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
index_start_blk,
|
||||
@@ -734,8 +733,8 @@ impl DeltaLayerWriter {
|
||||
// set inner.file here. The first read will have to re-open it.
|
||||
let layer = DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(self.conf),
|
||||
tenantid: self.tenantid,
|
||||
timelineid: self.timelineid,
|
||||
tenant_id: self.tenant_id,
|
||||
timeline_id: self.timeline_id,
|
||||
key_range: self.key_start..key_end,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
@@ -754,8 +753,8 @@ impl DeltaLayerWriter {
|
||||
// FIXME: throw an error instead?
|
||||
let final_path = DeltaLayer::path_for(
|
||||
&PathOrConf::Conf(self.conf),
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
&DeltaFileName {
|
||||
key_range: self.key_start..key_end,
|
||||
lsn_range: self.lsn_range,
|
||||
@@ -25,7 +25,7 @@ use std::{cmp::Ordering, io, result};
|
||||
use thiserror::Error;
|
||||
use tracing::error;
|
||||
|
||||
use crate::layered_repository::block_io::{BlockReader, BlockWriter};
|
||||
use crate::tenant::block_io::{BlockReader, BlockWriter};
|
||||
|
||||
// The maximum size of a value stored in the B-tree. 5 bytes is enough currently.
|
||||
pub const VALUE_SZ: usize = 5;
|
||||
@@ -2,11 +2,11 @@
|
||||
//! used to keep in-memory layers spilled on disk.
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::blob_io::BlobWriter;
|
||||
use crate::layered_repository::block_io::BlockReader;
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::page_cache::{ReadBufResult, WriteBufResult};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::cmp::min;
|
||||
@@ -17,7 +17,7 @@ use std::ops::DerefMut;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use tracing::*;
|
||||
use utils::zid::{ZTenantId, ZTimelineId};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use std::os::unix::fs::FileExt;
|
||||
|
||||
@@ -39,8 +39,8 @@ pub struct EphemeralFiles {
|
||||
|
||||
pub struct EphemeralFile {
|
||||
file_id: u64,
|
||||
_tenantid: ZTenantId,
|
||||
_timelineid: ZTimelineId,
|
||||
_tenant_id: TenantId,
|
||||
_timeline_id: TimelineId,
|
||||
file: Arc<VirtualFile>,
|
||||
|
||||
pub size: u64,
|
||||
@@ -49,15 +49,15 @@ pub struct EphemeralFile {
|
||||
impl EphemeralFile {
|
||||
pub fn create(
|
||||
conf: &PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<EphemeralFile, io::Error> {
|
||||
let mut l = EPHEMERAL_FILES.write().unwrap();
|
||||
let file_id = l.next_file_id;
|
||||
l.next_file_id += 1;
|
||||
|
||||
let filename = conf
|
||||
.timeline_path(&timelineid, &tenantid)
|
||||
.timeline_path(&timeline_id, &tenant_id)
|
||||
.join(PathBuf::from(format!("ephemeral-{}", file_id)));
|
||||
|
||||
let file = VirtualFile::open_with_options(
|
||||
@@ -69,8 +69,8 @@ impl EphemeralFile {
|
||||
|
||||
Ok(EphemeralFile {
|
||||
file_id,
|
||||
_tenantid: tenantid,
|
||||
_timelineid: timelineid,
|
||||
_tenant_id: tenant_id,
|
||||
_timeline_id: timeline_id,
|
||||
file: file_rc,
|
||||
size: 0,
|
||||
})
|
||||
@@ -330,15 +330,15 @@ fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::layered_repository::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::layered_repository::block_io::BlockCursor;
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::block_io::BlockCursor;
|
||||
use rand::{seq::SliceRandom, thread_rng, RngCore};
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
|
||||
fn repo_harness(
|
||||
fn harness(
|
||||
test_name: &str,
|
||||
) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), io::Error> {
|
||||
) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
@@ -346,11 +346,11 @@ mod tests {
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap();
|
||||
let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap();
|
||||
fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?;
|
||||
let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap();
|
||||
let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
|
||||
fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?;
|
||||
|
||||
Ok((conf, tenantid, timelineid))
|
||||
Ok((conf, tenant_id, timeline_id))
|
||||
}
|
||||
|
||||
// Helper function to slurp contents of a file, starting at the current position,
|
||||
@@ -368,9 +368,9 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_ephemeral_files() -> Result<(), io::Error> {
|
||||
let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?;
|
||||
let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?;
|
||||
|
||||
let file_a = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
|
||||
file_a.write_all_at(b"foo", 0)?;
|
||||
assert_eq!("foo", read_string(&file_a, 0, 20)?);
|
||||
@@ -381,7 +381,7 @@ mod tests {
|
||||
// Open a lot of files, enough to cause some page evictions.
|
||||
let mut efiles = Vec::new();
|
||||
for fileno in 0..100 {
|
||||
let efile = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?;
|
||||
assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?);
|
||||
efiles.push((fileno, efile));
|
||||
@@ -399,9 +399,9 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_ephemeral_blobs() -> Result<(), io::Error> {
|
||||
let (conf, tenantid, timelineid) = repo_harness("ephemeral_blobs")?;
|
||||
let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
|
||||
|
||||
let mut file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
|
||||
let pos_foo = file.write_blob(b"foo")?;
|
||||
assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
|
||||
@@ -4,7 +4,7 @@
|
||||
//! but does not exist in the layer, does not exist.
|
||||
//!
|
||||
//! An image layer is stored in a file on disk. The file is stored in
|
||||
//! timelines/<timelineid> directory. Currently, there are no
|
||||
//! timelines/<timeline_id> directory. Currently, there are no
|
||||
//! subdirectories, and each image layer file is named like this:
|
||||
//!
|
||||
//! <key start>-<key end>__<LSN>
|
||||
@@ -20,17 +20,15 @@
|
||||
//! mapping from Key to an offset in the "values" part. The
|
||||
//! actual page images are stored in the "values" part.
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::layered_repository::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::layered_repository::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::layered_repository::filename::{ImageFileName, PathOrConf};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::filename::{ImageFileName, PathOrConf};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use hex;
|
||||
@@ -46,8 +44,8 @@ use tracing::*;
|
||||
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
///
|
||||
@@ -58,12 +56,12 @@ use utils::{
|
||||
///
|
||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||
struct Summary {
|
||||
/// Magic value to identify this as a zenith image file. Always IMAGE_FILE_MAGIC.
|
||||
/// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
|
||||
magic: u16,
|
||||
format_version: u16,
|
||||
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
|
||||
@@ -79,8 +77,8 @@ impl From<&ImageLayer> for Summary {
|
||||
Self {
|
||||
magic: IMAGE_FILE_MAGIC,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
tenantid: layer.tenantid,
|
||||
timelineid: layer.timelineid,
|
||||
tenant_id: layer.tenant_id,
|
||||
timeline_id: layer.timeline_id,
|
||||
key_range: layer.key_range.clone(),
|
||||
lsn: layer.lsn,
|
||||
|
||||
@@ -99,8 +97,8 @@ impl From<&ImageLayer> for Summary {
|
||||
///
|
||||
pub struct ImageLayer {
|
||||
path_or_conf: PathOrConf,
|
||||
pub tenantid: ZTenantId,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub key_range: Range<Key>,
|
||||
|
||||
// This entry contains an image of all pages as of this LSN
|
||||
@@ -130,12 +128,12 @@ impl Layer for ImageLayer {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
self.timelineid
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
@@ -204,7 +202,7 @@ impl Layer for ImageLayer {
|
||||
fn dump(&self, verbose: bool) -> Result<()> {
|
||||
println!(
|
||||
"----- image layer for ten {} tli {} key {}-{} at {} ----",
|
||||
self.tenantid, self.timelineid, self.key_range.start, self.key_range.end, self.lsn
|
||||
self.tenant_id, self.timeline_id, self.key_range.start, self.key_range.end, self.lsn
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
@@ -230,22 +228,22 @@ impl Layer for ImageLayer {
|
||||
impl ImageLayer {
|
||||
fn path_for(
|
||||
path_or_conf: &PathOrConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
fname: &ImageFileName,
|
||||
) -> PathBuf {
|
||||
match path_or_conf {
|
||||
PathOrConf::Path(path) => path.to_path_buf(),
|
||||
PathOrConf::Conf(conf) => conf
|
||||
.timeline_path(&timelineid, &tenantid)
|
||||
.timeline_path(&timeline_id, &tenant_id)
|
||||
.join(fname.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
fn temp_path_for(
|
||||
conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
fname: &ImageFileName,
|
||||
) -> PathBuf {
|
||||
let rand_string: String = rand::thread_rng()
|
||||
@@ -254,8 +252,8 @@ impl ImageLayer {
|
||||
.map(char::from)
|
||||
.collect();
|
||||
|
||||
conf.timeline_path(&timelineid, &tenantid)
|
||||
.join(format!("{}.{}.temp", fname, rand_string))
|
||||
conf.timeline_path(&timeline_id, &tenant_id)
|
||||
.join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
|
||||
}
|
||||
|
||||
///
|
||||
@@ -338,14 +336,14 @@ impl ImageLayer {
|
||||
/// Create an ImageLayer struct representing an existing file on disk
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
filename: &ImageFileName,
|
||||
) -> ImageLayer {
|
||||
ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_range: filename.key_range.clone(),
|
||||
lsn: filename.lsn,
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
@@ -371,8 +369,8 @@ impl ImageLayer {
|
||||
|
||||
Ok(ImageLayer {
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
timelineid: summary.timelineid,
|
||||
tenantid: summary.tenantid,
|
||||
timeline_id: summary.timeline_id,
|
||||
tenant_id: summary.tenant_id,
|
||||
key_range: summary.key_range,
|
||||
lsn: summary.lsn,
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
@@ -395,8 +393,8 @@ impl ImageLayer {
|
||||
pub fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
@@ -416,8 +414,8 @@ impl ImageLayer {
|
||||
pub struct ImageLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
path: PathBuf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
|
||||
@@ -428,8 +426,8 @@ pub struct ImageLayerWriter {
|
||||
impl ImageLayerWriter {
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
@@ -437,8 +435,8 @@ impl ImageLayerWriter {
|
||||
// We'll atomically rename it to the final name when we're done.
|
||||
let path = ImageLayer::temp_path_for(
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
&ImageFileName {
|
||||
key_range: key_range.clone(),
|
||||
lsn,
|
||||
@@ -460,8 +458,8 @@ impl ImageLayerWriter {
|
||||
let writer = ImageLayerWriter {
|
||||
conf,
|
||||
path,
|
||||
timelineid,
|
||||
tenantid,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_range: key_range.clone(),
|
||||
lsn,
|
||||
tree: tree_builder,
|
||||
@@ -504,8 +502,8 @@ impl ImageLayerWriter {
|
||||
let summary = Summary {
|
||||
magic: IMAGE_FILE_MAGIC,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
tenantid: self.tenantid,
|
||||
timelineid: self.timelineid,
|
||||
tenant_id: self.tenant_id,
|
||||
timeline_id: self.timeline_id,
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
index_start_blk,
|
||||
@@ -519,8 +517,8 @@ impl ImageLayerWriter {
|
||||
// set inner.file here. The first read will have to re-open it.
|
||||
let layer = ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(self.conf),
|
||||
timelineid: self.timelineid,
|
||||
tenantid: self.tenantid,
|
||||
timeline_id: self.timeline_id,
|
||||
tenant_id: self.tenant_id,
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
@@ -540,8 +538,8 @@ impl ImageLayerWriter {
|
||||
// FIXME: throw an error instead?
|
||||
let final_path = ImageLayer::path_for(
|
||||
&PathOrConf::Conf(self.conf),
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
&ImageFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
@@ -5,14 +5,12 @@
|
||||
//! its position in the file, is kept in memory, though.
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::layered_repository::block_io::BlockReader;
|
||||
use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
use crate::layered_repository::ephemeral_file::EphemeralFile;
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::walrecord;
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use std::cell::RefCell;
|
||||
@@ -20,9 +18,9 @@ use std::collections::HashMap;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
vec_map::VecMap,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
@@ -39,8 +37,8 @@ thread_local! {
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
|
||||
///
|
||||
/// This layer contains all the changes from 'start_lsn'. The
|
||||
@@ -96,12 +94,12 @@ impl Layer for InMemoryLayer {
|
||||
None
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
self.timelineid
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
@@ -199,7 +197,7 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
println!(
|
||||
"----- in-memory layer for tli {} LSNs {}-{} ----",
|
||||
self.timelineid, self.start_lsn, end_str,
|
||||
self.timeline_id, self.start_lsn, end_str,
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
@@ -253,22 +251,18 @@ impl InMemoryLayer {
|
||||
///
|
||||
pub fn create(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
start_lsn: Lsn,
|
||||
) -> Result<InMemoryLayer> {
|
||||
trace!(
|
||||
"initializing new empty InMemoryLayer for writing on timeline {} at {}",
|
||||
timelineid,
|
||||
start_lsn
|
||||
);
|
||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||
|
||||
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
let file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
start_lsn,
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
@@ -283,7 +277,7 @@ impl InMemoryLayer {
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timelineid, lsn);
|
||||
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
inner.assert_writeable();
|
||||
|
||||
@@ -346,8 +340,8 @@ impl InMemoryLayer {
|
||||
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
Key::MIN,
|
||||
self.start_lsn..inner.end_lsn.unwrap(),
|
||||
)?;
|
||||
@@ -2,7 +2,7 @@
|
||||
//! The layer map tracks what layers exist in a timeline.
|
||||
//!
|
||||
//! When the timeline is first accessed, the server lists of all layer files
|
||||
//! in the timelines/<timelineid> directory, and populates this map with
|
||||
//! in the timelines/<timeline_id> directory, and populates this map with
|
||||
//! ImageLayer and DeltaLayer structs corresponding to each file. When the first
|
||||
//! new WAL record is received, we create an InMemoryLayer to hold the incoming
|
||||
//! records. Now and then, in the checkpoint() function, the in-memory layer is
|
||||
@@ -10,11 +10,11 @@
|
||||
//! corresponding files are written to disk.
|
||||
//!
|
||||
|
||||
use crate::layered_repository::inmemory_layer::InMemoryLayer;
|
||||
use crate::layered_repository::storage_layer::Layer;
|
||||
use crate::layered_repository::storage_layer::{range_eq, range_overlaps};
|
||||
use crate::metrics::NUM_ONDISK_LAYERS;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::inmemory_layer::InMemoryLayer;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use crate::tenant::storage_layer::{range_eq, range_overlaps};
|
||||
use anyhow::Result;
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
@@ -1,4 +1,4 @@
|
||||
//! Every image of a certain timeline from [`crate::layered_repository::Repository`]
|
||||
//! Every image of a certain timeline from [`crate::tenant::Tenant`]
|
||||
//! has a metadata that needs to be stored persistently.
|
||||
//!
|
||||
//! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of
|
||||
@@ -15,8 +15,8 @@ use serde::{Deserialize, Serialize};
|
||||
use tracing::info_span;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
@@ -63,7 +63,7 @@ struct TimelineMetadataBody {
|
||||
// doing a clean shutdown, so that there is no more WAL beyond
|
||||
// 'disk_consistent_lsn'
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_timeline: Option<TimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
@@ -73,7 +73,7 @@ impl TimelineMetadata {
|
||||
pub fn new(
|
||||
disk_consistent_lsn: Lsn,
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_timeline: Option<TimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
@@ -149,7 +149,7 @@ impl TimelineMetadata {
|
||||
self.body.prev_record_lsn
|
||||
}
|
||||
|
||||
pub fn ancestor_timeline(&self) -> Option<ZTimelineId> {
|
||||
pub fn ancestor_timeline(&self) -> Option<TimelineId> {
|
||||
self.body.ancestor_timeline
|
||||
}
|
||||
|
||||
@@ -170,23 +170,23 @@ impl TimelineMetadata {
|
||||
/// where certain timeline's metadata file should be located.
|
||||
pub fn metadata_path(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
) -> PathBuf {
|
||||
conf.timeline_path(&timelineid, &tenantid)
|
||||
conf.timeline_path(&timeline_id, &tenant_id)
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Save timeline metadata to file
|
||||
pub fn save_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
data: &TimelineMetadata,
|
||||
first_save: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let _enter = info_span!("saving metadata").entered();
|
||||
let path = metadata_path(conf, timelineid, tenantid);
|
||||
let path = metadata_path(conf, timeline_id, tenant_id);
|
||||
// use OpenOptions to ensure file presence is consistent with first_save
|
||||
let mut file = VirtualFile::open_with_options(
|
||||
&path,
|
||||
@@ -216,7 +216,7 @@ pub fn save_metadata(
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::layered_repository::repo_harness::TIMELINE_ID;
|
||||
use crate::tenant::harness::TIMELINE_ID;
|
||||
|
||||
#[test]
|
||||
fn metadata_serializes_correctly() {
|
||||
@@ -3,15 +3,15 @@
|
||||
//!
|
||||
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::walrecord::ZenithWalRecord;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
@@ -50,7 +50,7 @@ where
|
||||
///
|
||||
#[derive(Debug)]
|
||||
pub struct ValueReconstructState {
|
||||
pub records: Vec<(Lsn, ZenithWalRecord)>,
|
||||
pub records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pub img: Option<(Lsn, Bytes)>,
|
||||
}
|
||||
|
||||
@@ -84,10 +84,10 @@ pub enum ValueReconstructResult {
|
||||
/// LSN
|
||||
///
|
||||
pub trait Layer: Send + Sync {
|
||||
fn get_tenant_id(&self) -> ZTenantId;
|
||||
fn get_tenant_id(&self) -> TenantId;
|
||||
|
||||
/// Identify the timeline this layer belongs to
|
||||
fn get_timeline_id(&self) -> ZTimelineId;
|
||||
fn get_timeline_id(&self) -> TimelineId;
|
||||
|
||||
/// Range of keys that this layer covers
|
||||
fn get_key_range(&self) -> Range<Key>;
|
||||
@@ -5,18 +5,19 @@ use bytes::Bytes;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::OnceCell;
|
||||
use tokio::task::spawn_blocking;
|
||||
use tracing::*;
|
||||
|
||||
use std::cmp::{max, min, Ordering};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fs;
|
||||
use std::ops::{Deref, Range};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering};
|
||||
use std::sync::{mpsc, Arc, Mutex, MutexGuard, RwLock, TryLockError};
|
||||
use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
use std::{fs, thread};
|
||||
|
||||
use crate::layered_repository::{
|
||||
use crate::tenant::{
|
||||
delta_layer::{DeltaLayer, DeltaLayerWriter},
|
||||
ephemeral_file::is_ephemeral_file,
|
||||
filename::{DeltaFileName, ImageFileName},
|
||||
@@ -38,16 +39,17 @@ use crate::tenant_config::TenantConfOpt;
|
||||
|
||||
use postgres_ffi::v14::xlog_utils::to_pg_timestamp;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::{AtomicLsn, Lsn, RecordLsn},
|
||||
seqwait::SeqWait,
|
||||
simple_rcu::{Rcu, RcuReadGuard},
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::repository::GcResult;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::thread_mgr;
|
||||
use crate::walreceiver::IS_WAL_RECEIVER;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task};
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{page_cache, storage_sync};
|
||||
@@ -56,8 +58,8 @@ pub struct Timeline {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
||||
|
||||
tenant_id: ZTenantId,
|
||||
pub timeline_id: ZTimelineId,
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
|
||||
pub layers: RwLock<LayerMap>,
|
||||
|
||||
@@ -110,13 +112,13 @@ pub struct Timeline {
|
||||
/// to avoid deadlock.
|
||||
write_lock: Mutex<()>,
|
||||
|
||||
/// Used to ensure that there is only one thread
|
||||
/// Used to ensure that there is only task performing flushing at a time
|
||||
layer_flush_lock: Mutex<()>,
|
||||
|
||||
/// Layer removal lock.
|
||||
/// A lock to ensure that no layer of the timeline is removed concurrently by other threads.
|
||||
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
|
||||
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
|
||||
/// and [`Repository::delete_timeline`].
|
||||
/// and [`Tenant::delete_timeline`].
|
||||
layer_removal_cs: Mutex<()>,
|
||||
|
||||
// Needed to ensure that we can't create a branch at a point that was already garbage collected
|
||||
@@ -142,10 +144,7 @@ pub struct Timeline {
|
||||
|
||||
/// Current logical size of the "datadir", at the last LSN.
|
||||
current_logical_size: LogicalSize,
|
||||
// TODO task management should be done outside timeline, managed along with other tasks.
|
||||
#[allow(clippy::type_complexity)]
|
||||
initial_size_computation_task:
|
||||
Mutex<Option<(thread::JoinHandle<anyhow::Result<()>>, mpsc::Receiver<()>)>>,
|
||||
initial_size_computation_started: AtomicBool,
|
||||
|
||||
/// Information about the last processed message by the WAL receiver,
|
||||
/// or None if WAL receiver has not received anything for this timeline
|
||||
@@ -313,7 +312,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Get the ancestor's timeline id
|
||||
pub fn get_ancestor_timeline_id(&self) -> Option<ZTimelineId> {
|
||||
pub fn get_ancestor_timeline_id(&self) -> Option<TimelineId> {
|
||||
self.ancestor_timeline
|
||||
.as_ref()
|
||||
.map(|ancestor| ancestor.timeline_id)
|
||||
@@ -413,23 +412,23 @@ impl Timeline {
|
||||
/// You should call this before any of the other get_* or list_* functions. Calling
|
||||
/// those functions with an LSN that has been processed yet is an error.
|
||||
///
|
||||
pub fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
// This should never be called from the WAL receiver thread, because that could lead
|
||||
pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
// This should never be called from the WAL receiver, because that could lead
|
||||
// to a deadlock.
|
||||
ensure!(
|
||||
!IS_WAL_RECEIVER.with(|c| c.get()),
|
||||
"wait_lsn called by WAL receiver thread"
|
||||
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection),
|
||||
"wait_lsn cannot be called in WAL receiver"
|
||||
);
|
||||
|
||||
self.metrics.wait_lsn_time_histo.observe_closure_duration(
|
||||
|| self.last_record_lsn
|
||||
.wait_for_timeout(lsn, self.conf.wait_lsn_timeout)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}",
|
||||
lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn()
|
||||
)
|
||||
}))?;
|
||||
let _timer = self.metrics.wait_lsn_time_histo.start_timer();
|
||||
|
||||
self.last_record_lsn.wait_for_timeout(lsn, self.conf.wait_lsn_timeout).await
|
||||
.with_context(||
|
||||
format!(
|
||||
"Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}",
|
||||
lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn()
|
||||
)
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -532,8 +531,8 @@ impl Timeline {
|
||||
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
||||
metadata: TimelineMetadata,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
timeline_id: ZTimelineId,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
upload_layers: bool,
|
||||
) -> Timeline {
|
||||
@@ -587,7 +586,7 @@ impl Timeline {
|
||||
// initial logical size is 0.
|
||||
LogicalSize::empty_initial()
|
||||
},
|
||||
initial_size_computation_task: Mutex::new(None),
|
||||
initial_size_computation_started: AtomicBool::new(false),
|
||||
partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))),
|
||||
repartition_threshold: 0,
|
||||
|
||||
@@ -598,6 +597,43 @@ impl Timeline {
|
||||
result
|
||||
}
|
||||
|
||||
pub fn launch_wal_receiver(self: &Arc<Self>) -> anyhow::Result<()> {
|
||||
if !is_etcd_client_initialized() {
|
||||
if cfg!(test) {
|
||||
info!("not launching WAL receiver because etcd client hasn't been initialized");
|
||||
return Ok(());
|
||||
} else {
|
||||
panic!("etcd client not initialized");
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"launching WAL receiver for timeline {} of tenant {}",
|
||||
self.timeline_id, self.tenant_id
|
||||
);
|
||||
let tenant_conf_guard = self.tenant_conf.read().unwrap();
|
||||
let lagging_wal_timeout = tenant_conf_guard
|
||||
.lagging_wal_timeout
|
||||
.unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
|
||||
let walreceiver_connect_timeout = tenant_conf_guard
|
||||
.walreceiver_connect_timeout
|
||||
.unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
|
||||
let max_lsn_wal_lag = tenant_conf_guard
|
||||
.max_lsn_wal_lag
|
||||
.unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
|
||||
drop(tenant_conf_guard);
|
||||
let self_clone = Arc::clone(self);
|
||||
let _ = spawn_connection_manager_task(
|
||||
self.conf.broker_etcd_prefix.clone(),
|
||||
self_clone,
|
||||
walreceiver_connect_timeout,
|
||||
lagging_wal_timeout,
|
||||
max_lsn_wal_lag,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Scan the timeline directory to populate the layer map.
|
||||
/// Returns all timeline-related files that were found and loaded.
|
||||
@@ -715,61 +751,34 @@ impl Timeline {
|
||||
fn try_spawn_size_init_task(self: &Arc<Self>, init_lsn: Lsn) {
|
||||
let timeline_id = self.timeline_id;
|
||||
|
||||
let mut task_guard = match self.initial_size_computation_task.try_lock() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => {
|
||||
debug!("Skipping timeline logical size init: task lock is taken already");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
if let Some((old_task, task_finish_signal)) = task_guard.take() {
|
||||
// TODO rust 1.61 would allow to remove `task_finish_signal` entirely and call `old_task.is_finished()` instead
|
||||
match task_finish_signal.try_recv() {
|
||||
// task has either signaled successfully that it finished or panicked and dropped the sender part without signalling
|
||||
Ok(()) | Err(mpsc::TryRecvError::Disconnected) => {
|
||||
match old_task.join() {
|
||||
// we're here due to OnceCell::get not returning the value
|
||||
Ok(Ok(())) => {
|
||||
error!("Timeline {timeline_id} size init task finished, yet the size was not updated, rescheduling the computation")
|
||||
}
|
||||
Ok(Err(task_error)) => {
|
||||
error!("Error during timeline {timeline_id} size init: {task_error:?}")
|
||||
}
|
||||
Err(e) => error!("Timeline {timeline_id} size init task panicked: {e:?}"),
|
||||
}
|
||||
}
|
||||
// task had not yet finished: no signal was sent and the sender channel is not dropped
|
||||
Err(mpsc::TryRecvError::Empty) => {
|
||||
// let the task finish
|
||||
*task_guard = Some((old_task, task_finish_signal));
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if task_guard.is_none() {
|
||||
let thread_timeline = Arc::clone(self);
|
||||
let (finish_sender, finish_receiver) = mpsc::channel();
|
||||
|
||||
match thread::Builder::new()
|
||||
.name(format!(
|
||||
"Timeline {timeline_id} initial logical size calculation"
|
||||
))
|
||||
.spawn(move || {
|
||||
let _enter = info_span!("initial_logical_size_calculation", timeline = %timeline_id).entered();
|
||||
let calculated_size = thread_timeline.calculate_logical_size(init_lsn)?;
|
||||
match thread_timeline.current_logical_size.initial_logical_size.set(calculated_size) {
|
||||
// Atomically check if the timeline size calculation had already started.
|
||||
// If the flag was not already set, this sets it.
|
||||
if !self
|
||||
.initial_size_computation_started
|
||||
.swap(true, AtomicOrdering::SeqCst)
|
||||
{
|
||||
// We need to start the computation task.
|
||||
let self_clone = Arc::clone(self);
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::InitialLogicalSizeCalculation,
|
||||
Some(self.tenant_id),
|
||||
Some(self.timeline_id),
|
||||
"initial size calculation",
|
||||
false,
|
||||
async move {
|
||||
let calculated_size = self_clone.calculate_logical_size(init_lsn)?;
|
||||
let result = spawn_blocking(move || {
|
||||
self_clone.current_logical_size.initial_logical_size.set(calculated_size)
|
||||
}).await?;
|
||||
match result {
|
||||
Ok(()) => info!("Successfully calculated initial logical size"),
|
||||
Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"),
|
||||
}
|
||||
|
||||
finish_sender.send(()).ok();
|
||||
Ok(())
|
||||
}) {
|
||||
Ok(guard) => *task_guard = Some((guard, finish_receiver)),
|
||||
Err(e) => error!("Failed to spawn timeline {timeline_id} size init task: {e}"),
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("initial_logical_size_calculation", timeline = %timeline_id))
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1099,22 +1108,23 @@ impl Timeline {
|
||||
self.last_freeze_at.store(last_lsn);
|
||||
*(self.last_freeze_ts.write().unwrap()) = Instant::now();
|
||||
|
||||
// Launch a thread to flush the frozen layer to disk, unless
|
||||
// a thread was already running. (If the thread was running
|
||||
// Launch a task to flush the frozen layer to disk, unless
|
||||
// a task was already running. (If the task was running
|
||||
// at the time that we froze the layer, it must've seen the
|
||||
// the layer we just froze before it exited; see comments
|
||||
// in flush_frozen_layers())
|
||||
if let Ok(guard) = self.layer_flush_lock.try_lock() {
|
||||
drop(guard);
|
||||
let self_clone = Arc::clone(self);
|
||||
thread_mgr::spawn(
|
||||
thread_mgr::ThreadKind::LayerFlushThread,
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::LayerFlushTask,
|
||||
Some(self.tenant_id),
|
||||
Some(self.timeline_id),
|
||||
"layer flush thread",
|
||||
"layer flush task",
|
||||
false,
|
||||
move || self_clone.flush_frozen_layers(false),
|
||||
)?;
|
||||
async move { self_clone.flush_frozen_layers(false) },
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1123,8 +1133,8 @@ impl Timeline {
|
||||
|
||||
/// Flush all frozen layers to disk.
|
||||
///
|
||||
/// Only one thread at a time can be doing layer-flushing for a
|
||||
/// given timeline. If 'wait' is true, and another thread is
|
||||
/// Only one task at a time can be doing layer-flushing for a
|
||||
/// given timeline. If 'wait' is true, and another task is
|
||||
/// currently doing the flushing, this function will wait for it
|
||||
/// to finish. If 'wait' is false, this function will return
|
||||
/// immediately instead.
|
||||
@@ -1240,7 +1250,7 @@ impl Timeline {
|
||||
None
|
||||
};
|
||||
|
||||
let ancestor_timelineid = self
|
||||
let ancestor_timeline_id = self
|
||||
.ancestor_timeline
|
||||
.as_ref()
|
||||
.map(|ancestor| ancestor.timeline_id);
|
||||
@@ -1248,7 +1258,7 @@ impl Timeline {
|
||||
let metadata = TimelineMetadata::new(
|
||||
disk_consistent_lsn,
|
||||
ondisk_prev_record_lsn,
|
||||
ancestor_timelineid,
|
||||
ancestor_timeline_id,
|
||||
self.ancestor_lsn,
|
||||
*self.latest_gc_cutoff_lsn.read(),
|
||||
self.initdb_lsn,
|
||||
@@ -13,7 +13,7 @@ use serde::{Deserialize, Serialize};
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
use utils::zid::ZTenantId;
|
||||
use utils::id::TenantId;
|
||||
|
||||
pub const TENANT_CONFIG_NAME: &str = "config";
|
||||
|
||||
@@ -226,8 +226,8 @@ impl TenantConf {
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain tenant's tenantconf file should be located.
|
||||
pub fn path(conf: &'static PageServerConf, tenantid: ZTenantId) -> PathBuf {
|
||||
conf.tenant_path(&tenantid).join(TENANT_CONFIG_NAME)
|
||||
pub fn path(conf: &'static PageServerConf, tenant_id: TenantId) -> PathBuf {
|
||||
conf.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||