Compare commits


1 Commit

Author: Christian Schwarz
SHA1: b144f22be4
Message: [DNM] matrix run test suite with legacy & tiered compaction
Date: 2024-05-28 16:29:05 +00:00
112 changed files with 1483 additions and 3511 deletions

View File

@@ -24,7 +24,7 @@ jobs:
actionlint:
needs: [ check-permissions ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: reviewdog/action-actionlint@v1
@@ -36,15 +36,3 @@ jobs:
fail_on_error: true
filter_mode: nofilter
level: error
- run: |
PAT='^\s*runs-on:.*-latest'
if grep -ERq $PAT .github/workflows
then
grep -ERl $PAT .github/workflows |\
while read -r f
do
l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead."
done
exit 1
fi

View File

@@ -44,7 +44,7 @@ jobs:
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
@@ -60,7 +60,7 @@ jobs:
github.event.action == 'labeled' &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
@@ -109,7 +109,7 @@ jobs:
github.event.action == 'closed' &&
github.event.pull_request.head.repo.full_name != github.repository
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch

View File

@@ -38,11 +38,6 @@ on:
description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch'
required: false
default: false
run_only_pgvector_tests:
type: boolean
description: 'Run pgvector tests but no other tests. If not set, all tests including pgvector tests will be run'
required: false
default: false
defaults:
run:
@@ -55,7 +50,6 @@ concurrency:
jobs:
bench:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -126,7 +120,6 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
generate-matrices:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
#
# Available platforms:
@@ -137,7 +130,7 @@ jobs:
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
env:
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
outputs:
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
@@ -204,7 +197,6 @@ jobs:
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
pgbench-compare:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
needs: [ generate-matrices ]
strategy:
@@ -351,92 +343,6 @@ jobs:
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
pgbench-pgvector:
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
TEST_PG_BENCH_SCALES_MATRIX: "1"
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 16
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: "neon-captest-pgvector"
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init
steps:
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Add Postgres binaries to PATH
run: |
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Set up Connection String
id: set-up-connstr
run: |
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: Benchmark pgvector hnsw indexing
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_perf_olap.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: Benchmark pgvector hnsw queries
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Create Allure report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
clickbench-compare:
# ClickBench DB for rds-aurora and rds-Postgres deployed to the same clusters
# we use for performance testing in pgbench-compare.
@@ -445,7 +351,7 @@ jobs:
#
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
if: ${{ !cancelled() }}
needs: [ generate-matrices, pgbench-compare ]
strategy:
@@ -549,7 +455,7 @@ jobs:
# We might change it after https://github.com/neondatabase/neon/issues/2900.
#
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
if: ${{ !cancelled() }}
needs: [ generate-matrices, clickbench-compare ]
strategy:
@@ -651,7 +557,7 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
user-examples-compare:
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
if: ${{ !cancelled() }}
needs: [ generate-matrices, tpch-compare ]
strategy:

View File

@@ -88,7 +88,7 @@ jobs:
merge-images:
needs: [ build-image ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
env:
IMAGE_TAG: ${{ inputs.image-tag }}

View File

@@ -35,7 +35,7 @@ jobs:
cancel-previous-e2e-tests:
needs: [ check-permissions ]
if: github.event_name == 'pull_request'
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Cancel previous e2e-tests runs for this PR
@@ -432,8 +432,9 @@ jobs:
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
pg_version: [ v14, v15, v16 ]
build_type: [ debug ]
pg_version: [ v15 ]
pageserver_compaction_algorithm_kind: [ "legacy", "tiered" ]
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -461,6 +462,9 @@ jobs:
PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
PAGESERVER_VALIDATE_VEC_GET: true
PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM: 'kind="${{ matrix.pageserver_compaction_algorithm_kind }}"'
# catch the tests that override `tenant_config` as a whole without specifying the compaction algorithm `kind`
NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM: true
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
@@ -549,7 +553,7 @@ jobs:
report-benchmarks-failures:
needs: [ benchmarks, create-test-report ]
if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- uses: slackapi/slack-github-action@v1
@@ -774,7 +778,7 @@ jobs:
neon-image:
needs: [ neon-image-arch, tag ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- uses: docker/login-action@v3
@@ -884,7 +888,7 @@ jobs:
compute-node-image:
needs: [ compute-node-image-arch, tag ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
strategy:
matrix:
@@ -1032,7 +1036,7 @@ jobs:
promote-images:
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
env:
VERSIONS: v14 v15 v16
@@ -1077,7 +1081,7 @@ jobs:
trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, tag ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Set PR's status to pending and request a remote CI test
run: |

View File

@@ -19,7 +19,7 @@ permissions: {}
jobs:
check-image:
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
outputs:
tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
found: ${{ steps.check-image.outputs.found }}

View File

@@ -16,7 +16,7 @@ permissions: {}
jobs:
check-permissions:
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Disallow CI runs on PRs from forks
if: |

View File

@@ -9,7 +9,7 @@ on:
jobs:
cleanup:
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Cleanup
run: |

View File

@@ -20,7 +20,7 @@ concurrency:
jobs:
test-postgres-client-libs:
# TODO: switch to gen2 runner, requires docker
runs-on: ubuntu-22.04
runs-on: [ ubuntu-latest ]
env:
DEFAULT_PG_VERSION: 14

View File

@@ -26,7 +26,7 @@ permissions: {}
jobs:
tag-image:
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
env:
FROM_TAG: ${{ inputs.from-tag }}

View File

@@ -19,7 +19,7 @@ on:
jobs:
notify:
runs-on: ubuntu-22.04
runs-on: [ ubuntu-latest ]
steps:
- uses: neondatabase/dev-actions/release-pr-notify@main

View File

@@ -26,7 +26,7 @@ defaults:
jobs:
create-storage-release-branch:
if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
permissions:
contents: write # for `git push`
@@ -65,7 +65,7 @@ jobs:
create-proxy-release-branch:
if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
permissions:
contents: write # for `git push`

View File

@@ -19,7 +19,7 @@ env:
jobs:
cancel-previous-e2e-tests:
if: github.event_name == 'pull_request'
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Cancel previous e2e-tests runs for this PR
@@ -31,7 +31,7 @@ jobs:
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
tag:
runs-on: ubuntu-22.04
runs-on: [ ubuntu-latest ]
outputs:
build-tag: ${{ steps.build-tag.outputs.tag }}
@@ -62,7 +62,7 @@ jobs:
trigger-e2e-tests:
needs: [ tag ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
env:
TAG: ${{ needs.tag.outputs.build-tag }}
steps:

Cargo.lock (generated, 168 changed lines)
View File

@@ -776,6 +776,7 @@ dependencies = [
"pin-project",
"serde",
"time",
"tz-rs",
"url",
"uuid",
]
@@ -1290,6 +1291,12 @@ dependencies = [
"tiny-keccak",
]
[[package]]
name = "const_fn"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fbdcdcb6d86f71c5e97409ad45898af11cbc995b4ee8112d59095a28d376c935"
[[package]]
name = "const_format"
version = "0.2.30"
@@ -1969,6 +1976,21 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "foreign-types"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
dependencies = [
"foreign-types-shared",
]
[[package]]
name = "foreign-types-shared"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
[[package]]
name = "form_urlencoded"
version = "1.1.0"
@@ -2598,6 +2620,19 @@ dependencies = [
"tokio-io-timeout",
]
[[package]]
name = "hyper-tls"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
dependencies = [
"bytes",
"hyper 0.14.26",
"native-tls",
"tokio",
"tokio-native-tls",
]
[[package]]
name = "hyper-util"
version = "0.1.3"
@@ -2915,12 +2950,6 @@ version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
[[package]]
name = "linux-raw-sys"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4"
[[package]]
name = "lock_api"
version = "0.4.10"
@@ -3139,6 +3168,24 @@ version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
[[package]]
name = "native-tls"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
dependencies = [
"lazy_static",
"libc",
"log",
"openssl",
"openssl-probe",
"openssl-sys",
"schannel",
"security-framework",
"security-framework-sys",
"tempfile",
]
[[package]]
name = "nix"
version = "0.25.1"
@@ -3309,6 +3356,15 @@ dependencies = [
"libc",
]
[[package]]
name = "num_threads"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44"
dependencies = [
"libc",
]
[[package]]
name = "oauth2"
version = "4.4.2"
@@ -3358,12 +3414,50 @@ version = "11.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]]
name = "openssl"
version = "0.10.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800"
dependencies = [
"bitflags 2.4.1",
"cfg-if",
"foreign-types",
"libc",
"once_cell",
"openssl-macros",
"openssl-sys",
]
[[package]]
name = "openssl-macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.52",
]
[[package]]
name = "openssl-probe"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-sys"
version = "0.9.96"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f"
dependencies = [
"cc",
"libc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "opentelemetry"
version = "0.20.0"
@@ -3570,7 +3664,6 @@ dependencies = [
"serde",
"serde_json",
"svg_fmt",
"thiserror",
"tokio",
"tokio-util",
"toml_edit",
@@ -4012,6 +4105,17 @@ dependencies = [
"tokio-postgres",
]
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"native-tls",
"tokio",
"tokio-native-tls",
"tokio-postgres",
]
[[package]]
name = "postgres-protocol"
version = "0.6.4"
@@ -4120,7 +4224,6 @@ version = "0.1.0"
dependencies = [
"byteorder",
"bytes",
"itertools",
"pin-project-lite",
"postgres-protocol",
"rand 0.8.5",
@@ -4310,7 +4413,6 @@ dependencies = [
"http 1.1.0",
"http-body-util",
"humantime",
"humantime-serde",
"hyper 0.14.26",
"hyper 1.2.0",
"hyper-util",
@@ -4321,6 +4423,7 @@ dependencies = [
"md5",
"measured",
"metrics",
"native-tls",
"once_cell",
"opentelemetry",
"parking_lot 0.12.1",
@@ -4328,6 +4431,7 @@ dependencies = [
"parquet_derive",
"pbkdf2",
"pin-project-lite",
"postgres-native-tls",
"postgres-protocol",
"postgres_backend",
"pq_proto",
@@ -4346,7 +4450,6 @@ dependencies = [
"rstest",
"rustc-hash",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"rustls-pemfile 2.1.1",
"scopeguard",
"serde",
@@ -4376,6 +4479,7 @@ dependencies = [
"utils",
"uuid",
"walkdir",
"webpki-roots 0.25.2",
"workspace_hack",
"x509-parser",
]
@@ -4682,21 +4786,20 @@ dependencies = [
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper-rustls 0.24.0",
"hyper-tls",
"ipnet",
"js-sys",
"log",
"mime",
"native-tls",
"once_cell",
"percent-encoding",
"pin-project-lite",
"rustls 0.21.11",
"rustls-pemfile 1.0.2",
"serde",
"serde_json",
"serde_urlencoded",
"tokio",
"tokio-rustls 0.24.0",
"tokio-native-tls",
"tokio-util",
"tower-service",
"url",
@@ -4704,7 +4807,6 @@ dependencies = [
"wasm-bindgen-futures",
"wasm-streams 0.3.0",
"web-sys",
"webpki-roots 0.25.2",
"winreg 0.50.0",
]
@@ -5130,22 +5232,20 @@ dependencies = [
"hex",
"histogram",
"itertools",
"once_cell",
"native-tls",
"pageserver",
"pageserver_api",
"postgres-native-tls",
"postgres_ffi",
"rand 0.8.5",
"remote_storage",
"reqwest 0.12.4",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"serde",
"serde_json",
"serde_with",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-stream",
"tokio-util",
@@ -6089,6 +6189,8 @@ checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
dependencies = [
"itoa",
"js-sys",
"libc",
"num_threads",
"serde",
"time-core",
"time-macros",
@@ -6164,7 +6266,7 @@ dependencies = [
[[package]]
name = "tokio-epoll-uring"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
dependencies = [
"futures",
"nix 0.26.4",
@@ -6198,6 +6300,16 @@ dependencies = [
"syn 2.0.52",
]
[[package]]
name = "tokio-native-tls"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
dependencies = [
"native-tls",
"tokio",
]
[[package]]
name = "tokio-postgres"
version = "0.7.7"
@@ -6604,6 +6716,15 @@ version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
[[package]]
name = "tz-rs"
version = "0.6.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4"
dependencies = [
"const_fn",
]
[[package]]
name = "uname"
version = "0.1.1"
@@ -6676,12 +6797,11 @@ dependencies = [
[[package]]
name = "uring-common"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
dependencies = [
"bytes",
"io-uring",
"libc",
"linux-raw-sys 0.6.4",
]
[[package]]
@@ -7509,9 +7629,9 @@ dependencies = [
[[package]]
name = "zeroize"
version = "1.7.0"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
dependencies = [
"zeroize_derive",
]

View File

@@ -46,10 +46,10 @@ anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
azure_core = "0.19"
azure_identity = "0.19"
azure_storage = "0.19"
azure_storage_blobs = "0.19"
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
@@ -114,6 +114,7 @@ md5 = "0.7.0"
measured = { version = "0.0.21", features=["lasso"] }
measured-process = { version = "0.0.21" }
memoffset = "0.8"
native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
notify = "6.0.0"
num_cpus = "1.15"
@@ -190,7 +191,7 @@ url = "2.2"
urlencoding = "2.1"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"
rustls-native-certs = "0.7"
webpki-roots = "0.25"
x509-parser = "0.15"
## TODO replace this with tracing
@@ -199,6 +200,7 @@ log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
@@ -239,7 +241,8 @@ tonic-build = "0.9"
[patch.crates-io]
# Needed to get `tokio-postgres-rustls` to depend on our fork.
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
# bug fixes for UUID

View File

@@ -99,13 +99,6 @@ name = "async-executor"
[[bans.deny]]
name = "smol"
[[bans.deny]]
# We want to use rustls instead of the platform's native tls implementation.
name = "native-tls"
[[bans.deny]]
name = "openssl"
# This section is considered when running `cargo deny check sources`.
# More documentation about the 'sources' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html

View File

@@ -1,5 +1,6 @@
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use bytes::BufMut;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::{Oid, TransactionId};
use serde::{Deserialize, Serialize};
@@ -52,8 +53,14 @@ impl Key {
/// Encode a metadata key to a storage key.
pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
assert!(is_metadata_key_slice(key), "key not in metadata key range");
// Metadata key space ends at 0x7F so it's fine to directly convert it to i128.
Self::from_i128(i128::from_be_bytes(*key))
Key {
field1: key[0],
field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
field5: key[11],
field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
}
}
/// Encode a metadata key to a storage key.
@@ -61,6 +68,17 @@ impl Key {
Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
}
/// Extract a metadata key to a writer. The result should always be 16 bytes.
pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
writer.put_u8(self.field1);
assert!(self.field2 <= 0xFFFF);
writer.put_u16(self.field2 as u16);
writer.put_u32(self.field3);
writer.put_u32(self.field4);
writer.put_u8(self.field5);
writer.put_u32(self.field6);
}
/// Get the range of metadata keys.
pub const fn metadata_key_range() -> Range<Self> {
Key {
@@ -103,7 +121,7 @@ impl Key {
/// As long as Neon does not support tablespace (because of lack of access to local file system),
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
pub fn to_i128(&self) -> i128 {
assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
(((self.field1 & 0x7F) as i128) << 120)
| (((self.field2 & 0xFFFF) as i128) << 104)
| ((self.field3 as i128) << 72)
@@ -157,7 +175,7 @@ impl Key {
}
/// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys).
/// Use [`Key::from_metadata_key`] instead.
pub fn from_slice(b: &[u8]) -> Self {
Key {
field1: b[0],
@@ -170,7 +188,7 @@ impl Key {
}
/// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys).
/// Use [`Key::extract_metadata_key_to_writer`] instead.
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
buf[0] = self.field1;
BE::write_u32(&mut buf[1..5], self.field2);
@@ -381,15 +399,10 @@ pub fn rel_size_to_key(rel: RelTag) -> Key {
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: 0xffff_ffff,
field6: 0xffffffff,
}
}
#[inline(always)]
pub fn is_rel_size_key(key: &Key) -> bool {
key.field1 == 0 && key.field6 == u32::MAX
}
#[inline(always)]
pub fn rel_key_range(rel: RelTag) -> Range<Key> {
Key {
@@ -427,25 +440,6 @@ pub fn slru_dir_to_key(kind: SlruKind) -> Key {
}
}
#[inline(always)]
pub fn slru_dir_kind(key: &Key) -> Option<Result<SlruKind, u32>> {
if key.field1 == 0x01
&& key.field3 == 0
&& key.field4 == 0
&& key.field5 == 0
&& key.field6 == 0
{
match key.field2 {
0 => Some(Ok(SlruKind::Clog)),
1 => Some(Ok(SlruKind::MultiXactMembers)),
2 => Some(Ok(SlruKind::MultiXactOffsets)),
x => Some(Err(x)),
}
} else {
None
}
}
#[inline(always)]
pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
Key {
@@ -474,18 +468,10 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
field3: 1,
field4: segno,
field5: 0,
field6: 0xffff_ffff,
field6: 0xffffffff,
}
}
pub fn is_slru_segment_size_key(key: &Key) -> bool {
key.field1 == 0x01
&& key.field2 < 0x03
&& key.field3 == 0x01
&& key.field5 == 0
&& key.field6 == u32::MAX
}
#[inline(always)]
pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
let field2 = match kind {
@@ -701,15 +687,10 @@ mod tests {
let mut metadata_key = vec![AUX_KEY_PREFIX];
metadata_key.extend_from_slice(&[0xFF; 15]);
let encoded_key = Key::from_metadata_key(&metadata_key);
let output_key = encoded_key.to_i128().to_be_bytes();
let mut output_key = Vec::new();
encoded_key.extract_metadata_key_to_writer(&mut output_key);
assert_eq!(metadata_key, output_key);
assert!(encoded_key.is_metadata_key());
assert!(is_metadata_key_slice(&metadata_key));
}
#[test]
fn test_possible_largest_key() {
Key::from_i128(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF);
// TODO: put this key into the system and see if anything breaks.
}
}
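
The field-by-field decode added in this hunk for from_metadata_key_fixed_size, together with extract_metadata_key_to_writer, pins down a fixed byte layout for 16-byte metadata keys. A small standalone sketch of that layout, derived from the slicing shown above; the helper name here is illustrative only, not part of the API:

    // Byte layout of a 16-byte metadata key, as implied by the slicing in
    // from_metadata_key_fixed_size / extract_metadata_key_to_writer:
    //   byte 0       -> field1 (u8)
    //   bytes 1..3   -> field2 (u16, widened to u32 in Key)
    //   bytes 3..7   -> field3 (u32)
    //   bytes 7..11  -> field4 (u32)
    //   byte 11      -> field5 (u8)
    //   bytes 12..16 -> field6 (u32)
    fn metadata_key_fields(key: &[u8; 16]) -> (u8, u16, u32, u32, u8, u32) {
        (
            key[0],
            u16::from_be_bytes([key[1], key[2]]),
            u32::from_be_bytes([key[3], key[4], key[5], key[6]]),
            u32::from_be_bytes([key[7], key[8], key[9], key[10]]),
            key[11],
            u32::from_be_bytes([key[12], key[13], key[14], key[15]]),
        )
    }

Writing those fields back in the same order reproduces the original 16 bytes, which is what the round-trip assertion in the test hunk above checks.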

View File

@@ -451,6 +451,8 @@ impl EvictionPolicy {
)]
#[strum(serialize_all = "kebab-case")]
pub enum CompactionAlgorithm {
#[strum(disabled)]
NotSpecified,
Legacy,
Tiered,
}
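
NotSpecified is marked #[strum(disabled)], so it never appears in the kebab-case string representation and can only act as an in-memory sentinel. Combined with the NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM variable added to the build-and-test workflow earlier in this diff, the intent is to catch tests that override tenant_config wholesale without picking a compaction kind. A minimal sketch of that pattern; the Default impl and the check_compaction_kind helper below are assumptions for illustration, not the pageserver's actual wiring:

    // Hypothetical sketch only: sentinel default plus an opt-in panic.
    #[derive(Clone, Copy, Debug, PartialEq)]
    enum CompactionAlgorithm {
        NotSpecified, // sentinel; excluded from (de)serialization
        Legacy,
        Tiered,
    }

    impl Default for CompactionAlgorithm {
        fn default() -> Self {
            CompactionAlgorithm::NotSpecified
        }
    }

    fn check_compaction_kind(algo: CompactionAlgorithm) {
        let strict = std::env::var("NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM")
            .map(|v| v == "true")
            .unwrap_or(false);
        if strict && algo == CompactionAlgorithm::NotSpecified {
            panic!("tenant_config overrides must specify a compaction algorithm kind");
        }
    }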

View File

@@ -3,7 +3,7 @@ use std::cmp::Ordering;
use std::fmt;
use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM};
use postgres_ffi::relfile_utils::forknumber_to_name;
use postgres_ffi::Oid;
///
@@ -68,57 +68,6 @@ impl fmt::Display for RelTag {
}
}
#[derive(Debug, thiserror::Error)]
pub enum ParseRelTagError {
#[error("invalid forknum")]
InvalidForknum(#[source] std::num::ParseIntError),
#[error("missing triplet member {}", .0)]
MissingTripletMember(usize),
#[error("invalid triplet member {}", .0)]
InvalidTripletMember(usize, #[source] std::num::ParseIntError),
}
impl std::str::FromStr for RelTag {
type Err = ParseRelTagError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
use ParseRelTagError::*;
// FIXME: in postgres logs this separator is dot
// Example:
// could not read block 2 in rel 1663/208101/2620.1 from page server at lsn 0/2431E6F0
// with a regex we could get this more painlessly
let (triplet, forknum) = match s.split_once('_').or_else(|| s.split_once('.')) {
Some((t, f)) => {
let forknum = forkname_to_number(Some(f));
let forknum = if let Ok(f) = forknum {
f
} else {
f.parse::<u8>().map_err(InvalidForknum)?
};
(t, Some(forknum))
}
None => (s, None),
};
let mut split = triplet
.splitn(3, '/')
.enumerate()
.map(|(i, s)| s.parse::<u32>().map_err(|e| InvalidTripletMember(i, e)));
let spcnode = split.next().ok_or(MissingTripletMember(0))??;
let dbnode = split.next().ok_or(MissingTripletMember(1))??;
let relnode = split.next().ok_or(MissingTripletMember(2))??;
Ok(RelTag {
spcnode,
forknum: forknum.unwrap_or(MAIN_FORKNUM),
dbnode,
relnode,
})
}
}
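
The FromStr impl in this hunk accepts both spellings that show up in practice: the underscore form used in span attributes (1663/208101/2620_fsm) and the dot form from Postgres error messages (1663/208101/2620.1), with a bare triplet defaulting to MAIN_FORKNUM. A usage sketch, assuming the impl and the RelTag fields shown above are in scope:

    #[cfg(test)]
    mod reltag_parse_example {
        use super::*;
        use std::str::FromStr;

        #[test]
        fn parses_span_and_log_forms() {
            let a = RelTag::from_str("1663/208101/2620_fsm").unwrap();
            let b = RelTag::from_str("1663/208101/2620.1").unwrap();
            let c = RelTag::from_str("1663/208101/2620").unwrap();
            assert_eq!((a.spcnode, a.dbnode, a.relnode), (1663, 208101, 2620));
            // "fsm" and the numeric fork 1 resolve to the same forknum; the
            // pagectl test later in this diff parses both forms to the same Key.
            assert_eq!(a.forknum, b.forknum);
            assert_eq!(c.forknum, MAIN_FORKNUM);
        }
    }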
impl RelTag {
pub fn to_segfile_name(&self, segno: u32) -> String {
let mut name = if self.spcnode == GLOBALTABLESPACE_OID {

View File

@@ -428,12 +428,6 @@ impl<'de> Deserialize<'de> for TenantShardId {
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardStripeSize(pub u32);
impl Default for ShardStripeSize {
fn default() -> Self {
DEFAULT_STRIPE_SIZE
}
}
/// Layout version: for future upgrades where we might change how the key->shard mapping works
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardLayout(u8);
@@ -719,25 +713,6 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
ShardNumber((hash % count.0 as u32) as u8)
}
/// For debugging, while not exposing the internals.
#[derive(Debug)]
#[allow(unused)] // used by debug formatting by pagectl
struct KeyShardingInfo {
shard0: bool,
shard_number: ShardNumber,
}
pub fn describe(
key: &Key,
shard_count: ShardCount,
stripe_size: ShardStripeSize,
) -> impl std::fmt::Debug {
KeyShardingInfo {
shard0: key_is_shard0(key),
shard_number: key_to_shard_number(shard_count, stripe_size, key),
}
}
#[cfg(test)]
mod tests {
use utils::Hex;

View File

@@ -7,7 +7,6 @@ license.workspace = true
[dependencies]
bytes.workspace = true
byteorder.workspace = true
itertools.workspace = true
pin-project-lite.workspace = true
postgres-protocol.workspace = true
rand.workspace = true

View File

@@ -7,9 +7,8 @@ pub mod framed;
use byteorder::{BigEndian, ReadBytesExt};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use std::{borrow::Cow, fmt, io, str};
use std::{borrow::Cow, collections::HashMap, fmt, io, str};
// re-export for use in utils pageserver_feedback.rs
pub use postgres_protocol::PG_EPOCH;
@@ -51,37 +50,15 @@ pub enum FeStartupPacket {
},
}
#[derive(Debug, Clone, Default)]
pub struct StartupMessageParamsBuilder {
params: BytesMut,
}
impl StartupMessageParamsBuilder {
/// Set parameter's value by its name.
/// name and value must not contain a \0 byte
pub fn insert(&mut self, name: &str, value: &str) {
self.params.put(name.as_bytes());
self.params.put(&b"\0"[..]);
self.params.put(value.as_bytes());
self.params.put(&b"\0"[..]);
}
pub fn freeze(self) -> StartupMessageParams {
StartupMessageParams {
params: self.params.freeze(),
}
}
}
#[derive(Debug, Clone, Default)]
#[derive(Debug)]
pub struct StartupMessageParams {
params: Bytes,
params: HashMap<String, String>,
}
impl StartupMessageParams {
/// Get parameter's value by its name.
pub fn get(&self, name: &str) -> Option<&str> {
self.iter().find_map(|(k, v)| (k == name).then_some(v))
self.params.get(name).map(|s| s.as_str())
}
/// Split command-line options according to PostgreSQL's logic,
@@ -135,19 +112,15 @@ impl StartupMessageParams {
/// Iterate through key-value pairs in an arbitrary order.
pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
let params =
std::str::from_utf8(&self.params).expect("should be validated as utf8 already");
params.split_terminator('\0').tuples()
self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
}
// This function is mostly useful in tests.
#[doc(hidden)]
pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
let mut b = StartupMessageParamsBuilder::default();
for (k, v) in pairs {
b.insert(k, v)
Self {
params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(),
}
b.freeze()
}
}
@@ -372,21 +345,35 @@ impl FeStartupPacket {
(major_version, minor_version) => {
// StartupMessage
let s = str::from_utf8(&msg).map_err(|_e| {
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
})?;
let s = s.strip_suffix('\0').ok_or_else(|| {
ProtocolError::Protocol(
"StartupMessage params: missing null terminator".to_string(),
)
})?;
// Parse pairs of null-terminated strings (key, value).
// See `postgres: ProcessStartupPacket, build_startup_packet`.
let mut tokens = str::from_utf8(&msg)
.map_err(|_e| {
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
})?
.strip_suffix('\0') // drop packet's own null
.ok_or_else(|| {
ProtocolError::Protocol(
"StartupMessage params: missing null terminator".to_string(),
)
})?
.split_terminator('\0');
let mut params = HashMap::new();
while let Some(name) = tokens.next() {
let value = tokens.next().ok_or_else(|| {
ProtocolError::Protocol(
"StartupMessage params: key without value".to_string(),
)
})?;
params.insert(name.to_owned(), value.to_owned());
}
FeStartupPacket::StartupMessage {
major_version,
minor_version,
params: StartupMessageParams {
params: msg.slice_ref(s.as_bytes()),
},
params: StartupMessageParams { params },
}
}
};
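
Both shapes of StartupMessageParams in this hunk consume the same wire format: after the version word, a StartupMessage carries a flat run of null-terminated name/value strings closed by one extra null byte. A standalone parser in the eager style, for illustration only (not the exact code on either side of this diff):

    // Parse "name\0value\0name\0value\0...\0" into key/value pairs.
    // Returns None on invalid UTF-8, a missing final terminator, or a
    // name without a value.
    fn parse_startup_params(body: &[u8]) -> Option<Vec<(String, String)>> {
        let s = std::str::from_utf8(body).ok()?;
        let s = s.strip_suffix('\0')?; // the packet's own trailing null
        let mut out = Vec::new();
        let mut it = s.split_terminator('\0');
        while let Some(name) = it.next() {
            let value = it.next()?;
            out.push((name.to_owned(), value.to_owned()));
        }
        Some(out)
    }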

View File

@@ -26,14 +26,13 @@ use futures::stream::Stream;
use futures_util::StreamExt;
use futures_util::TryStreamExt;
use http_types::{StatusCode, Url};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
use crate::{
error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing,
ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
TimeTravelError, TimeoutOrCancel,
};
pub struct AzureBlobStorage {
@@ -138,8 +137,6 @@ impl AzureBlobStorage {
let mut last_modified = None;
let mut metadata = HashMap::new();
let started_at = start_measuring_requests(kind);
let download = async {
let response = builder
// convert to concrete Pageable
@@ -203,22 +200,13 @@ impl AzureBlobStorage {
})
};
let download = tokio::select! {
tokio::select! {
bufs = download => bufs,
cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout {
TimeoutOrCancel::Timeout => return Err(DownloadError::Timeout),
TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled),
TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
},
};
let started_at = ScopeGuard::into_inner(started_at);
let outcome = match &download {
Ok(_) => AttemptOutcome::Ok,
Err(_) => AttemptOutcome::Err,
};
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, outcome, started_at);
download
}
}
async fn permit(
@@ -352,10 +340,7 @@ impl RemoteStorage for AzureBlobStorage {
metadata: Option<StorageMetadata>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::Put;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let _permit = self.permit(RequestKind::Put, cancel).await?;
let op = async {
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
@@ -379,25 +364,14 @@ impl RemoteStorage for AzureBlobStorage {
match fut.await {
Ok(Ok(_response)) => Ok(()),
Ok(Err(azure)) => Err(azure.into()),
Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()),
}
};
let res = tokio::select! {
tokio::select! {
res = op => res,
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
let outcome = match res {
Ok(_) => AttemptOutcome::Ok,
Err(_) => AttemptOutcome::Err,
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, outcome, started_at);
res
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
}
}
async fn download(
@@ -443,13 +417,12 @@ impl RemoteStorage for AzureBlobStorage {
paths: &'a [RemotePath],
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::Delete;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let _permit = self.permit(RequestKind::Delete, cancel).await?;
let op = async {
// TODO batch requests are not supported by the SDK
// TODO batch requests are also not supported by the SDK
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
// https://github.com/Azure/azure-sdk-for-rust/issues/1249
for path in paths {
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
@@ -474,16 +447,10 @@ impl RemoteStorage for AzureBlobStorage {
Ok(())
};
let res = tokio::select! {
tokio::select! {
res = op => res,
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
}
}
async fn copy(
@@ -492,9 +459,7 @@ impl RemoteStorage for AzureBlobStorage {
to: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::Copy;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let _permit = self.permit(RequestKind::Copy, cancel).await?;
let timeout = tokio::time::sleep(self.timeout);
@@ -538,21 +503,15 @@ impl RemoteStorage for AzureBlobStorage {
}
};
let res = tokio::select! {
tokio::select! {
res = op => res,
_ = cancel.cancelled() => return Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
_ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
_ = timeout => {
let e = anyhow::Error::new(TimeoutOrCancel::Timeout);
let e = e.context(format!("Timeout, last status: {copy_status:?}"));
Err(e)
},
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res
}
}
async fn time_travel_recover(

View File

@@ -12,7 +12,6 @@
mod azure_blob;
mod error;
mod local_fs;
mod metrics;
mod s3_bucket;
mod simulate_failures;
mod support;
@@ -122,8 +121,8 @@ impl RemotePath {
self.0.file_name()
}
pub fn join(&self, path: impl AsRef<Utf8Path>) -> Self {
Self(self.0.join(path))
pub fn join(&self, segment: &Utf8Path) -> Self {
Self(self.0.join(segment))
}
pub fn get_path(&self) -> &Utf8PathBuf {

View File

@@ -46,16 +46,15 @@ use utils::backoff;
use super::StorageMetadata;
use crate::{
error::Cancelled,
metrics::{start_counting_cancelled_wait, start_measuring_requests},
support::PermitCarrying,
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
REMOTE_STORAGE_PREFIX_SEPARATOR,
error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
use crate::metrics::AttemptOutcome;
pub(super) use crate::metrics::RequestKind;
pub(super) mod metrics;
use self::metrics::AttemptOutcome;
pub(super) use self::metrics::RequestKind;
/// AWS S3 storage.
pub struct S3Bucket {
@@ -228,7 +227,7 @@ impl S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
@@ -249,7 +248,7 @@ impl S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
Ok(permit)
@@ -288,7 +287,7 @@ impl S3Bucket {
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
// an error: we expect to sometimes fetch an object and find it missing,
// e.g. when probing for timeline indices.
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Ok,
started_at,
@@ -296,7 +295,7 @@ impl S3Bucket {
return Err(DownloadError::NotFound);
}
Err(e) => {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Err,
started_at,
@@ -372,12 +371,12 @@ impl S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &resp, started_at);
let resp = resp.context("request deletion")?;
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.deleted_objects_total
.inc_by(chunk.len() as u64);
@@ -436,14 +435,14 @@ pin_project_lite::pin_project! {
/// Times and tracks the outcome of the request.
struct TimedDownload<S> {
started_at: std::time::Instant,
outcome: AttemptOutcome,
outcome: metrics::AttemptOutcome,
#[pin]
inner: S
}
impl<S> PinnedDrop for TimedDownload<S> {
fn drop(mut this: Pin<&mut Self>) {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
}
}
}
@@ -452,7 +451,7 @@ impl<S> TimedDownload<S> {
fn new(started_at: std::time::Instant, inner: S) -> Self {
TimedDownload {
started_at,
outcome: AttemptOutcome::Cancelled,
outcome: metrics::AttemptOutcome::Cancelled,
inner,
}
}
@@ -469,8 +468,8 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
let res = ready!(this.inner.poll_next(cx));
match &res {
Some(Ok(_)) => {}
Some(Err(_)) => *this.outcome = AttemptOutcome::Err,
None => *this.outcome = AttemptOutcome::Ok,
Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
None => *this.outcome = metrics::AttemptOutcome::Ok,
}
Poll::Ready(res)
@@ -544,7 +543,7 @@ impl RemoteStorage for S3Bucket {
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &response, started_at);
@@ -626,7 +625,7 @@ impl RemoteStorage for S3Bucket {
if let Ok(inner) = &res {
// do not incl. timeouts as errors in metrics but cancellations
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, inner, started_at);
}
@@ -674,7 +673,7 @@ impl RemoteStorage for S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
@@ -978,6 +977,28 @@ impl RemoteStorage for S3Bucket {
}
}
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
fn start_counting_cancelled_wait(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
})
}
/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
fn start_measuring_requests(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Cancelled,
started_at,
)
})
}
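
Both helpers above lean on scopeguard::guard_on_success: the closure only runs if the guard is dropped while still armed, so a future that is cancelled mid-request records a Cancelled outcome, while the success and error paths call ScopeGuard::into_inner first and record the real outcome themselves, as the call sites in this file do. A small self-contained illustration of the pattern, without the metrics types:

    use scopeguard::ScopeGuard;

    fn timed_request(complete: bool) {
        // Armed guard: fires when dropped while still armed (and not during a
        // panic), e.g. when the surrounding future is cancelled early.
        let started_at = scopeguard::guard_on_success(std::time::Instant::now(), |t: std::time::Instant| {
            println!("cancelled after {:?}", t.elapsed());
        });

        if !complete {
            return; // guard fires here and records the cancellation
        }

        // Completion path: defuse the guard, take back the start time, and
        // record the outcome explicitly.
        let started_at = ScopeGuard::into_inner(started_at);
        println!("finished after {:?}", started_at.elapsed());
    }

    fn main() {
        timed_request(true);
        timed_request(false);
    }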
// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
struct VerOrDelete {
kind: VerOrDeleteKind,

View File

@@ -15,7 +15,6 @@ pub(crate) enum RequestKind {
TimeTravel = 5,
}
use scopeguard::ScopeGuard;
use RequestKind::*;
impl RequestKind {
@@ -34,10 +33,10 @@ impl RequestKind {
}
}
pub(crate) struct RequestTyped<C>([C; 6]);
pub(super) struct RequestTyped<C>([C; 6]);
impl<C> RequestTyped<C> {
pub(crate) fn get(&self, kind: RequestKind) -> &C {
pub(super) fn get(&self, kind: RequestKind) -> &C {
&self.0[kind.as_index()]
}
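
RequestTyped (and PassFailCancelledRequestTyped below) are enum-indexed arrays: one metric slot per RequestKind, looked up through as_index() instead of a map. A minimal standalone sketch of the same pattern; the names and the build_with constructor are illustrative, not this crate's API:

    #[derive(Clone, Copy)]
    enum RequestKind { Get, Put, Delete, List, Copy, TimeTravel }

    impl RequestKind {
        fn as_index(self) -> usize { self as usize }
    }

    // One value per request kind: O(1) lookup, no hashing, exhaustive by construction.
    struct RequestTyped<C>([C; 6]);

    impl<C> RequestTyped<C> {
        fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
            use RequestKind::*;
            RequestTyped([Get, Put, Delete, List, Copy, TimeTravel].map(|k| f(k)))
        }
        fn get(&self, kind: RequestKind) -> &C {
            &self.0[kind.as_index()]
        }
    }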
@@ -59,19 +58,19 @@ impl<C> RequestTyped<C> {
}
impl RequestTyped<Histogram> {
pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
self.get(kind).observe(started_at.elapsed().as_secs_f64())
}
}
pub(crate) struct PassFailCancelledRequestTyped<C> {
pub(super) struct PassFailCancelledRequestTyped<C> {
success: RequestTyped<C>,
fail: RequestTyped<C>,
cancelled: RequestTyped<C>,
}
#[derive(Debug, Clone, Copy)]
pub(crate) enum AttemptOutcome {
pub(super) enum AttemptOutcome {
Ok,
Err,
Cancelled,
@@ -87,7 +86,7 @@ impl<T, E> From<&Result<T, E>> for AttemptOutcome {
}
impl AttemptOutcome {
pub(crate) fn as_str(&self) -> &'static str {
pub(super) fn as_str(&self) -> &'static str {
match self {
AttemptOutcome::Ok => "ok",
AttemptOutcome::Err => "err",
@@ -97,7 +96,7 @@ impl AttemptOutcome {
}
impl<C> PassFailCancelledRequestTyped<C> {
pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
let target = match outcome {
AttemptOutcome::Ok => &self.success,
AttemptOutcome::Err => &self.fail,
@@ -120,7 +119,7 @@ impl<C> PassFailCancelledRequestTyped<C> {
}
impl PassFailCancelledRequestTyped<Histogram> {
pub(crate) fn observe_elapsed(
pub(super) fn observe_elapsed(
&self,
kind: RequestKind,
outcome: impl Into<AttemptOutcome>,
@@ -131,44 +130,19 @@ impl PassFailCancelledRequestTyped<Histogram> {
}
}
/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`].
pub(crate) fn start_counting_cancelled_wait(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
crate::metrics::BUCKET_METRICS
.cancelled_waits
.get(kind)
.inc()
})
}
/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
pub(crate) fn start_measuring_requests(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Cancelled,
started_at,
)
})
}
pub(crate) struct BucketMetrics {
pub(super) struct BucketMetrics {
/// Full request duration until successful completion, error or cancellation.
pub(crate) req_seconds: PassFailCancelledRequestTyped<Histogram>,
pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
/// Total amount of seconds waited on queue.
pub(crate) wait_seconds: RequestTyped<Histogram>,
pub(super) wait_seconds: RequestTyped<Histogram>,
/// Track how many semaphore awaits were cancelled per request type.
///
/// This is in case cancellations are happening more than expected.
pub(crate) cancelled_waits: RequestTyped<IntCounter>,
pub(super) cancelled_waits: RequestTyped<IntCounter>,
/// Total amount of deleted objects in batches or single requests.
pub(crate) deleted_objects_total: IntCounter,
pub(super) deleted_objects_total: IntCounter,
}
impl Default for BucketMetrics {

View File

@@ -19,13 +19,13 @@
/// // right: [0x68; 1]
/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
/// ```
pub struct Hex<S>(pub S);
#[derive(PartialEq)]
pub struct Hex<'a>(pub &'a [u8]);
impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
impl std::fmt::Debug for Hex<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "[")?;
let chunks = self.0.as_ref().chunks(16);
for (i, c) in chunks.enumerate() {
for (i, c) in self.0.chunks(16).enumerate() {
if i > 0 && !c.is_empty() {
writeln!(f, ", ")?;
}
@@ -36,15 +36,6 @@ impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
write!(f, "0x{b:02x}")?;
}
}
write!(f, "; {}]", self.0.as_ref().len())
}
}
impl<R: AsRef<[u8]>, L: AsRef<[u8]>> PartialEq<Hex<R>> for Hex<L> {
fn eq(&self, other: &Hex<R>) -> bool {
let left = self.0.as_ref();
let right = other.0.as_ref();
left == right
write!(f, "; {}]", self.0.len())
}
}
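
Hex exists to make byte buffers readable in Debug output and test assertions: bytes print as 0x-prefixed pairs in chunks of 16, followed by the length, and the PartialEq impl (derived or generic, depending on which side of this diff you look at) lets assert_eq! show that form on mismatch. A usage sketch, assuming utils::Hex is in scope:

    #[test]
    fn hex_makes_failures_readable() {
        let expected = [0x00u8, 0x01, 0x7f];
        let actual = [0x00u8, 0x01, 0x7f];
        // On mismatch, the failure message would print hex along the lines of
        // [0x00, 0x01, 0x7f; 3] instead of a decimal Vec<u8> dump.
        assert_eq!(utils::Hex(&expected[..]), utils::Hex(&actual[..]));
    }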

View File

@@ -17,7 +17,6 @@ pageserver = { path = ".." }
pageserver_api.workspace = true
remote_storage = { path = "../../libs/remote_storage" }
postgres_ffi.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
toml_edit.workspace = true

View File

@@ -1,477 +0,0 @@
use anyhow::Context;
use clap::Parser;
use pageserver_api::{
key::Key,
reltag::{BlockNumber, RelTag, SlruKind},
shard::{ShardCount, ShardStripeSize},
};
use std::str::FromStr;
#[derive(Parser)]
pub(super) struct DescribeKeyCommand {
/// Key material in one of the forms: hex, span attributes captured from log, reltag blocknum
input: Vec<String>,
/// The number of shards to calculate what Keys placement would be.
#[arg(long)]
shard_count: Option<CustomShardCount>,
/// The sharding stripe size.
///
/// The default is hardcoded. It makes no sense to provide this without providing
/// `--shard-count`.
#[arg(long, requires = "shard_count")]
stripe_size: Option<u32>,
}
/// Sharded shard count without unsharded count, which the actual ShardCount supports.
#[derive(Clone, Copy)]
pub(super) struct CustomShardCount(std::num::NonZeroU8);
#[derive(Debug, thiserror::Error)]
pub(super) enum InvalidShardCount {
#[error(transparent)]
ParsingFailed(#[from] std::num::ParseIntError),
#[error("too few shards")]
TooFewShards,
}
impl FromStr for CustomShardCount {
type Err = InvalidShardCount;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let inner: std::num::NonZeroU8 = s.parse()?;
if inner.get() < 2 {
Err(InvalidShardCount::TooFewShards)
} else {
Ok(CustomShardCount(inner))
}
}
}
impl From<CustomShardCount> for ShardCount {
fn from(value: CustomShardCount) -> Self {
ShardCount::new(value.0.get())
}
}
impl DescribeKeyCommand {
pub(super) fn execute(self) {
let DescribeKeyCommand {
input,
shard_count,
stripe_size,
} = self;
let material = KeyMaterial::try_from(input.as_slice()).unwrap();
let kind = material.kind();
let key = Key::from(material);
println!("parsed from {kind}: {key}:");
println!();
println!("{key:?}");
macro_rules! kind_query {
($name:ident) => {{
let s: &'static str = stringify!($name);
let s = s.strip_prefix("is_").unwrap_or(s);
let s = s.strip_suffix("_key").unwrap_or(s);
#[allow(clippy::needless_borrow)]
(s, pageserver_api::key::$name(key))
}};
}
// the current characterization is a mess of these boolean queries and separate
// "recognization". I think it accurately represents how strictly we model the Key
// right now, but could of course be made less confusing.
let queries = [
("rel_block", pageserver_api::key::is_rel_block_key(&key)),
kind_query!(is_rel_vm_block_key),
kind_query!(is_rel_fsm_block_key),
kind_query!(is_slru_block_key),
kind_query!(is_inherited_key),
("rel_size", pageserver_api::key::is_rel_size_key(&key)),
(
"slru_segment_size",
pageserver_api::key::is_slru_segment_size_key(&key),
),
];
let recognized_kind = "recognized kind";
let metadata_key = "metadata key";
let shard_placement = "shard placement";
let longest = queries
.iter()
.map(|t| t.0)
.chain([recognized_kind, metadata_key, shard_placement])
.map(|s| s.len())
.max()
.unwrap();
let colon = 1;
let padding = 1;
for (name, is) in queries {
let width = longest - name.len() + colon + padding;
println!("{}{:width$}{}", name, ":", is);
}
let width = longest - recognized_kind.len() + colon + padding;
println!(
"{}{:width$}{:?}",
recognized_kind,
":",
RecognizedKeyKind::new(key),
);
if let Some(shard_count) = shard_count {
// seeing the sharding placement might be confusing, so leave it out unless shard
// count was given.
let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default();
println!(
"# placement with shard_count: {} and stripe_size: {}:",
shard_count.0, stripe_size.0
);
let width = longest - shard_placement.len() + colon + padding;
println!(
"{}{:width$}{:?}",
shard_placement,
":",
pageserver_api::shard::describe(&key, shard_count.into(), stripe_size)
);
}
}
}
/// Hand-wavy "inputs we accept" for a key.
#[derive(Debug)]
pub(super) enum KeyMaterial {
Hex(Key),
String(SpanAttributesFromLogs),
Split(RelTag, BlockNumber),
}
impl KeyMaterial {
fn kind(&self) -> &'static str {
match self {
KeyMaterial::Hex(_) => "hex",
KeyMaterial::String(_) | KeyMaterial::Split(_, _) => "split",
}
}
}
impl From<KeyMaterial> for Key {
fn from(value: KeyMaterial) -> Self {
match value {
KeyMaterial::Hex(key) => key,
KeyMaterial::String(SpanAttributesFromLogs(rt, blocknum))
| KeyMaterial::Split(rt, blocknum) => {
pageserver_api::key::rel_block_to_key(rt, blocknum)
}
}
}
}
impl<S: AsRef<str>> TryFrom<&[S]> for KeyMaterial {
type Error = anyhow::Error;
fn try_from(value: &[S]) -> Result<Self, Self::Error> {
match value {
[] => anyhow::bail!(
"need 1..N positional arguments describing the key, try hex or a log line"
),
[one] => {
let one = one.as_ref();
let key = Key::from_hex(one).map(KeyMaterial::Hex);
let attrs = SpanAttributesFromLogs::from_str(one).map(KeyMaterial::String);
match (key, attrs) {
(Ok(key), _) => Ok(key),
(_, Ok(s)) => Ok(s),
(Err(e1), Err(e2)) => anyhow::bail!(
"failed to parse {one:?} as hex or span attributes:\n- {e1:#}\n- {e2:#}"
),
}
}
more => {
// assume going left to right one of these is a reltag and then we find a blocknum
// this works, because we don't have plain numbers at least right after reltag in
// logs. for some definition of "works".
let Some((reltag_at, reltag)) = more
.iter()
.map(AsRef::as_ref)
.enumerate()
.find_map(|(i, s)| {
s.split_once("rel=")
.map(|(_garbage, actual)| actual)
.unwrap_or(s)
.parse::<RelTag>()
.ok()
.map(|rt| (i, rt))
})
else {
anyhow::bail!("found no RelTag in arguments");
};
let Some(blocknum) = more
.iter()
.map(AsRef::as_ref)
.skip(reltag_at)
.find_map(|s| {
s.split_once("blkno=")
.map(|(_garbage, actual)| actual)
.unwrap_or(s)
.parse::<BlockNumber>()
.ok()
})
else {
anyhow::bail!("found no blocknum in arguments");
};
Ok(KeyMaterial::Split(reltag, blocknum))
}
}
}
}
#[derive(Debug)]
pub(super) struct SpanAttributesFromLogs(RelTag, BlockNumber);
impl std::str::FromStr for SpanAttributesFromLogs {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// accept the span separator but do not require or fail if either is missing
// "whatever{rel=1663/16389/24615 blkno=1052204 req_lsn=FFFFFFFF/FFFFFFFF}"
let (_, reltag) = s
.split_once("rel=")
.ok_or_else(|| anyhow::anyhow!("cannot find 'rel='"))?;
let reltag = reltag.split_whitespace().next().unwrap();
let (_, blocknum) = s
.split_once("blkno=")
.ok_or_else(|| anyhow::anyhow!("cannot find 'blkno='"))?;
let blocknum = blocknum.split_whitespace().next().unwrap();
let reltag = reltag
.parse()
.with_context(|| format!("parse reltag from {reltag:?}"))?;
let blocknum = blocknum
.parse()
.with_context(|| format!("parse blocknum from {blocknum:?}"))?;
Ok(Self(reltag, blocknum))
}
}
#[derive(Debug)]
#[allow(dead_code)] // debug print is used
enum RecognizedKeyKind {
DbDir,
ControlFile,
Checkpoint,
AuxFilesV1,
SlruDir(Result<SlruKind, u32>),
RelMap(RelTagish<2>),
RelDir(RelTagish<2>),
AuxFileV2(Result<AuxFileV2, utils::Hex<[u8; 16]>>),
}
#[derive(Debug, PartialEq)]
#[allow(unused)]
enum AuxFileV2 {
Recognized(&'static str, utils::Hex<[u8; 13]>),
OtherWithPrefix(&'static str, utils::Hex<[u8; 13]>),
Other(utils::Hex<[u8; 13]>),
}
impl RecognizedKeyKind {
fn new(key: Key) -> Option<Self> {
use RecognizedKeyKind::{
AuxFilesV1, Checkpoint, ControlFile, DbDir, RelDir, RelMap, SlruDir,
};
let slru_dir_kind = pageserver_api::key::slru_dir_kind(&key);
Some(match key {
pageserver_api::key::DBDIR_KEY => DbDir,
pageserver_api::key::CONTROLFILE_KEY => ControlFile,
pageserver_api::key::CHECKPOINT_KEY => Checkpoint,
pageserver_api::key::AUX_FILES_KEY => AuxFilesV1,
_ if slru_dir_kind.is_some() => SlruDir(slru_dir_kind.unwrap()),
_ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 0 => {
RelMap([key.field2, key.field3].into())
}
_ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 1 => {
RelDir([key.field2, key.field3].into())
}
_ if key.is_metadata_key() => RecognizedKeyKind::AuxFileV2(
AuxFileV2::new(key).ok_or_else(|| utils::Hex(key.to_i128().to_be_bytes())),
),
_ => return None,
})
}
}
impl AuxFileV2 {
fn new(key: Key) -> Option<AuxFileV2> {
const EMPTY_HASH: [u8; 13] = {
let mut out = [0u8; 13];
let hash = pageserver::aux_file::fnv_hash(b"").to_be_bytes();
let mut i = 3;
while i < 16 {
out[i - 3] = hash[i];
i += 1;
}
out
};
let bytes = key.to_i128().to_be_bytes();
let hash = utils::Hex(<[u8; 13]>::try_from(&bytes[3..]).unwrap());
assert_eq!(EMPTY_HASH.len(), hash.0.len());
// TODO: we could probably find the preimages for the hashes
Some(match (bytes[1], bytes[2]) {
(1, 1) => AuxFileV2::Recognized("pg_logical/mappings/", hash),
(1, 2) => AuxFileV2::Recognized("pg_logical/snapshots/", hash),
(1, 3) if hash.0 == EMPTY_HASH => {
AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash)
}
(2, 1) => AuxFileV2::Recognized("pg_replslot/", hash),
(1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash),
(0xff, 0xff) => AuxFileV2::Other(hash),
_ => return None,
})
}
}
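// Key layout assumed by `AuxFileV2::new` above: of the 16 big-endian key bytes, byte 0 is the
// metadata/aux prefix (checked via `is_metadata_key()` by the caller), bytes 1 and 2 select the
// directory (e.g. (1, 1) for "pg_logical/mappings/"), and bytes 3..16 carry the trailing 13
// bytes of the FNV hash of the file name, matching what `encode_aux_file_key` produces (see
// `recognized_auxfiles` below).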
/// Prefix of a RelTag; currently the only known use cases are the two-item versions.
///
/// Renders like a reltag with `/` separators, nothing else.
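/// For example, `RelTagish([1663, 208101])` debug-prints as `1663/208101`.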
struct RelTagish<const N: usize>([u32; N]);
impl<const N: usize> From<[u32; N]> for RelTagish<N> {
fn from(val: [u32; N]) -> Self {
RelTagish(val)
}
}
impl<const N: usize> std::fmt::Debug for RelTagish<N> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use std::fmt::Write as _;
let mut first = true;
self.0.iter().try_for_each(|x| {
if !first {
f.write_char('/')?;
}
first = false;
write!(f, "{}", x)
})
}
}
#[cfg(test)]
mod tests {
use pageserver::aux_file::encode_aux_file_key;
use super::*;
#[test]
fn hex_is_key_material() {
let m = KeyMaterial::try_from(&["000000067F0000400200DF927900FFFFFFFF"][..]).unwrap();
assert!(matches!(m, KeyMaterial::Hex(_)), "{m:?}");
}
#[test]
fn single_positional_spanalike_is_key_material() {
// why is this needed? when checking many keys, copy-pasting whole log lines starts to appeal
let strings = [
(line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"),
(line!(), "rel=1663/208101/2620_fsm blkno=2"),
(line!(), "rel=1663/208101/2620.1 blkno=2"),
];
let mut first: Option<Key> = None;
for (line, example) in strings {
let m = KeyMaterial::try_from(&[example][..])
.unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
let key = Key::from(m);
if let Some(first) = first {
assert_eq!(first, key);
} else {
first = Some(key);
}
}
// not supporting this is rather accidental, but I think the input parsing is lenient
// enough already
KeyMaterial::try_from(&["1663/208101/2620_fsm 2"][..]).unwrap_err();
}
#[test]
fn multiple_spanlike_args() {
let strings = [
(line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]),
(line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]),
(line!(), &["1663/208101/2620_fsm", "2"][..]),
];
let mut first: Option<Key> = None;
for (line, example) in strings {
let m = KeyMaterial::try_from(example)
.unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
let key = Key::from(m);
if let Some(first) = first {
assert_eq!(first, key);
} else {
first = Some(key);
}
}
}
#[test]
fn recognized_auxfiles() {
use AuxFileV2::*;
let empty = [
0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d,
];
let foobar = [
0x62, 0x79, 0x3c, 0x64, 0xbf, 0x6f, 0x0d, 0x35, 0x97, 0xba, 0x44, 0x6f, 0x18,
];
#[rustfmt::skip]
let examples = [
(line!(), "pg_logical/mappings/foobar", Recognized("pg_logical/mappings/", utils::Hex(foobar))),
(line!(), "pg_logical/snapshots/foobar", Recognized("pg_logical/snapshots/", utils::Hex(foobar))),
(line!(), "pg_logical/replorigin_checkpoint", Recognized("pg_logical/replorigin_checkpoint", utils::Hex(empty))),
(line!(), "pg_logical/foobar", OtherWithPrefix("pg_logical/", utils::Hex(foobar))),
(line!(), "pg_replslot/foobar", Recognized("pg_replslot/", utils::Hex(foobar))),
(line!(), "foobar", Other(utils::Hex(foobar))),
];
for (line, path, expected) in examples {
let key = encode_aux_file_key(path);
let recognized =
AuxFileV2::new(key).unwrap_or_else(|| panic!("line {line} example failed"));
assert_eq!(recognized, expected);
}
assert_eq!(
AuxFileV2::new(Key::from_hex("600000102000000000000000000000000000").unwrap()),
None,
"example key has one too few 0 after 6 before 1"
);
}
}
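// Hypothetical invocation sketch (the binary name is a placeholder; the `key` subcommand wiring
// is visible in the main.rs hunk below):
//   <pagectl-binary> key 000000067F0000400200DF927900FFFFFFFF
//   <pagectl-binary> key "rel=1663/208101/2620_fsm blkno=2"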

View File

@@ -6,7 +6,6 @@
mod draw_timeline_dir;
mod index_part;
mod key;
mod layer_map_analyzer;
mod layers;
@@ -62,8 +61,6 @@ enum Commands {
AnalyzeLayerMap(AnalyzeLayerMapCmd),
#[command(subcommand)]
Layer(LayerCmd),
/// Debug print a hex key found from logs
Key(key::DescribeKeyCommand),
}
/// Read and update pageserver metadata file
@@ -186,7 +183,6 @@ async fn main() -> anyhow::Result<()> {
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
.await?;
}
Commands::Key(dkc) => dkc.execute(),
};
Ok(())
}

View File

@@ -5,7 +5,6 @@ use utils::lsn::Lsn;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Instant;
/// Ingest aux files into the pageserver.
#[derive(clap::Parser)]
@@ -89,17 +88,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
println!("ingested {file_cnt} files");
}
for _ in 0..100 {
let start = Instant::now();
let files = mgmt_api_client
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
.await?;
println!(
"{} files found in {}s",
files.len(),
start.elapsed().as_secs_f64()
);
}
let files = mgmt_api_client
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
.await?;
println!("{} files found", files.len());
anyhow::Ok(())
}

View File

@@ -5,7 +5,7 @@
//! See also `settings.md` for a better description of every parameter.
use anyhow::{anyhow, bail, ensure, Context, Result};
use pageserver_api::shard::TenantShardId;
use pageserver_api::{models::CompactionAlgorithm, shard::TenantShardId};
use remote_storage::{RemotePath, RemoteStorageConfig};
use serde;
use serde::de::IntoDeserializer;
@@ -15,7 +15,7 @@ use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId;
use utils::logging::SecretString;
use once_cell::sync::OnceCell;
use once_cell::sync::{Lazy, OnceCell};
use reqwest::Url;
use std::num::NonZeroUsize;
use std::str::FromStr;
@@ -1067,6 +1067,19 @@ impl PageServerConf {
conf.default_tenant_conf = t_conf.merge(TenantConf::default());
{
const VAR_NAME: &str = "NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM";
static VAR: Lazy<Option<bool>> = Lazy::new(|| utils::env::var(VAR_NAME));
if VAR.unwrap_or(false)
&& conf.default_tenant_conf.compaction_algorithm.kind
== CompactionAlgorithm::NotSpecified
{
panic!(
"Unspecified compaction algorithm in default tenant configuration. \
Set the algorithm explicitly in the pageserver.toml's `tenant_config` field or unset the environment variable {VAR_NAME}");
}
}
Ok(conf)
}

View File

@@ -358,7 +358,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
// mean the synthetic size worker should terminate.
let shutting_down = matches!(
e.downcast_ref::<PageReconstructError>(),
Some(PageReconstructError::Cancelled)
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
);
if !shutting_down {

View File

@@ -311,7 +311,7 @@ impl DeletionList {
result.extend(
timeline_layers
.into_iter()
.map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))),
.map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))),
);
}
}

View File

@@ -74,7 +74,6 @@ use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::CompactionError;
use crate::tenant::timeline::Timeline;
use crate::tenant::GetTimelineError;
use crate::tenant::SpawnMode;
@@ -184,6 +183,9 @@ impl From<PageReconstructError> for ApiError {
PageReconstructError::Cancelled => {
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
}
PageReconstructError::AncestorStopping(_) => {
ApiError::ResourceUnavailable(format!("{pre}").into())
}
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
}
@@ -1811,22 +1813,11 @@ async fn timeline_checkpoint_handler(
timeline
.freeze_and_flush()
.await
.map_err(|e| {
match e {
tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
other => ApiError::InternalServerError(other.into()),
}
})?;
.map_err(ApiError::InternalServerError)?;
timeline
.compact(&cancel, flags, &ctx)
.await
.map_err(|e|
match e {
CompactionError::ShuttingDown => ApiError::ShuttingDown,
CompactionError::Other(e) => ApiError::InternalServerError(e)
}
)?;
.map_err(|e| ApiError::InternalServerError(e.into()))?;
if wait_until_uploaded {
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;

View File

@@ -66,7 +66,6 @@ use crate::tenant::mgr::GetTenantError;
use crate::tenant::mgr::ShardResolveResult;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::mgr::TenantManager;
use crate::tenant::timeline::FlushLayerError;
use crate::tenant::timeline::WaitLsnError;
use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
@@ -373,7 +372,7 @@ impl From<WaitLsnError> for PageStreamError {
match value {
e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
WaitLsnError::Shutdown => Self::Shutdown,
e @ WaitLsnError::BadState { .. } => Self::Reconnect(format!("{e}").into()),
WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
}
}
}
@@ -383,7 +382,7 @@ impl From<WaitLsnError> for QueryError {
match value {
e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
WaitLsnError::Shutdown => Self::Shutdown,
WaitLsnError::BadState { .. } => Self::Reconnect,
WaitLsnError::BadState => Self::Reconnect,
}
}
}
@@ -831,10 +830,7 @@ impl PageServerHandler {
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
timeline.freeze_and_flush().await.map_err(|e| match e {
FlushLayerError::Cancelled => QueryError::Shutdown,
other => QueryError::Other(other.into()),
})?;
timeline.freeze_and_flush().await?;
info!("done");
Ok(())

View File

@@ -78,19 +78,11 @@ pub enum LsnForTimestamp {
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum CalculateLogicalSizeError {
pub enum CalculateLogicalSizeError {
#[error("cancelled")]
Cancelled,
/// Something went wrong while reading the metadata we use to calculate logical size
/// Note that cancellation variants of `PageReconstructError` are transformed to [`Self::Cancelled`]
/// in the `From` implementation for this variant.
#[error(transparent)]
PageRead(PageReconstructError),
/// Something went wrong deserializing metadata that we read to calculate logical size
#[error("decode error: {0}")]
Decode(#[from] DeserializeError),
Other(#[from] anyhow::Error),
}
#[derive(Debug, thiserror::Error)]
@@ -115,8 +107,10 @@ impl From<PageReconstructError> for CollectKeySpaceError {
impl From<PageReconstructError> for CalculateLogicalSizeError {
fn from(pre: PageReconstructError) -> Self {
match pre {
PageReconstructError::Cancelled => Self::Cancelled,
_ => Self::PageRead(pre),
PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
Self::Cancelled
}
_ => Self::Other(pre.into()),
}
}
}
@@ -769,7 +763,7 @@ impl Timeline {
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub(crate) async fn get_current_logical_size_non_incremental(
pub async fn get_current_logical_size_non_incremental(
&self,
lsn: Lsn,
ctx: &RequestContext,
@@ -778,7 +772,7 @@ impl Timeline {
// Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
let dbdir = DbDirectory::des(&buf)?;
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
@@ -1558,7 +1552,7 @@ impl<'a> DatadirModification<'a> {
self.tline.aux_file_size_estimator.on_add(content.len());
new_files.push((path, content));
}
(None, true) => warn!("removing non-existing aux file: {}", path),
(None, true) => anyhow::bail!("removing non-existing aux file: {}", path),
}
let new_val = aux_file::encode_file_value(&new_files)?;
self.put(key, Value::Image(new_val.into()));
@@ -1612,7 +1606,8 @@ impl<'a> DatadirModification<'a> {
aux_files.dir = Some(dir);
}
Err(
e @ (PageReconstructError::Cancelled
e @ (PageReconstructError::AncestorStopping(_)
| PageReconstructError::Cancelled
| PageReconstructError::AncestorLsnTimeout(_)),
) => {
// Important that we do not interpret a shutdown error as "not found" and thereby

View File

@@ -487,33 +487,6 @@ enum CreateTimelineCause {
Delete,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum GcError {
// The tenant is shutting down
#[error("tenant shutting down")]
TenantCancelled,
// The timeline is shutting down
#[error("timeline shutting down")]
TimelineCancelled,
// The tenant is in a state ineligible to run GC
#[error("not active")]
NotActive,
// A requested GC cutoff LSN was invalid, for example it tried to move backwards
#[error("not active")]
BadLsn { why: String },
// A remote storage error while scheduling updates after compaction
#[error(transparent)]
Remote(anyhow::Error),
// If GC was invoked for a particular timeline, this error means it didn't exist
#[error("timeline not found")]
TimelineNotFound,
}
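// For reference, the HTTP mapping of these variants (see the `immediate_gc` hunk further down):
// TenantCancelled | TimelineCancelled => ApiError::ShuttingDown, TimelineNotFound =>
// ApiError::NotFound, and everything else => ApiError::InternalServerError.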
impl Tenant {
/// Yet another helper for timeline initialization.
///
@@ -1420,36 +1393,6 @@ impl Tenant {
Ok(tl)
}
/// Helper for unit tests to create a timeline with some pre-loaded states.
#[cfg(test)]
#[allow(clippy::too_many_arguments)]
pub async fn create_test_timeline_with_layers(
&self,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
pg_version: u32,
ctx: &RequestContext,
delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
let tline = self
.create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
.await?;
tline.force_advance_lsn(end_lsn);
for deltas in delta_layer_desc {
tline
.force_create_delta_layer(deltas, Some(initdb_lsn), ctx)
.await?;
}
for (lsn, images) in image_layer_desc {
tline
.force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
.await?;
}
Ok(tline)
}
/// Create a new timeline.
///
/// Returns the new timeline ID and reference to its Timeline object.
@@ -1564,7 +1507,7 @@ impl Tenant {
.wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
.await
.map_err(|e| match e {
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => {
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
}
WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
@@ -1632,23 +1575,24 @@ impl Tenant {
/// The GC cutoff point is determined conservatively by either `horizon` or `pitr`, whichever
/// requires more history to be retained.
//
pub(crate) async fn gc_iteration(
pub async fn gc_iteration(
&self,
target_timeline_id: Option<TimelineId>,
horizon: u64,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<GcResult, GcError> {
) -> anyhow::Result<GcResult> {
// Don't start doing work during shutdown
if let TenantState::Stopping { .. } = self.current_state() {
return Ok(GcResult::default());
}
// there is a global allowed_error for this
if !self.is_active() {
return Err(GcError::NotActive);
}
anyhow::ensure!(
self.is_active(),
"Cannot run GC iteration on inactive tenant"
);
{
let conf = self.tenant_conf.load();
@@ -2816,13 +2760,28 @@ impl Tenant {
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<GcResult, GcError> {
) -> anyhow::Result<GcResult> {
let mut totals: GcResult = Default::default();
let now = Instant::now();
let gc_timelines = self
let gc_timelines = match self
.refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
.await?;
.await
{
Ok(result) => result,
Err(e) => {
if let Some(PageReconstructError::Cancelled) =
e.downcast_ref::<PageReconstructError>()
{
// Handle cancellation
totals.elapsed = now.elapsed();
return Ok(totals);
} else {
// Propagate other errors
return Err(e);
}
}
};
failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
@@ -2847,19 +2806,7 @@ impl Tenant {
// made.
break;
}
let result = match timeline.gc().await {
Err(GcError::TimelineCancelled) => {
if target_timeline_id.is_some() {
// If we were targeting this specific timeline, surface cancellation to caller
return Err(GcError::TimelineCancelled);
} else {
// A timeline may be shutting down independently of the tenant's lifecycle: we should
// skip past this and proceed to try GC on other timelines.
continue;
}
}
r => r?,
};
let result = timeline.gc().await?;
totals += result;
}
@@ -2872,11 +2819,11 @@ impl Tenant {
/// [`Tenant::get_gc_horizon`].
///
/// This is usually executed as part of periodic gc, but can now be triggered more often.
pub(crate) async fn refresh_gc_info(
pub async fn refresh_gc_info(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<Vec<Arc<Timeline>>, GcError> {
) -> anyhow::Result<Vec<Arc<Timeline>>> {
// since this method can now be called at different rates than the configured gc loop, these
// configuration values might get applied faster than they did previously, when they were
// only read from the gc task.
@@ -2897,7 +2844,7 @@ impl Tenant {
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<Vec<Arc<Timeline>>, GcError> {
) -> anyhow::Result<Vec<Arc<Timeline>>> {
// before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for
// currently visible timelines.
let timelines = self
@@ -2934,8 +2881,8 @@ impl Tenant {
}
}
if !self.is_active() || self.cancel.is_cancelled() {
return Err(GcError::TenantCancelled);
if !self.is_active() {
anyhow::bail!("shutting down");
}
// grab mutex to prevent new timelines from being created here; avoid doing long operations
@@ -2944,19 +2891,19 @@ impl Tenant {
// Scan all timelines. For each timeline, remember the timeline ID and
// the branch point where it was created.
let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = {
let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = {
let timelines = self.timelines.lock().unwrap();
let mut all_branchpoints = BTreeSet::new();
let timelines = {
let timeline_ids = {
if let Some(target_timeline_id) = target_timeline_id.as_ref() {
if timelines.get(target_timeline_id).is_none() {
return Err(GcError::TimelineNotFound);
bail!("gc target timeline does not exist")
}
};
timelines
.iter()
.map(|(_timeline_id, timeline_entry)| {
.map(|(timeline_id, timeline_entry)| {
if let Some(ancestor_timeline_id) =
&timeline_entry.get_ancestor_timeline_id()
{
@@ -2978,28 +2925,33 @@ impl Tenant {
}
}
timeline_entry.clone()
*timeline_id
})
.collect::<Vec<_>>()
};
(all_branchpoints, timelines)
(all_branchpoints, timeline_ids)
};
// Ok, we now know all the branch points.
// Update the GC information for each timeline.
let mut gc_timelines = Vec::with_capacity(timelines.len());
for timeline in timelines {
let mut gc_timelines = Vec::with_capacity(timeline_ids.len());
for timeline_id in timeline_ids {
// Timeline is known to be local and loaded.
let timeline = self
.get_timeline(timeline_id, false)
.with_context(|| format!("Timeline {timeline_id} was not found"))?;
// If target_timeline is specified, ignore all other timelines
if let Some(target_timeline_id) = target_timeline_id {
if timeline.timeline_id != target_timeline_id {
if timeline_id != target_timeline_id {
continue;
}
}
let branchpoints: Vec<Lsn> = all_branchpoints
.range((
Included((timeline.timeline_id, Lsn(0))),
Included((timeline.timeline_id, Lsn(u64::MAX))),
Included((timeline_id, Lsn(0))),
Included((timeline_id, Lsn(u64::MAX))),
))
.map(|&x| x.1)
.collect();
@@ -3007,7 +2959,7 @@ impl Tenant {
{
let mut target = timeline.gc_info.write().unwrap();
match gc_cutoffs.remove(&timeline.timeline_id) {
match gc_cutoffs.remove(&timeline_id) {
Some(cutoffs) => {
*target = GcInfo {
retain_lsns: branchpoints,
@@ -3040,53 +2992,17 @@ impl Tenant {
&self,
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
ancestor_lsn: Option<Lsn>,
start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
let tl = self
.branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx)
.branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx)
.await?;
tl.set_state(TimelineState::Active);
Ok(tl)
}
/// Helper for unit tests to branch a timeline with some pre-loaded states.
#[cfg(test)]
#[allow(clippy::too_many_arguments)]
pub async fn branch_timeline_test_with_layers(
&self,
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
ancestor_lsn: Option<Lsn>,
ctx: &RequestContext,
delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
let tline = self
.branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
.await?;
let ancestor_lsn = if let Some(ancestor_lsn) = ancestor_lsn {
ancestor_lsn
} else {
tline.get_last_record_lsn()
};
assert!(end_lsn >= ancestor_lsn);
tline.force_advance_lsn(end_lsn);
for deltas in delta_layer_desc {
tline
.force_create_delta_layer(deltas, Some(ancestor_lsn), ctx)
.await?;
}
for (lsn, images) in image_layer_desc {
tline
.force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx)
.await?;
}
Ok(tline)
}
/// Branch an existing timeline.
///
/// The caller is responsible for activating the returned timeline.
@@ -4238,7 +4154,7 @@ mod tests {
.await?;
writer.finish_write(lsn);
}
tline.freeze_and_flush().await.map_err(|e| e.into())
tline.freeze_and_flush().await
}
#[tokio::test]
@@ -4392,10 +4308,9 @@ mod tests {
// This needs to traverse to the parent, and fails.
let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
assert!(err.to_string().starts_with(&format!(
"Bad state on timeline {}: Broken",
tline.timeline_id
)));
assert!(err
.to_string()
.contains("will not become active. Current state: Broken"));
Ok(())
}
@@ -6290,36 +6205,75 @@ mod tests {
async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
let cancel = CancellationToken::new();
let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // delta layers
vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
Lsn(0x20), // it's fine not to advance the LSN to 0x30 while using 0x30 in the gets below, because `get_vectored_impl` does not wait for the LSN
)
.await?;
let mut lsn = Lsn(0x20);
{
let mut writer = tline.writer().await;
writer
.put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx)
.await?;
writer.finish_write(lsn);
drop(writer);
tline.freeze_and_flush().await?; // this will create a image layer
}
let child = tenant
.branch_timeline_test_with_layers(
&tline,
NEW_TIMELINE_ID,
Some(Lsn(0x20)),
&ctx,
Vec::new(), // delta layers
vec![(Lsn(0x30), vec![(base_key_child, test_img("data key 2"))])], // image layers
Lsn(0x30),
)
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
.await
.unwrap();
lsn.0 += 0x10;
{
let mut writer = child.writer().await;
writer
.put(
base_key_child,
lsn,
&Value::Image(test_img("data key 2")),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
child.freeze_and_flush().await?; // this will create a delta
{
// update the partitioning to include the test key space, otherwise the keys
// will be dropped by image layer creation
let mut guard = child.partitioning.lock().await;
let ((partitioning, _), partition_lsn) = &mut *guard;
partitioning
.parts
.push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key
*partition_lsn = lsn;
}
child
.compact(
&cancel,
{
let mut set = EnumSet::empty();
set.insert(CompactFlags::ForceImageLayerCreation);
set
},
&ctx,
)
.await?; // force create an image layer for the keys, TODO: check if the image layer is created
}
async fn get_vectored_impl_wrapper(
tline: &Arc<Timeline>,
key: Key,
@@ -6341,8 +6295,6 @@ mod tests {
}))
}
let lsn = Lsn(0x30);
// test vectored get on parent timeline
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
@@ -6380,42 +6332,94 @@ mod tests {
#[tokio::test]
async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
let (tenant, ctx) = harness.load().await;
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // delta layers
vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
Lsn(0x20), // it's fine not to advance the LSN to 0x30 while using 0x30 in the gets below, because `get_vectored_impl` does not wait for the LSN
)
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
let cancel = CancellationToken::new();
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
base_key.field1 = AUX_KEY_PREFIX;
base_key_child.field1 = AUX_KEY_PREFIX;
base_key_nonexist.field1 = AUX_KEY_PREFIX;
let mut lsn = Lsn(0x20);
{
let mut writer = tline.writer().await;
writer
.put(
base_key,
lsn,
&Value::Image(test_img("metadata key 1")),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
tline.freeze_and_flush().await?; // this will create an image layer
tline
.compact(
&cancel,
{
let mut set = EnumSet::empty();
set.insert(CompactFlags::ForceImageLayerCreation);
set.insert(CompactFlags::ForceRepartition);
set
},
&ctx,
)
.await?; // force create an image layer for metadata keys
tenant
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.await?;
}
let child = tenant
.branch_timeline_test_with_layers(
&tline,
NEW_TIMELINE_ID,
Some(Lsn(0x20)),
&ctx,
Vec::new(), // delta layers
vec![(
Lsn(0x30),
vec![(base_key_child, test_img("metadata key 2"))],
)], // image layers
Lsn(0x30),
)
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
.await
.unwrap();
lsn.0 += 0x10;
{
let mut writer = child.writer().await;
writer
.put(
base_key_child,
lsn,
&Value::Image(test_img("metadata key 2")),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
child.freeze_and_flush().await?;
child
.compact(
&cancel,
{
let mut set = EnumSet::empty();
set.insert(CompactFlags::ForceImageLayerCreation);
set.insert(CompactFlags::ForceRepartition);
set
},
&ctx,
)
.await?; // force create an image layer for metadata keys
tenant
.gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.await?;
}
async fn get_vectored_impl_wrapper(
tline: &Arc<Timeline>,
key: Key,
@@ -6437,8 +6441,6 @@ mod tests {
}))
}
let lsn = Lsn(0x30);
// test vectored get on parent timeline
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,

View File

@@ -40,8 +40,6 @@ pub mod defaults {
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
super::CompactionAlgorithm::Legacy;
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
@@ -554,7 +552,13 @@ impl Default for TenantConf {
.expect("cannot parse default compaction period"),
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
compaction_algorithm: CompactionAlgorithmSettings {
kind: DEFAULT_COMPACTION_ALGORITHM,
kind: if cfg!(test) {
// Rust tests rely on a valid implicit default (TODO: fix this)
CompactionAlgorithm::Legacy
} else {
// Python tests are subject to NotSpecified handling
CompactionAlgorithm::NotSpecified
},
},
gc_horizon: DEFAULT_GC_HORIZON,
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)

View File

@@ -45,7 +45,7 @@ use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::inmemory_layer;
use crate::tenant::timeline::ShutdownMode;
use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
use utils::crashsafe::path_with_suffix_extension;
@@ -2833,13 +2833,7 @@ pub(crate) async fn immediate_gc(
}
}
result.map_err(|e| match e {
GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
GcError::TimelineNotFound => {
ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
}
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
})
result.map_err(ApiError::InternalServerError)
}
#[cfg(test)]

View File

@@ -187,7 +187,6 @@ impl SecondaryTenant {
};
let now = SystemTime::now();
tracing::info!("Evicting secondary layer");
let this = self.clone();

View File

@@ -909,7 +909,6 @@ impl<'a> TenantDownloader<'a> {
strftime(&layer.access_time),
strftime(evicted_at)
);
self.skip_layer(layer);
continue;
}
}
@@ -964,15 +963,6 @@ impl<'a> TenantDownloader<'a> {
Ok(())
}
/// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics
fn skip_layer(&self, layer: HeatMapLayer) {
let mut progress = self.secondary_state.progress.lock().unwrap();
progress.layers_total = progress.layers_total.saturating_sub(1);
progress.bytes_total = progress
.bytes_total
.saturating_sub(layer.metadata.file_size);
}
async fn download_layer(
&self,
tenant_shard_id: &TenantShardId,
@@ -1022,7 +1012,13 @@ impl<'a> TenantDownloader<'a> {
"Skipped downloading missing layer {}, raced with compaction/gc?",
layer.name
);
self.skip_layer(layer);
// If the layer is 404, adjust the progress statistics to reflect that we will not download it.
let mut progress = self.secondary_state.progress.lock().unwrap();
progress.layers_total = progress.layers_total.saturating_sub(1);
progress.bytes_total = progress
.bytes_total
.saturating_sub(layer.metadata.file_size);
return Ok(None);
}

View File

@@ -318,7 +318,7 @@ pub(crate) struct LayerFringe {
#[derive(Debug)]
struct LayerKeyspace {
layer: ReadableLayer,
target_keyspace: Vec<KeySpace>,
target_keyspace: KeySpace,
}
impl LayerFringe {
@@ -336,7 +336,6 @@ impl LayerFringe {
};
let removed = self.layers.remove_entry(&read_desc.layer_id);
match removed {
Some((
_,
@@ -344,15 +343,7 @@ impl LayerFringe {
layer,
target_keyspace,
},
)) => {
let mut keyspace = KeySpaceRandomAccum::new();
for ks in target_keyspace {
for part in ks.ranges {
keyspace.add_range(part);
}
}
Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range))
}
)) => Some((layer, target_keyspace, read_desc.lsn_range)),
None => unreachable!("fringe internals are always consistent"),
}
}
@@ -367,7 +358,7 @@ impl LayerFringe {
let entry = self.layers.entry(layer_id.clone());
match entry {
Entry::Occupied(mut entry) => {
entry.get_mut().target_keyspace.push(keyspace);
entry.get_mut().target_keyspace.merge(&keyspace);
}
Entry::Vacant(entry) => {
self.planned_reads_by_lsn.push(ReadDesc {
@@ -376,7 +367,7 @@ impl LayerFringe {
});
entry.insert(LayerKeyspace {
layer,
target_keyspace: vec![keyspace],
target_keyspace: keyspace,
});
}
}

View File

@@ -366,10 +366,7 @@ impl Layer {
.0
.get_or_maybe_download(true, Some(ctx))
.await
.map_err(|err| match err {
DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
other => GetVectoredError::Other(anyhow::anyhow!(other)),
})?;
.map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?;
self.0
.access_stats
@@ -1161,11 +1158,6 @@ impl LayerInner {
let consecutive_failures =
1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
if timeline.cancel.is_cancelled() {
// If we're shutting down, drop out before logging the error
return Err(e);
}
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
let backoff = utils::backoff::exponential_backoff_duration_seconds(

View File

@@ -380,28 +380,21 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let res = tenant
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
.await;
match res {
Ok(_) => {
error_run_count = 0;
period
}
Err(crate::tenant::GcError::TenantCancelled) => {
return;
}
Err(e) => {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count + 1,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
error!(
if let Err(e) = res {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count + 1,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
error!(
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
);
wait_duration
}
wait_duration
} else {
error_run_count = 0;
period
}
};

View File

@@ -131,17 +131,14 @@ use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
use super::{config::TenantConf, storage_layer::VectoredValueReconstructState};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
use super::{
secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
GcError,
};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(crate) enum FlushLoopState {
pub(super) enum FlushLoopState {
NotStarted,
Running {
#[cfg(test)]
@@ -499,11 +496,15 @@ pub(crate) enum PageReconstructError {
Other(#[from] anyhow::Error),
#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(WaitLsnError),
AncestorLsnTimeout(#[from] WaitLsnError),
#[error("timeline shutting down")]
Cancelled,
/// The ancestor of this is being stopped
#[error("ancestor timeline {0} is being stopped")]
AncestorStopping(TimelineId),
/// An error happened replaying WAL records
#[error(transparent)]
WalRedo(anyhow::Error),
@@ -568,7 +569,7 @@ impl PageReconstructError {
match self {
Other(_) => false,
AncestorLsnTimeout(_) => false,
Cancelled => true,
Cancelled | AncestorStopping(_) => true,
WalRedo(_) => false,
MissingKey { .. } => false,
}
@@ -576,7 +577,7 @@ impl PageReconstructError {
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum CreateImageLayersError {
enum CreateImageLayersError {
#[error("timeline shutting down")]
Cancelled,
@@ -590,35 +591,17 @@ pub(crate) enum CreateImageLayersError {
Other(#[from] anyhow::Error),
}
#[derive(thiserror::Error, Debug, Clone)]
pub(crate) enum FlushLayerError {
#[derive(thiserror::Error, Debug)]
enum FlushLayerError {
/// Timeline cancellation token was cancelled
#[error("timeline shutting down")]
Cancelled,
/// We tried to flush a layer while the Timeline is in an unexpected state
#[error("cannot flush frozen layers when flush_loop is not running, state is {0:?}")]
NotRunning(FlushLoopState),
// Arc<>-wrap the following non-cloneable error types: we must be Clone because the flush error
// is propagated from the flush loop via a watch channel, where we can only borrow it.
#[error(transparent)]
CreateImageLayersError(Arc<CreateImageLayersError>),
CreateImageLayersError(CreateImageLayersError),
#[error(transparent)]
Other(#[from] Arc<anyhow::Error>),
}
impl FlushLayerError {
// When crossing from generic anyhow errors to this error type, we explicitly check
// for timeline cancellation to avoid logging inoffensive shutdown errors as warn/err.
fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self {
if timeline.cancel.is_cancelled() {
Self::Cancelled
} else {
Self::Other(Arc::new(err))
}
}
Other(#[from] anyhow::Error),
}
#[derive(thiserror::Error, Debug)]
@@ -644,17 +627,17 @@ pub(crate) enum GetVectoredError {
#[derive(thiserror::Error, Debug)]
pub(crate) enum GetReadyAncestorError {
#[error("ancestor timeline {0} is being stopped")]
AncestorStopping(TimelineId),
#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(#[from] WaitLsnError),
#[error("Bad state on timeline {timeline_id}: {state:?}")]
BadState {
timeline_id: TimelineId,
state: TimelineState,
},
#[error("Cancelled")]
Cancelled,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
#[derive(Clone, Copy)]
@@ -689,8 +672,8 @@ pub(crate) enum WaitLsnError {
Shutdown,
// Called on a timeline that is not in an active state or is shutting down
#[error("Bad timeline state: {0:?}")]
BadState(TimelineState),
#[error("Bad state (not active)")]
BadState,
// Timeout expired while waiting for LSN to catch up with goal.
#[error("{0}")]
@@ -713,7 +696,7 @@ impl From<CreateImageLayersError> for FlushLayerError {
fn from(e: CreateImageLayersError) -> Self {
match e {
CreateImageLayersError::Cancelled => FlushLayerError::Cancelled,
any => FlushLayerError::CreateImageLayersError(Arc::new(any)),
any => FlushLayerError::CreateImageLayersError(any),
}
}
}
@@ -753,9 +736,10 @@ impl From<GetReadyAncestorError> for PageReconstructError {
fn from(e: GetReadyAncestorError) -> Self {
use GetReadyAncestorError::*;
match e {
AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid),
AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err),
bad_state @ BadState { .. } => PageReconstructError::Other(anyhow::anyhow!(bad_state)),
Cancelled => PageReconstructError::Cancelled,
Other(other) => PageReconstructError::Other(other),
}
}
}
@@ -1187,7 +1171,9 @@ impl Timeline {
use PageReconstructError::*;
match block {
Err(Cancelled) => return Err(GetVectoredError::Cancelled),
Err(Cancelled | AncestorStopping(_)) => {
return Err(GetVectoredError::Cancelled)
}
Err(MissingKey(_))
if NON_INHERITED_RANGE.contains(&key)
|| NON_INHERITED_SPARSE_RANGE.contains(&key) =>
@@ -1462,11 +1448,10 @@ impl Timeline {
who_is_waiting: WaitLsnWaiter<'_>,
ctx: &RequestContext, /* Prepare for use by cancellation */
) -> Result<(), WaitLsnError> {
let state = self.current_state();
if self.cancel.is_cancelled() || matches!(state, TimelineState::Stopping) {
if self.cancel.is_cancelled() {
return Err(WaitLsnError::Shutdown);
} else if !matches!(state, TimelineState::Active) {
return Err(WaitLsnError::BadState(state));
} else if !self.is_active() {
return Err(WaitLsnError::BadState);
}
if cfg!(debug_assertions) {
@@ -1562,13 +1547,13 @@ impl Timeline {
/// Flush to disk all data that was written with the put_* functions
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> {
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
self.freeze_and_flush0().await
}
// This exists to provide a non-span creating version of `freeze_and_flush` we can call without
// polluting the span hierarchy.
pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> {
let to_lsn = self.freeze_inmem_layer(false).await;
self.flush_frozen_layers_and_wait(to_lsn).await
}
@@ -1715,6 +1700,9 @@ impl Timeline {
}
match self.get_compaction_algorithm_settings().kind {
CompactionAlgorithm::NotSpecified => {
unreachable!("should panic earlier when we construct the default tenant conf")
}
CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
}
@@ -2750,6 +2738,11 @@ impl Timeline {
self.current_logical_size.initialized.add_permits(1);
}
enum BackgroundCalculationError {
Cancelled,
Other(anyhow::Error),
}
let try_once = |attempt: usize| {
let background_ctx = &background_ctx;
let self_ref = &self;
@@ -2767,10 +2760,10 @@ impl Timeline {
(Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
}
_ = self_ref.cancel.cancelled() => {
return Err(CalculateLogicalSizeError::Cancelled);
return Err(BackgroundCalculationError::Cancelled);
}
_ = cancel.cancelled() => {
return Err(CalculateLogicalSizeError::Cancelled);
return Err(BackgroundCalculationError::Cancelled);
},
() = skip_concurrency_limiter.cancelled() => {
// Some action that is part of an end-user interaction requested logical size
@@ -2797,7 +2790,18 @@ impl Timeline {
.await
{
Ok(calculated_size) => Ok((calculated_size, metrics_guard)),
Err(e) => Err(e),
Err(CalculateLogicalSizeError::Cancelled) => {
Err(BackgroundCalculationError::Cancelled)
}
Err(CalculateLogicalSizeError::Other(err)) => {
if let Some(PageReconstructError::AncestorStopping(_)) =
err.root_cause().downcast_ref()
{
Err(BackgroundCalculationError::Cancelled)
} else {
Err(BackgroundCalculationError::Other(err))
}
}
}
}
};
@@ -2809,11 +2813,8 @@ impl Timeline {
match try_once(attempt).await {
Ok(res) => return ControlFlow::Continue(res),
Err(CalculateLogicalSizeError::Cancelled) => return ControlFlow::Break(()),
Err(
e @ (CalculateLogicalSizeError::Decode(_)
| CalculateLogicalSizeError::PageRead(_)),
) => {
Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()),
Err(BackgroundCalculationError::Other(e)) => {
warn!(attempt, "initial size calculation failed: {e:?}");
// exponential back-off doesn't make sense at these long intervals;
// use fixed retry interval with generous jitter instead
@@ -3190,21 +3191,17 @@ impl Timeline {
}
// Recurse into ancestor if needed
if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() {
if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
trace!(
"going into ancestor {}, cont_lsn is {}",
timeline.ancestor_lsn,
cont_lsn
);
if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
trace!(
"going into ancestor {}, cont_lsn is {}",
timeline.ancestor_lsn,
cont_lsn
);
timeline_owned = timeline
.get_ready_ancestor_timeline(ancestor_timeline, ctx)
.await?;
timeline = &*timeline_owned;
prev_lsn = None;
continue 'outer;
}
timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
timeline = &*timeline_owned;
prev_lsn = None;
continue 'outer;
}
let guard = timeline.layers.read().await;
@@ -3353,10 +3350,10 @@ impl Timeline {
break None;
}
let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else {
// Not fully retrieved but no ancestor timeline.
// Not fully retrieved but no ancestor timeline.
if timeline.ancestor_timeline.is_none() {
break Some(keyspace);
};
}
// Now we check whether any keys are covered by the image layer's range but do not exist in
// the image layer itself, which means those keys do not exist.
@@ -3376,7 +3373,7 @@ impl Timeline {
// Take the min to avoid reconstructing a page with data newer than request Lsn.
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
timeline_owned = timeline
.get_ready_ancestor_timeline(ancestor_timeline, ctx)
.get_ready_ancestor_timeline(ctx)
.await
.map_err(GetVectoredError::GetReadyAncestorError)?;
timeline = &*timeline_owned;
@@ -3548,9 +3545,13 @@ impl Timeline {
async fn get_ready_ancestor_timeline(
&self,
ancestor: &Arc<Timeline>,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, GetReadyAncestorError> {
let ancestor = match self.get_ancestor_timeline() {
Ok(timeline) => timeline,
Err(e) => return Err(GetReadyAncestorError::from(e)),
};
// It's possible that the ancestor timeline isn't active yet, or
// is active but hasn't yet caught up to the branch point. Wait
// for it.
@@ -3578,14 +3579,16 @@ impl Timeline {
match ancestor.wait_to_become_active(ctx).await {
Ok(()) => {}
Err(TimelineState::Stopping) => {
// If an ancestor is stopping, it means the tenant is stopping: handle this the same as if this timeline was stopping.
return Err(GetReadyAncestorError::Cancelled);
return Err(GetReadyAncestorError::AncestorStopping(
ancestor.timeline_id,
));
}
Err(state) => {
return Err(GetReadyAncestorError::BadState {
timeline_id: ancestor.timeline_id,
state,
});
return Err(GetReadyAncestorError::Other(anyhow::anyhow!(
"Timeline {} will not become active. Current state: {:?}",
ancestor.timeline_id,
&state,
)));
}
}
ancestor
@@ -3594,17 +3597,21 @@ impl Timeline {
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled,
WaitLsnError::BadState(state) => GetReadyAncestorError::BadState {
timeline_id: ancestor.timeline_id,
state,
},
e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)),
})?;
Ok(ancestor.clone())
Ok(ancestor)
}
pub(crate) fn get_ancestor_timeline(&self) -> Option<Arc<Timeline>> {
self.ancestor_timeline.clone()
pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
format!(
"Ancestor is missing. Timeline id: {} Ancestor id {:?}",
self.timeline_id,
self.get_ancestor_timeline_id(),
)
})?;
Ok(Arc::clone(ancestor))
}
pub(crate) fn get_shard_identity(&self) -> &ShardIdentity {
@@ -3713,9 +3720,7 @@ impl Timeline {
return;
}
err @ Err(
FlushLayerError::NotRunning(_)
| FlushLayerError::Other(_)
| FlushLayerError::CreateImageLayersError(_),
FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
) => {
error!("could not flush frozen layer: {err:?}");
break err.map(|_| ());
@@ -3761,10 +3766,7 @@ impl Timeline {
/// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
/// it means no data will be written between the top of the highest frozen layer and to_lsn,
/// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
async fn flush_frozen_layers_and_wait(
&self,
last_record_lsn: Lsn,
) -> Result<(), FlushLayerError> {
async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> {
let mut rx = self.layer_flush_done_tx.subscribe();
// Increment the flush cycle counter and wake up the flush task.
@@ -3775,7 +3777,7 @@ impl Timeline {
let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
if !matches!(flush_loop_state, FlushLoopState::Running { .. }) {
return Err(FlushLayerError::NotRunning(flush_loop_state));
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
}
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
@@ -3788,11 +3790,14 @@ impl Timeline {
{
let (last_result_counter, last_result) = &*rx.borrow();
if *last_result_counter >= my_flush_request {
if let Err(err) = last_result {
if let Err(_err) = last_result {
// We already logged the original error in
// flush_loop. We cannot propagate it to the caller
// here, because it might not be Cloneable
return Err(err.clone());
anyhow::bail!(
"Could not flush frozen layer. Request id: {}",
my_flush_request
);
} else {
return Ok(());
}
@@ -3801,7 +3806,7 @@ impl Timeline {
trace!("waiting for flush to complete");
tokio::select! {
rx_e = rx.changed() => {
rx_e.map_err(|_| FlushLayerError::NotRunning(*self.flush_loop_state.lock().unwrap()))?;
rx_e?;
},
// Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring
// the notification from [`flush_loop`] that it completed.
@@ -3873,8 +3878,7 @@ impl Timeline {
EnumSet::empty(),
ctx,
)
.await
.map_err(|e| FlushLayerError::from_anyhow(self, e))?;
.await?;
if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
@@ -3898,8 +3902,7 @@ impl Timeline {
Some(metadata_keyspace.0.ranges[0].clone()),
ctx,
)
.await
.map_err(|e| FlushLayerError::from_anyhow(self, e))?
.await?
} else {
None
};
@@ -3926,11 +3929,7 @@ impl Timeline {
// Normal case, write out an L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
// We will remove frozen layer and add delta layer in one atomic operation later.
let Some(layer) = self
.create_delta_layer(&frozen_layer, None, ctx)
.await
.map_err(|e| FlushLayerError::from_anyhow(self, e))?
else {
let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else {
panic!("delta layer cannot be empty if no filter is applied");
};
(
@@ -3963,8 +3962,7 @@ impl Timeline {
if self.set_disk_consistent_lsn(disk_consistent_lsn) {
// Schedule remote uploads that will reflect our new disk_consistent_lsn
self.schedule_uploads(disk_consistent_lsn, layers_to_upload)
.map_err(|e| FlushLayerError::from_anyhow(self, e))?;
self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
}
// release lock on 'layers'
};
@@ -4840,7 +4838,7 @@ impl Timeline {
/// Currently, we don't make any attempt at removing unneeded page versions
/// within a layer file. We can only remove the whole file if it's fully
/// obsolete.
pub(super) async fn gc(&self) -> Result<GcResult, GcError> {
pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
// this is most likely the background tasks, but it might be the spawned task from
// immediate_gc
let _g = tokio::select! {
@@ -4853,7 +4851,7 @@ impl Timeline {
// Is the timeline being deleted?
if self.is_stopping() {
return Err(GcError::TimelineCancelled);
anyhow::bail!("timeline is Stopping");
}
let (horizon_cutoff, pitr_cutoff, retain_lsns) = {
@@ -4911,7 +4909,7 @@ impl Timeline {
pitr_cutoff: Lsn,
retain_lsns: Vec<Lsn>,
new_gc_cutoff: Lsn,
) -> Result<GcResult, GcError> {
) -> anyhow::Result<GcResult> {
// FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc
let now = SystemTime::now();
@@ -4933,15 +4931,12 @@ impl Timeline {
// The GC cutoff should only ever move forwards.
let waitlist = {
let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
if *write_guard > new_gc_cutoff {
return Err(GcError::BadLsn {
why: format!(
"Cannot move GC cutoff LSN backwards (was {}, new {})",
*write_guard, new_gc_cutoff
),
});
}
ensure!(
*write_guard <= new_gc_cutoff,
"Cannot move GC cutoff LSN backwards (was {}, new {})",
*write_guard,
new_gc_cutoff
);
write_guard.store_and_unlock(new_gc_cutoff)
};
waitlist.wait().await;
@@ -5050,14 +5045,7 @@ impl Timeline {
// This also unconditionally schedules an index_part.json update, even though we will
// be doing one a bit later with the unlinked gc'd layers.
let disk_consistent_lsn = self.disk_consistent_lsn.load();
self.schedule_uploads(disk_consistent_lsn, None)
.map_err(|e| {
if self.cancel.is_cancelled() {
GcError::TimelineCancelled
} else {
GcError::Remote(e)
}
})?;
self.schedule_uploads(disk_consistent_lsn, None)?;
let gc_layers = layers_to_remove
.iter()
@@ -5066,15 +5054,7 @@ impl Timeline {
result.layers_removed = gc_layers.len() as u64;
self.remote_client
.schedule_gc_update(&gc_layers)
.map_err(|e| {
if self.cancel.is_cancelled() {
GcError::TimelineCancelled
} else {
GcError::Remote(e)
}
})?;
self.remote_client.schedule_gc_update(&gc_layers)?;
guard.finish_gc_timeline(&gc_layers);
@@ -5089,7 +5069,7 @@ impl Timeline {
result.layers_removed, new_gc_cutoff
);
result.elapsed = now.elapsed().unwrap_or(Duration::ZERO);
result.elapsed = now.elapsed()?;
Ok(result)
}
@@ -5381,102 +5361,6 @@ impl Timeline {
shard_count: self.tenant_shard_id.shard_count,
}
}
#[cfg(test)]
pub(super) fn force_advance_lsn(self: &Arc<Timeline>, new_lsn: Lsn) {
self.last_record_lsn.advance(new_lsn);
}
/// Force create an image layer and place it into the layer map.
///
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
#[cfg(test)]
pub(super) async fn force_create_image_layer(
self: &Arc<Timeline>,
lsn: Lsn,
mut images: Vec<(Key, Bytes)>,
check_start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let last_record_lsn = self.get_last_record_lsn();
assert!(
lsn <= last_record_lsn,
"advance last record lsn before inserting a layer, lsn={lsn}, last_record_lsn={last_record_lsn}"
);
if let Some(check_start_lsn) = check_start_lsn {
assert!(lsn >= check_start_lsn);
}
images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb));
let min_key = *images.first().map(|(k, _)| k).unwrap();
let max_key = images.last().map(|(k, _)| k).unwrap().next();
let mut image_layer_writer = ImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
&(min_key..max_key),
lsn,
ctx,
)
.await?;
for (key, img) in images {
image_layer_writer.put_image(key, img, ctx).await?;
}
let image_layer = image_layer_writer.finish(self, ctx).await?;
{
let mut guard = self.layers.write().await;
guard.force_insert_layer(image_layer);
}
Ok(())
}
/// Force create a delta layer and place it into the layer map.
///
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
#[cfg(test)]
pub(super) async fn force_create_delta_layer(
self: &Arc<Timeline>,
mut deltas: Vec<(Key, Lsn, Value)>,
check_start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let last_record_lsn = self.get_last_record_lsn();
deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
let min_key = *deltas.first().map(|(k, _, _)| k).unwrap();
let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next();
let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
let max_lsn = Lsn(deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap().0 + 1);
assert!(
max_lsn <= last_record_lsn,
"advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}"
);
if let Some(check_start_lsn) = check_start_lsn {
assert!(min_lsn >= check_start_lsn);
}
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
min_key,
min_lsn..max_lsn,
ctx,
)
.await?;
for (key, lsn, val) in deltas {
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
}
let delta_layer = delta_layer_writer.finish(max_key, self, ctx).await?;
{
let mut guard = self.layers.write().await;
guard.force_insert_layer(delta_layer);
}
Ok(())
}
}
type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);

View File

@@ -1,6 +1,6 @@
use std::sync::Arc;
use super::{layer_manager::LayerManager, FlushLayerError, Timeline};
use super::{layer_manager::LayerManager, Timeline};
use crate::{
context::{DownloadBehavior, RequestContext},
task_mgr::TaskKind,
@@ -23,7 +23,7 @@ pub(crate) enum Error {
#[error("shutting down, please retry later")]
ShuttingDown,
#[error("flushing failed")]
FlushAncestor(#[source] FlushLayerError),
FlushAncestor(#[source] anyhow::Error),
#[error("layer download failed")]
RewrittenDeltaDownloadFailed(#[source] anyhow::Error),
#[error("copying LSN prefix locally failed")]

View File

@@ -255,13 +255,6 @@ impl LayerManager {
updates.flush()
}
#[cfg(test)]
pub(crate) fn force_insert_layer(&mut self, layer: ResidentLayer) {
let mut updates = self.layer_map.batch_update();
Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
updates.flush()
}
/// Helper function to insert a layer into the layer map and file manager.
fn insert_historic_layer(
layer: Layer,

View File

@@ -344,21 +344,21 @@ macro_rules! with_file {
impl VirtualFile {
/// Open a file in read-only mode. Like File::open.
pub async fn open<P: AsRef<Utf8Path>>(
path: P,
pub async fn open(
path: &Utf8Path,
ctx: &RequestContext,
) -> Result<VirtualFile, std::io::Error> {
Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await
Self::open_with_options(path, OpenOptions::new().read(true), ctx).await
}
/// Create a new file for writing. If the file exists, it will be truncated.
/// Like File::create.
pub async fn create<P: AsRef<Utf8Path>>(
path: P,
pub async fn create(
path: &Utf8Path,
ctx: &RequestContext,
) -> Result<VirtualFile, std::io::Error> {
Self::open_with_options(
path.as_ref(),
path,
OpenOptions::new().write(true).create(true).truncate(true),
ctx,
)
@@ -370,13 +370,12 @@ impl VirtualFile {
/// Note: If any custom flags were set in 'open_options' through OpenOptionsExt,
/// they will be applied also when the file is subsequently re-opened, not only
/// on the first time. Make sure that's sane!
pub async fn open_with_options<P: AsRef<Utf8Path>>(
path: P,
pub async fn open_with_options(
path: &Utf8Path,
open_options: &OpenOptions,
_ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
) -> Result<VirtualFile, std::io::Error> {
let path_ref = path.as_ref();
let path_str = path_ref.to_string();
let path_str = path.to_string();
let parts = path_str.split('/').collect::<Vec<&str>>();
let (tenant_id, shard_id, timeline_id) =
if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
@@ -402,7 +401,7 @@ impl VirtualFile {
// where our caller doesn't get to use the returned VirtualFile before its
// slot gets re-used by someone else.
let file = observe_duration!(StorageIoOperation::Open, {
open_options.open(path_ref.as_std_path()).await?
open_options.open(path.as_std_path()).await?
});
// Strip all options other than read and write.
@@ -418,7 +417,7 @@ impl VirtualFile {
let vfile = VirtualFile {
handle: RwLock::new(handle),
pos: 0,
path: path_ref.to_path_buf(),
path: path.to_path_buf(),
open_options: reopen_options,
tenant_id,
shard_id,
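With the generic parameter gone, callers pass a `&Utf8Path` directly; a minimal hypothetical call site (the path literal is a placeholder, and `ctx` is whatever RequestContext the surrounding task already carries):

use camino::Utf8Path;

// Sketch of a caller under the new signature; returns the opened VirtualFile or the io::Error.
async fn open_for_read(ctx: &RequestContext) -> Result<VirtualFile, std::io::Error> {
    let path = Utf8Path::new("/data/pageserver/some_file");
    VirtualFile::open(path, ctx).await
}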

View File

@@ -184,8 +184,8 @@ NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, Ti
}
else if (state->wre_errno == ENOENT)
{
nwr_log(LOG, "local read at %X/%X len %zu failed as segment file doesn't exist, attempting remote",
LSN_FORMAT_ARGS(startptr), count);
nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote",
LSN_FORMAT_ARGS(startptr));
return NeonWALReadRemote(state, buf, startptr, count, tli);
}
else
@@ -614,7 +614,6 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
uint32 startoff;
int segbytes;
int readbytes;
XLogSegNo lastRemovedSegNo;
startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
@@ -690,23 +689,6 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
return false;
}
/*
* Recheck that the segment hasn't been removed while we were reading
* it.
*/
lastRemovedSegNo = XLogGetLastRemovedSegno();
if (state->seg.ws_segno <= lastRemovedSegNo)
{
char fname[MAXFNAMELEN];
state->wre_errno = ENOENT;
XLogFileName(fname, tli, state->seg.ws_segno, state->segcxt.ws_segsize);
snprintf(state->err_msg, sizeof(state->err_msg), "WAL segment %s has been removed during the read, lastRemovedSegNo " UINT64_FORMAT,
fname, lastRemovedSegNo);
return false;
}
/* Update state for read */
recptr += readbytes;
nbytes -= readbytes;

View File

@@ -38,7 +38,6 @@ hmac.workspace = true
hostname.workspace = true
http.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
@@ -83,7 +82,6 @@ thiserror.workspace = true
tikv-jemallocator.workspace = true
tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
tokio-postgres.workspace = true
tokio-postgres-rustls.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
tokio = { workspace = true, features = ["signal"] }
@@ -96,8 +94,10 @@ url.workspace = true
urlencoding.workspace = true
utils.workspace = true
uuid.workspace = true
rustls-native-certs.workspace = true
webpki-roots.workspace = true
x509-parser.workspace = true
native-tls.workspace = true
postgres-native-tls.workspace = true
postgres-protocol.workspace = true
redis.workspace = true

View File

@@ -35,7 +35,7 @@ use crate::{
},
stream, url,
};
use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName};
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
pub enum MaybeOwned<'a, T> {

View File

@@ -100,7 +100,6 @@ pub(super) async fn authenticate(
.dbname(&db_info.dbname)
.user(&db_info.user);
ctx.set_dbname(db_info.dbname.into());
ctx.set_user(db_info.user.into());
ctx.set_project(db_info.aux.clone());
info!("woken up a compute node");

View File

@@ -11,6 +11,7 @@ use crate::{
};
use itertools::Itertools;
use pq_proto::StartupMessageParams;
use smol_str::SmolStr;
use std::{collections::HashSet, net::IpAddr, str::FromStr};
use thiserror::Error;
use tracing::{info, warn};
@@ -95,6 +96,13 @@ impl ComputeUserInfoMaybeEndpoint {
let get_param = |key| params.get(key).ok_or(MissingKey(key));
let user: RoleName = get_param("user")?.into();
// record the values if we have them
ctx.set_application(params.get("application_name").map(SmolStr::from));
ctx.set_user(user.clone());
if let Some(dbname) = params.get("database") {
ctx.set_dbname(dbname.into());
}
// Project name might be passed via PG's command-line options.
let endpoint_option = params
.options_raw()

View File

@@ -557,14 +557,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let config::ConcurrencyLockOptions {
shards,
limiter,
permits,
epoch,
timeout,
} = args.wake_compute_lock.parse()?;
info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)");
let locks = Box::leak(Box::new(console::locks::ApiLocks::new(
"wake_compute_lock",
limiter,
permits,
shards,
timeout,
epoch,
@@ -603,19 +603,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let config::ConcurrencyLockOptions {
shards,
limiter,
permits,
epoch,
timeout,
} = args.connect_compute_lock.parse()?;
info!(
?limiter,
shards,
?epoch,
"Using NodeLocks (connect_compute)"
);
info!(permits, shards, ?epoch, "Using NodeLocks (connect_compute)");
let connect_compute_locks = console::locks::ApiLocks::new(
"connect_compute_lock",
limiter,
permits,
shards,
timeout,
epoch,

View File

@@ -10,14 +10,11 @@ use crate::{
};
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
use once_cell::sync::OnceCell;
use pq_proto::StartupMessageParams;
use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError};
use std::{io, net::SocketAddr, sync::Arc, time::Duration};
use std::{io, net::SocketAddr, time::Duration};
use thiserror::Error;
use tokio::net::TcpStream;
use tokio_postgres::tls::MakeTlsConnect;
use tokio_postgres_rustls::MakeRustlsConnect;
use tracing::{error, info, warn};
const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
@@ -33,7 +30,7 @@ pub enum ConnectionError {
CouldNotConnect(#[from] io::Error),
#[error("{COULD_NOT_CONNECT}: {0}")]
TlsError(#[from] InvalidDnsNameError),
TlsError(#[from] native_tls::Error),
#[error("{COULD_NOT_CONNECT}: {0}")]
WakeComputeError(#[from] WakeComputeError),
@@ -260,7 +257,7 @@ pub struct PostgresConnection {
/// Socket connected to a compute node.
pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<
tokio::net::TcpStream,
tokio_postgres_rustls::RustlsStream<tokio::net::TcpStream>,
postgres_native_tls::TlsStream<tokio::net::TcpStream>,
>,
/// PostgreSQL connection parameters.
pub params: std::collections::HashMap<String, String>,
@@ -285,23 +282,12 @@ impl ConnCfg {
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
drop(pause);
let client_config = if allow_self_signed_compute {
// Allow all certificates for creating the connection
let verifier = Arc::new(AcceptEverythingVerifier) as Arc<dyn ServerCertVerifier>;
rustls::ClientConfig::builder()
.dangerous()
.with_custom_certificate_verifier(verifier)
} else {
let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone();
rustls::ClientConfig::builder().with_root_certificates(root_store)
};
let client_config = client_config.with_no_client_auth();
let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
let tls = <MakeRustlsConnect as MakeTlsConnect<tokio::net::TcpStream>>::make_tls_connect(
&mut mk_tls,
host,
)?;
let tls_connector = native_tls::TlsConnector::builder()
.danger_accept_invalid_certs(allow_self_signed_compute)
.build()
.unwrap();
let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector);
let tls = MakeTlsConnect::<tokio::net::TcpStream>::make_tls_connect(&mut mk_tls, host)?;
// connect_raw() will not use TLS if sslmode is "disable"
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
@@ -354,58 +340,6 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
Some(options)
}
fn load_certs() -> Result<Arc<rustls::RootCertStore>, io::Error> {
let der_certs = rustls_native_certs::load_native_certs()?;
let mut store = rustls::RootCertStore::empty();
store.add_parsable_certificates(der_certs);
Ok(Arc::new(store))
}
static TLS_ROOTS: OnceCell<Arc<rustls::RootCertStore>> = OnceCell::new();
#[derive(Debug)]
struct AcceptEverythingVerifier;
impl ServerCertVerifier for AcceptEverythingVerifier {
fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
use rustls::SignatureScheme::*;
// The schemes for which `SignatureScheme::supported_in_tls13` returns true.
vec![
ECDSA_NISTP521_SHA512,
ECDSA_NISTP384_SHA384,
ECDSA_NISTP256_SHA256,
RSA_PSS_SHA512,
RSA_PSS_SHA384,
RSA_PSS_SHA256,
ED25519,
]
}
fn verify_server_cert(
&self,
_end_entity: &rustls::pki_types::CertificateDer<'_>,
_intermediates: &[rustls::pki_types::CertificateDer<'_>],
_server_name: &rustls::pki_types::ServerName<'_>,
_ocsp_response: &[u8],
_now: rustls::pki_types::UnixTime,
) -> Result<rustls::client::danger::ServerCertVerified, rustls::Error> {
Ok(rustls::client::danger::ServerCertVerified::assertion())
}
fn verify_tls12_signature(
&self,
_message: &[u8],
_cert: &rustls::pki_types::CertificateDer<'_>,
_dss: &rustls::DigitallySignedStruct,
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
Ok(rustls::client::danger::HandshakeSignatureValid::assertion())
}
fn verify_tls13_signature(
&self,
_message: &[u8],
_cert: &rustls::pki_types::CertificateDer<'_>,
_dss: &rustls::DigitallySignedStruct,
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
Ok(rustls::client::danger::HandshakeSignatureValid::assertion())
}
}
#[cfg(test)]
mod tests {
use super::*;
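The rustls-specific verifier and root-store plumbing above are replaced by the inline native-tls connector; the same pattern in isolation, as a minimal sketch against a plain connection string (host, user, and sslmode are placeholders), assuming the native-tls and postgres-native-tls crates added earlier in this diff:

use postgres_native_tls::MakeTlsConnector;

async fn connect_with_native_tls(allow_self_signed: bool) -> Result<(), Box<dyn std::error::Error>> {
    let connector = native_tls::TlsConnector::builder()
        .danger_accept_invalid_certs(allow_self_signed)
        .build()?;
    let tls = MakeTlsConnector::new(connector);
    // tokio_postgres drives the TLS handshake through the connector when sslmode requires it.
    let (_client, connection) =
        tokio_postgres::connect("host=localhost user=postgres sslmode=require", tls).await?;
    // The connection object performs the actual I/O; spawn it off as usual.
    tokio::spawn(connection);
    Ok(())
}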

View File

@@ -1,7 +1,7 @@
use crate::{
auth::{self, backend::AuthRateLimiter},
console::locks::ApiLocks,
rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig},
rate_limiter::RateBucketInfo,
scram::threadpool::ThreadPool,
serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
Host,
@@ -580,18 +580,14 @@ impl RetryConfig {
}
/// Helper for cmdline cache options parsing.
#[derive(serde::Deserialize)]
pub struct ConcurrencyLockOptions {
/// The number of shards the lock map should have
pub shards: usize,
/// The number of allowed concurrent requests for each endpoint
#[serde(flatten)]
pub limiter: RateLimiterConfig,
pub permits: usize,
/// Garbage collection epoch
#[serde(deserialize_with = "humantime_serde::deserialize")]
pub epoch: Duration,
/// Lock timeout
#[serde(deserialize_with = "humantime_serde::deserialize")]
pub timeout: Duration,
}
@@ -600,18 +596,13 @@ impl ConcurrencyLockOptions {
pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0";
/// Default options for [`crate::console::provider::ApiLocks`].
pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str =
"shards=64,permits=100,epoch=10m,timeout=10ms";
"shards=64,permits=10,epoch=10m,timeout=10ms";
// pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s";
/// Parse lock options passed via cmdline.
/// Example: [`Self::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK`].
fn parse(options: &str) -> anyhow::Result<Self> {
let options = options.trim();
if options.starts_with('{') && options.ends_with('}') {
return Ok(serde_json::from_str(options)?);
}
let mut shards = None;
let mut permits = None;
let mut epoch = None;
@@ -638,13 +629,9 @@ impl ConcurrencyLockOptions {
shards = Some(2);
}
let permits = permits.context("missing `permits`")?;
let out = Self {
shards: shards.context("missing `shards`")?,
limiter: RateLimiterConfig {
algorithm: RateLimitAlgorithm::Fixed,
initial_limit: permits,
},
permits: permits.context("missing `permits`")?,
epoch: epoch.context("missing `epoch`")?,
timeout: timeout.context("missing `timeout`")?,
};
@@ -670,8 +657,6 @@ impl FromStr for ConcurrencyLockOptions {
#[cfg(test)]
mod tests {
use crate::rate_limiter::Aimd;
use super::*;
#[test]
@@ -699,68 +684,36 @@ mod tests {
fn test_parse_lock_options() -> anyhow::Result<()> {
let ConcurrencyLockOptions {
epoch,
limiter,
permits,
shards,
timeout,
} = "shards=32,permits=4,epoch=10m,timeout=1s".parse()?;
assert_eq!(epoch, Duration::from_secs(10 * 60));
assert_eq!(timeout, Duration::from_secs(1));
assert_eq!(shards, 32);
assert_eq!(limiter.initial_limit, 4);
assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed);
assert_eq!(permits, 4);
let ConcurrencyLockOptions {
epoch,
limiter,
permits,
shards,
timeout,
} = "epoch=60s,shards=16,timeout=100ms,permits=8".parse()?;
assert_eq!(epoch, Duration::from_secs(60));
assert_eq!(timeout, Duration::from_millis(100));
assert_eq!(shards, 16);
assert_eq!(limiter.initial_limit, 8);
assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed);
assert_eq!(permits, 8);
let ConcurrencyLockOptions {
epoch,
limiter,
permits,
shards,
timeout,
} = "permits=0".parse()?;
assert_eq!(epoch, Duration::ZERO);
assert_eq!(timeout, Duration::ZERO);
assert_eq!(shards, 2);
assert_eq!(limiter.initial_limit, 0);
assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed);
Ok(())
}
#[test]
fn test_parse_json_lock_options() -> anyhow::Result<()> {
let ConcurrencyLockOptions {
epoch,
limiter,
shards,
timeout,
} = r#"{"shards":32,"initial_limit":44,"aimd":{"min":5,"max":500,"inc":10,"dec":0.9,"utilisation":0.8},"epoch":"10m","timeout":"1s"}"#
.parse()?;
assert_eq!(epoch, Duration::from_secs(10 * 60));
assert_eq!(timeout, Duration::from_secs(1));
assert_eq!(shards, 32);
assert_eq!(limiter.initial_limit, 44);
assert_eq!(
limiter.algorithm,
RateLimitAlgorithm::Aimd {
conf: Aimd {
min: 5,
max: 500,
dec: 0.9,
inc: 10,
utilisation: 0.8
}
},
);
assert_eq!(permits, 0);
Ok(())
}

View File

@@ -15,11 +15,11 @@ use crate::{
error::ReportableError,
intern::ProjectIdInt,
metrics::ApiLockMetrics,
rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token},
scram, EndpointCacheKey,
};
use dashmap::DashMap;
use std::{hash::Hash, sync::Arc, time::Duration};
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use tokio::time::Instant;
use tracing::info;
@@ -443,8 +443,8 @@ impl ApiCaches {
/// Various caches for [`console`](super).
pub struct ApiLocks<K> {
name: &'static str,
node_locks: DashMap<K, Arc<DynamicLimiter>>,
config: RateLimiterConfig,
node_locks: DashMap<K, Arc<Semaphore>>,
permits: usize,
timeout: Duration,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
@@ -452,6 +452,8 @@ pub struct ApiLocks<K> {
#[derive(Debug, thiserror::Error)]
pub enum ApiLockError {
#[error("lock was closed")]
AcquireError(#[from] tokio::sync::AcquireError),
#[error("permit could not be acquired")]
TimeoutError(#[from] tokio::time::error::Elapsed),
}
@@ -459,6 +461,7 @@ pub enum ApiLockError {
impl ReportableError for ApiLockError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiLockError::AcquireError(_) => crate::error::ErrorKind::Service,
ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit,
}
}
@@ -467,7 +470,7 @@ impl ReportableError for ApiLockError {
impl<K: Hash + Eq + Clone> ApiLocks<K> {
pub fn new(
name: &'static str,
config: RateLimiterConfig,
permits: usize,
shards: usize,
timeout: Duration,
epoch: std::time::Duration,
@@ -476,7 +479,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
Ok(Self {
name,
node_locks: DashMap::with_shard_amount(shards),
config,
permits,
timeout,
epoch,
metrics,
@@ -484,10 +487,8 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
}
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {
if self.config.initial_limit == 0 {
return Ok(WakeComputePermit {
permit: Token::disabled(),
});
if self.permits == 0 {
return Ok(WakeComputePermit { permit: None });
}
let now = Instant::now();
let semaphore = {
@@ -499,22 +500,24 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
.entry(key.clone())
.or_insert_with(|| {
self.metrics.semaphores_registered.inc();
DynamicLimiter::new(self.config)
Arc::new(Semaphore::new(self.permits))
})
.clone()
}
};
let permit = semaphore.acquire_deadline(now + self.timeout).await;
let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await;
self.metrics
.semaphore_acquire_seconds
.observe(now.elapsed().as_secs_f64());
Ok(WakeComputePermit { permit: permit? })
Ok(WakeComputePermit {
permit: Some(permit??),
})
}
pub async fn garbage_collect_worker(&self) {
if self.config.initial_limit == 0 {
if self.permits == 0 {
return;
}
let mut interval =
@@ -544,21 +547,12 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
}
pub struct WakeComputePermit {
permit: Token,
// None if the lock is disabled
permit: Option<OwnedSemaphorePermit>,
}
impl WakeComputePermit {
pub fn should_check_cache(&self) -> bool {
!self.permit.is_disabled()
}
pub fn release(self, outcome: Outcome) {
self.permit.release(outcome)
}
pub fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
match res {
Ok(_) => self.release(Outcome::Success),
Err(_) => self.release(Outcome::Overload),
}
res
self.permit.is_some()
}
}
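Stripped of the ApiLocks bookkeeping, the new permit path is a plain tokio semaphore acquired under a deadline; a self-contained sketch of that pattern:

use std::sync::Arc;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use tokio::time::{timeout_at, Duration, Instant};

async fn acquire_with_deadline(
    sem: Arc<Semaphore>,
    wait: Duration,
) -> Result<OwnedSemaphorePermit, Box<dyn std::error::Error + Send + Sync>> {
    let deadline = Instant::now() + wait;
    // The outer `?` is the timeout (Elapsed), the inner `?` is a closed semaphore (AcquireError).
    let permit = timeout_at(deadline, sem.acquire_owned()).await??;
    Ok(permit)
}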

View File

@@ -13,7 +13,7 @@ use crate::{
http,
metrics::{CacheOutcome, Metrics},
rate_limiter::EndpointRateLimiter,
scram, EndpointCacheKey,
scram, EndpointCacheKey, Normalize,
};
use crate::{cache::Cached, context::RequestMonitoring};
use futures::TryFutureExt;
@@ -281,6 +281,14 @@ impl super::Api for Api {
return Ok(cached);
}
// check rate limit
if !self
.wake_compute_endpoint_rate_limiter
.check(user_info.endpoint.normalize().into(), 1)
{
return Err(WakeComputeError::TooManyConnections);
}
let permit = self.locks.get_permit(&key).await?;
// after getting back a permit - it's possible the cache was filled
@@ -293,16 +301,7 @@ impl super::Api for Api {
}
}
// check rate limit
if !self
.wake_compute_endpoint_rate_limiter
.check(user_info.endpoint.normalize_intern(), 1)
{
info!(key = &*key, "found cached compute node info");
return Err(WakeComputeError::TooManyConnections);
}
let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?;
let mut node = self.do_wake_compute(ctx, user_info).await?;
ctx.set_project(node.aux.clone());
let cold_start_info = node.aux.cold_start_info;
info!("woken up a compute node");

View File

@@ -2,7 +2,6 @@
use chrono::Utc;
use once_cell::sync::OnceCell;
use pq_proto::StartupMessageParams;
use smol_str::SmolStr;
use std::net::IpAddr;
use tokio::sync::mpsc;
@@ -47,7 +46,6 @@ pub struct RequestMonitoring {
pub(crate) auth_method: Option<AuthMethod>,
success: bool,
pub(crate) cold_start_info: ColdStartInfo,
pg_options: Option<StartupMessageParams>,
// extra
// This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -104,7 +102,6 @@ impl RequestMonitoring {
success: false,
rejected: None,
cold_start_info: ColdStartInfo::Unknown,
pg_options: None,
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()),
@@ -135,18 +132,6 @@ impl RequestMonitoring {
self.latency_timer.cold_start_info(info);
}
pub fn set_db_options(&mut self, options: StartupMessageParams) {
self.set_application(options.get("application_name").map(SmolStr::from));
if let Some(user) = options.get("user") {
self.set_user(user.into());
}
if let Some(dbname) = options.get("database") {
self.set_dbname(dbname.into());
}
self.pg_options = Some(options);
}
pub fn set_project(&mut self, x: MetricsAuxInfo) {
if self.endpoint_id.is_none() {
self.set_endpoint_id(x.endpoint_id.as_str().into())
@@ -170,10 +155,8 @@ impl RequestMonitoring {
}
}
fn set_application(&mut self, app: Option<SmolStr>) {
if let Some(app) = app {
self.application = Some(app);
}
pub fn set_application(&mut self, app: Option<SmolStr>) {
self.application = app.or_else(|| self.application.clone());
}
pub fn set_dbname(&mut self, dbname: DbName) {

View File

@@ -13,9 +13,7 @@ use parquet::{
},
record::RecordWriter,
};
use pq_proto::StartupMessageParams;
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
use serde::ser::SerializeMap;
use tokio::{sync::mpsc, time};
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, Span};
@@ -89,7 +87,6 @@ pub struct RequestData {
database: Option<String>,
project: Option<String>,
branch: Option<String>,
pg_options: Option<String>,
auth_method: Option<&'static str>,
error: Option<&'static str>,
/// Success is counted if we form a HTTP response with sql rows inside
@@ -104,23 +101,6 @@ pub struct RequestData {
disconnect_timestamp: Option<chrono::NaiveDateTime>,
}
struct Options<'a> {
options: &'a StartupMessageParams,
}
impl<'a> serde::Serialize for Options<'a> {
fn serialize<S>(&self, s: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
let mut state = s.serialize_map(None)?;
for (k, v) in self.options.iter() {
state.serialize_entry(k, v)?;
}
state.end()
}
}
impl From<&RequestMonitoring> for RequestData {
fn from(value: &RequestMonitoring) -> Self {
Self {
@@ -133,10 +113,6 @@ impl From<&RequestMonitoring> for RequestData {
database: value.dbname.as_deref().map(String::from),
project: value.project.as_deref().map(String::from),
branch: value.branch.as_deref().map(String::from),
pg_options: value
.pg_options
.as_ref()
.and_then(|options| serde_json::to_string(&Options { options }).ok()),
auth_method: value.auth_method.as_ref().map(|x| match x {
super::AuthMethod::Web => "web",
super::AuthMethod::ScramSha256 => "scram_sha_256",
@@ -518,7 +494,6 @@ mod tests {
database: Some(hex::encode(rng.gen::<[u8; 16]>())),
project: Some(hex::encode(rng.gen::<[u8; 16]>())),
branch: Some(hex::encode(rng.gen::<[u8; 16]>())),
pg_options: None,
auth_method: None,
protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
region: "us-east-1",
@@ -595,15 +570,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1315874, 3, 6000),
(1315867, 3, 6000),
(1315927, 3, 6000),
(1315884, 3, 6000),
(1316014, 3, 6000),
(1315856, 3, 6000),
(1315648, 3, 6000),
(1315884, 3, 6000),
(438913, 1, 2000)
(1315314, 3, 6000),
(1315307, 3, 6000),
(1315367, 3, 6000),
(1315324, 3, 6000),
(1315454, 3, 6000),
(1315296, 3, 6000),
(1315088, 3, 6000),
(1315324, 3, 6000),
(438713, 1, 2000)
]
);
@@ -633,11 +608,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1223214, 5, 10000),
(1229364, 5, 10000),
(1231158, 5, 10000),
(1230520, 5, 10000),
(1221798, 5, 10000)
(1222212, 5, 10000),
(1228362, 5, 10000),
(1230156, 5, 10000),
(1229518, 5, 10000),
(1220796, 5, 10000)
]
);
@@ -669,11 +644,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1208861, 5, 10000),
(1208592, 5, 10000),
(1208885, 5, 10000),
(1208873, 5, 10000),
(1209128, 5, 10000)
(1207859, 5, 10000),
(1207590, 5, 10000),
(1207883, 5, 10000),
(1207871, 5, 10000),
(1208126, 5, 10000)
]
);
@@ -698,15 +673,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1315874, 3, 6000),
(1315867, 3, 6000),
(1315927, 3, 6000),
(1315884, 3, 6000),
(1316014, 3, 6000),
(1315856, 3, 6000),
(1315648, 3, 6000),
(1315884, 3, 6000),
(438913, 1, 2000)
(1315314, 3, 6000),
(1315307, 3, 6000),
(1315367, 3, 6000),
(1315324, 3, 6000),
(1315454, 3, 6000),
(1315296, 3, 6000),
(1315088, 3, 6000),
(1315324, 3, 6000),
(438713, 1, 2000)
]
);
@@ -743,7 +718,7 @@ mod tests {
// files are smaller than the size threshold, but they took too long to fill so were flushed early
assert_eq!(
file_stats,
[(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)]
[(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)]
);
tmpdir.close().unwrap();

View File

@@ -3,7 +3,6 @@
use std::convert::Infallible;
use anyhow::{bail, Context};
use intern::{EndpointIdInt, EndpointIdTag, InternId};
use tokio::task::JoinError;
use tokio_util::sync::CancellationToken;
use tracing::warn;
@@ -130,22 +129,20 @@ macro_rules! smol_str_wrapper {
const POOLER_SUFFIX: &str = "-pooler";
impl EndpointId {
pub trait Normalize {
fn normalize(&self) -> Self;
}
impl<S: Clone + AsRef<str> + From<String>> Normalize for S {
fn normalize(&self) -> Self {
if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
stripped.into()
if self.as_ref().ends_with(POOLER_SUFFIX) {
let mut s = self.as_ref().to_string();
s.truncate(s.len() - POOLER_SUFFIX.len());
s.into()
} else {
self.clone()
}
}
fn normalize_intern(&self) -> EndpointIdInt {
if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
EndpointIdTag::get_interner().get_or_intern(stripped)
} else {
self.into()
}
}
}
// 90% of role name strings are 20 characters or less.
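Either form of the suffix handling yields the same result; a standalone sketch with the string type simplified to String and the endpoint names made up:

const POOLER_SUFFIX: &str = "-pooler";

// Drop the "-pooler" suffix if present; otherwise return the endpoint name unchanged.
fn normalize(endpoint: &str) -> String {
    endpoint
        .strip_suffix(POOLER_SUFFIX)
        .unwrap_or(endpoint)
        .to_string()
}

fn main() {
    assert_eq!(normalize("ep-cool-name-123456-pooler"), "ep-cool-name-123456");
    assert_eq!(normalize("ep-cool-name-123456"), "ep-cool-name-123456");
}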

View File

@@ -267,8 +267,6 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
};
drop(pause);
ctx.set_db_options(params.clone());
let hostname = mode.hostname(stream.get_ref());
let common_names = tls.map(|tls| &tls.common_names);

View File

@@ -84,8 +84,8 @@ impl ConnectMechanism for TcpMechanism<'_> {
timeout: time::Duration,
) -> Result<PostgresConnection, Self::Error> {
let host = node_info.config.get_host()?;
let permit = self.locks.get_permit(&host).await?;
permit.release_result(node_info.connect(ctx, timeout).await)
let _permit = self.locks.get_permit(&host).await?;
node_info.connect(ctx, timeout).await
}
fn update_connect_config(&self, config: &mut compute::ConnCfg) {

View File

@@ -1,6 +1,2 @@
mod limit_algorithm;
mod limiter;
pub use limit_algorithm::{
aimd::Aimd, DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token,
};
pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo};

View File

@@ -1,275 +0,0 @@
//! Algorithms for controlling concurrency limits.
use parking_lot::Mutex;
use std::{pin::pin, sync::Arc, time::Duration};
use tokio::{
sync::Notify,
time::{error::Elapsed, timeout_at, Instant},
};
use self::aimd::Aimd;
pub mod aimd;
/// Whether a job succeeded or failed as a result of congestion/overload.
///
/// Errors not considered to be caused by overload should be ignored.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Outcome {
/// The job succeeded, or failed in a way unrelated to overload.
Success,
/// The job failed because of overload, e.g. it timed out or an explicit backpressure signal
/// was observed.
Overload,
}
/// An algorithm for controlling a concurrency limit.
pub trait LimitAlgorithm: Send + Sync + 'static {
/// Update the concurrency limit in response to a new job completion.
fn update(&self, old_limit: usize, sample: Sample) -> usize;
}
/// The result of a job (or jobs), including the [`Outcome`] (loss) and latency (delay).
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
pub struct Sample {
pub(crate) latency: Duration,
/// Jobs in flight when the sample was taken.
pub(crate) in_flight: usize,
pub(crate) outcome: Outcome,
}
#[derive(Clone, Copy, Debug, Default, serde::Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum RateLimitAlgorithm {
#[default]
Fixed,
Aimd {
#[serde(flatten)]
conf: Aimd,
},
}
pub struct Fixed;
impl LimitAlgorithm for Fixed {
fn update(&self, old_limit: usize, _sample: Sample) -> usize {
old_limit
}
}
#[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)]
pub struct RateLimiterConfig {
#[serde(flatten)]
pub algorithm: RateLimitAlgorithm,
pub initial_limit: usize,
}
impl RateLimiterConfig {
pub fn create_rate_limit_algorithm(self) -> Box<dyn LimitAlgorithm> {
match self.algorithm {
RateLimitAlgorithm::Fixed => Box::new(Fixed),
RateLimitAlgorithm::Aimd { conf } => Box::new(conf),
}
}
}
pub struct LimiterInner {
alg: Box<dyn LimitAlgorithm>,
available: usize,
limit: usize,
in_flight: usize,
}
impl LimiterInner {
fn update(&mut self, latency: Duration, outcome: Option<Outcome>) {
if let Some(outcome) = outcome {
let sample = Sample {
latency,
in_flight: self.in_flight,
outcome,
};
self.limit = self.alg.update(self.limit, sample);
}
}
fn take(&mut self, ready: &Notify) -> Option<()> {
if self.available > 1 {
self.available -= 1;
self.in_flight += 1;
// tell the next in the queue that there is a permit ready
if self.available > 1 {
ready.notify_one();
}
Some(())
} else {
None
}
}
}
/// Limits the number of concurrent jobs.
///
/// Concurrency is limited through the use of [`Token`]s. Acquire a token to run a job, and release the
/// token once the job is finished.
///
/// The limit will be automatically adjusted based on observed latency (delay) and/or failures
/// caused by overload (loss).
pub struct DynamicLimiter {
config: RateLimiterConfig,
inner: Mutex<LimiterInner>,
// to notify when a token is available
ready: Notify,
}
/// A concurrency token, required to run a job.
///
/// Release the token back to the [`DynamicLimiter`] after the job is complete.
pub struct Token {
start: Instant,
limiter: Option<Arc<DynamicLimiter>>,
}
/// A snapshot of the state of the [`DynamicLimiter`].
///
/// Not guaranteed to be consistent under high concurrency.
#[derive(Debug, Clone, Copy)]
pub struct LimiterState {
limit: usize,
in_flight: usize,
}
impl DynamicLimiter {
/// Create a limiter with a given limit control algorithm.
pub fn new(config: RateLimiterConfig) -> Arc<Self> {
let ready = Notify::new();
ready.notify_one();
Arc::new(Self {
inner: Mutex::new(LimiterInner {
alg: config.create_rate_limit_algorithm(),
available: config.initial_limit,
limit: config.initial_limit,
in_flight: 0,
}),
ready,
config,
})
}
/// Try to acquire a concurrency [Token], waiting for `duration` if there are none available.
///
    /// Returns an `Elapsed` error if no token became available within `duration`.
pub async fn acquire_timeout(self: &Arc<Self>, duration: Duration) -> Result<Token, Elapsed> {
self.acquire_deadline(Instant::now() + duration).await
}
/// Try to acquire a concurrency [Token], waiting until `deadline` if there are none available.
///
    /// Returns an `Elapsed` error if no token became available by `deadline`.
pub async fn acquire_deadline(self: &Arc<Self>, deadline: Instant) -> Result<Token, Elapsed> {
if self.config.initial_limit == 0 {
// If the rate limiter is disabled, we can always acquire a token.
Ok(Token::disabled())
} else {
let mut notified = pin!(self.ready.notified());
let mut ready = notified.as_mut().enable();
loop {
let mut limit = None;
if ready {
let mut inner = self.inner.lock();
if inner.take(&self.ready).is_some() {
break Ok(Token::new(self.clone()));
}
limit = Some(inner.limit);
}
match timeout_at(deadline, notified.as_mut()).await {
Ok(()) => ready = true,
Err(e) => {
let limit = limit.unwrap_or_else(|| self.inner.lock().limit);
tracing::info!(limit, "could not acquire token in time");
break Err(e);
}
}
}
}
}
/// Return the concurrency [Token], along with the outcome of the job.
///
/// The [Outcome] of the job, and the time taken to perform it, may be used
/// to update the concurrency limit.
///
/// Set the outcome to `None` to ignore the job.
fn release_inner(&self, start: Instant, outcome: Option<Outcome>) {
tracing::info!("outcome is {:?}", outcome);
if self.config.initial_limit == 0 {
return;
}
let mut inner = self.inner.lock();
inner.update(start.elapsed(), outcome);
if inner.in_flight < inner.limit {
inner.available = inner.limit - inner.in_flight;
// At least 1 permit is now available
self.ready.notify_one();
}
inner.in_flight -= 1;
}
/// The current state of the limiter.
pub fn state(&self) -> LimiterState {
let inner = self.inner.lock();
LimiterState {
limit: inner.limit,
in_flight: inner.in_flight,
}
}
}
impl Token {
fn new(limiter: Arc<DynamicLimiter>) -> Self {
Self {
start: Instant::now(),
limiter: Some(limiter),
}
}
pub fn disabled() -> Self {
Self {
start: Instant::now(),
limiter: None,
}
}
pub fn is_disabled(&self) -> bool {
self.limiter.is_none()
}
pub fn release(mut self, outcome: Outcome) {
self.release_mut(Some(outcome))
}
pub fn release_mut(&mut self, outcome: Option<Outcome>) {
if let Some(limiter) = self.limiter.take() {
limiter.release_inner(self.start, outcome);
}
}
}
impl Drop for Token {
fn drop(&mut self) {
self.release_mut(None)
}
}
impl LimiterState {
/// The current concurrency limit.
pub fn limit(&self) -> usize {
self.limit
}
/// The number of jobs in flight.
pub fn in_flight(&self) -> usize {
self.in_flight
}
}

View File

@@ -1,184 +0,0 @@
use std::usize;
use super::{LimitAlgorithm, Outcome, Sample};
/// Loss-based congestion avoidance.
///
/// Additive-increase, multiplicative decrease.
///
/// Adds available concurrency when:
/// 1. no load-based errors are observed, and
/// 2. the utilisation of the current limit is high.
///
/// Reduces available concurrency by a factor when load-based errors are detected.
#[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)]
pub struct Aimd {
/// Minimum limit for AIMD algorithm.
pub min: usize,
/// Maximum limit for AIMD algorithm.
pub max: usize,
    /// Multiplicative factor by which the limit is decreased on overload.
pub dec: f32,
    /// Additive amount by which the limit is increased on success.
pub inc: usize,
/// A threshold below which the limit won't be increased.
pub utilisation: f32,
}
impl LimitAlgorithm for Aimd {
fn update(&self, old_limit: usize, sample: Sample) -> usize {
use Outcome::*;
match sample.outcome {
Success => {
let utilisation = sample.in_flight as f32 / old_limit as f32;
if utilisation > self.utilisation {
let limit = old_limit + self.inc;
let increased_limit = limit.clamp(self.min, self.max);
if increased_limit > old_limit {
tracing::info!(increased_limit, "limit increased");
}
increased_limit
} else {
old_limit
}
}
Overload => {
let limit = old_limit as f32 * self.dec;
// Floor instead of round, so the limit reduces even with small numbers.
// E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1
let limit = limit.floor() as usize;
limit.clamp(self.min, self.max)
}
}
}
}
#[cfg(test)]
mod tests {
use std::time::Duration;
use crate::rate_limiter::limit_algorithm::{
DynamicLimiter, RateLimitAlgorithm, RateLimiterConfig,
};
use super::*;
#[tokio::test(start_paused = true)]
async fn should_decrease_limit_on_overload() {
let config = RateLimiterConfig {
initial_limit: 10,
algorithm: RateLimitAlgorithm::Aimd {
conf: Aimd {
min: 1,
max: 1500,
inc: 10,
dec: 0.5,
utilisation: 0.8,
},
},
};
let limiter = DynamicLimiter::new(config);
let token = limiter
.acquire_timeout(Duration::from_millis(1))
.await
.unwrap();
token.release(Outcome::Overload);
assert_eq!(limiter.state().limit(), 5, "overload: decrease");
}
#[tokio::test(start_paused = true)]
async fn should_increase_limit_on_success_when_using_gt_util_threshold() {
let config = RateLimiterConfig {
initial_limit: 4,
algorithm: RateLimitAlgorithm::Aimd {
conf: Aimd {
min: 1,
max: 1500,
inc: 1,
dec: 0.5,
utilisation: 0.5,
},
},
};
let limiter = DynamicLimiter::new(config);
let token = limiter
.acquire_timeout(Duration::from_millis(1))
.await
.unwrap();
let _token = limiter
.acquire_timeout(Duration::from_millis(1))
.await
.unwrap();
let _token = limiter
.acquire_timeout(Duration::from_millis(1))
.await
.unwrap();
token.release(Outcome::Success);
assert_eq!(limiter.state().limit(), 5, "success: increase");
}
#[tokio::test(start_paused = true)]
async fn should_not_change_limit_on_success_when_using_lt_util_threshold() {
let config = RateLimiterConfig {
initial_limit: 4,
algorithm: RateLimitAlgorithm::Aimd {
conf: Aimd {
min: 1,
max: 1500,
inc: 10,
dec: 0.5,
utilisation: 0.5,
},
},
};
let limiter = DynamicLimiter::new(config);
let token = limiter
.acquire_timeout(Duration::from_millis(1))
.await
.unwrap();
token.release(Outcome::Success);
assert_eq!(
limiter.state().limit(),
4,
"success: ignore when < half limit"
);
}
#[tokio::test(start_paused = true)]
async fn should_not_change_limit_when_no_outcome() {
let config = RateLimiterConfig {
initial_limit: 10,
algorithm: RateLimitAlgorithm::Aimd {
conf: Aimd {
min: 1,
max: 1500,
inc: 10,
dec: 0.5,
utilisation: 0.5,
},
},
};
let limiter = DynamicLimiter::new(config);
let token = limiter
.acquire_timeout(Duration::from_millis(1))
.await
.unwrap();
drop(token);
assert_eq!(limiter.state().limit(), 10, "ignore");
}
}
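For reference, the removed AIMD policy in one worked step; a self-contained sketch mirroring the update rule above, with the parameters assumed (inc = 10, dec = 0.5, bounds 1..=1500, utilisation threshold 0.8):

// One AIMD step: multiplicative decrease on overload, additive increase when the
// current limit is well utilised, otherwise leave the limit unchanged.
fn aimd_step(limit: usize, in_flight: usize, overloaded: bool) -> usize {
    let (min, max, inc, dec, utilisation) = (1usize, 1500usize, 10usize, 0.5f32, 0.8f32);
    if overloaded {
        // Floor, so the limit still shrinks at small values.
        ((limit as f32 * dec).floor() as usize).clamp(min, max)
    } else if (in_flight as f32 / limit as f32) > utilisation {
        (limit + inc).clamp(min, max)
    } else {
        limit
    }
}

fn main() {
    assert_eq!(aimd_step(100, 90, true), 50);   // overload halves the limit
    assert_eq!(aimd_step(100, 90, false), 110); // high utilisation adds `inc`
    assert_eq!(aimd_step(100, 10, false), 100); // low utilisation leaves it alone
}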

View File

@@ -232,9 +232,9 @@ impl ConnectMechanism for TokioMechanism {
.connect_timeout(timeout);
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let res = config.connect(tokio_postgres::NoTls).await;
let (client, connection) = config.connect(tokio_postgres::NoTls).await?;
drop(pause);
let (client, connection) = permit.release_result(res)?;
drop(permit);
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
Ok(poll_client(

View File

@@ -17,7 +17,6 @@ use hyper1::http::HeaderValue;
use hyper1::Response;
use hyper1::StatusCode;
use hyper1::{HeaderMap, Request};
use pq_proto::StartupMessageParamsBuilder;
use serde_json::json;
use serde_json::Value;
use tokio::time;
@@ -193,13 +192,13 @@ fn get_conn_info(
let mut options = Option::None;
let mut params = StartupMessageParamsBuilder::default();
params.insert("user", &username);
params.insert("database", &dbname);
for (key, value) in pairs {
params.insert(&key, &value);
if key == "options" {
options = Some(NeonOptions::parse_options_raw(&value));
match &*key {
"options" => {
options = Some(NeonOptions::parse_options_raw(&value));
}
"application_name" => ctx.set_application(Some(value.into())),
_ => {}
}
}

View File

@@ -54,7 +54,6 @@ build-backend = "poetry.core.masonry.api"
exclude = [
"^vendor/",
"^target/",
"test_runner/performance/pgvector/loaddata.py",
]
check_untyped_defs = true
# Help mypy find imports when running against list of individual files.

View File

@@ -22,7 +22,8 @@ serde_with.workspace = true
workspace_hack.workspace = true
utils.workspace = true
async-stream.workspace = true
tokio-postgres-rustls.workspace = true
native-tls.workspace = true
postgres-native-tls.workspace = true
postgres_ffi.workspace = true
tokio-stream.workspace = true
tokio-postgres.workspace = true
@@ -30,9 +31,6 @@ tokio-util = { workspace = true }
futures-util.workspace = true
itertools.workspace = true
camino.workspace = true
rustls.workspace = true
rustls-native-certs.workspace = true
once_cell.workspace = true
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
chrono = { workspace = true, default-features = false, features = ["clock", "serde"] }

View File

@@ -1,8 +1,7 @@
use std::{collections::HashSet, str::FromStr, sync::Arc};
use std::{collections::HashSet, str::FromStr};
use aws_sdk_s3::Client;
use futures::stream::{StreamExt, TryStreamExt};
use once_cell::sync::OnceCell;
use pageserver_api::shard::TenantShardId;
use postgres_ffi::{XLogFileName, PG_TLI};
use serde::Serialize;
@@ -71,12 +70,9 @@ pub async fn scan_safekeeper_metadata(
"checking bucket {}, region {}, dump_db_table {}",
bucket_config.bucket, bucket_config.region, dump_db_table
);
// Use rustls (Neon requires TLS)
let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone();
let client_config = rustls::ClientConfig::builder()
.with_root_certificates(root_store)
.with_no_client_auth();
let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
// Use the native TLS implementation (Neon requires TLS)
let tls_connector =
postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap());
let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?;
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
@@ -238,11 +234,3 @@ async fn check_timeline(
is_deleted: false,
})
}
fn load_certs() -> Result<Arc<rustls::RootCertStore>, std::io::Error> {
let der_certs = rustls_native_certs::load_native_certs()?;
let mut store = rustls::RootCertStore::empty();
store.add_parsable_certificates(der_certs);
Ok(Arc::new(store))
}
static TLS_ROOTS: OnceCell<Arc<rustls::RootCertStore>> = OnceCell::new();

View File

@@ -29,12 +29,13 @@ use safekeeper::defaults::{
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
};
use safekeeper::http;
use safekeeper::remove_wal;
use safekeeper::wal_service;
use safekeeper::GlobalTimelines;
use safekeeper::SafeKeeperConf;
use safekeeper::{broker, WAL_SERVICE_RUNTIME};
use safekeeper::{control_file, BROKER_RUNTIME};
use safekeeper::{http, WAL_REMOVER_RUNTIME};
use safekeeper::{wal_backup, HTTP_RUNTIME};
use storage_broker::DEFAULT_ENDPOINT;
use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
@@ -440,6 +441,14 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
.map(|res| ("broker main".to_owned(), res));
tasks_handles.push(Box::pin(broker_task_handle));
let conf_ = conf.clone();
let wal_remover_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle())
.spawn(remove_wal::task_main(conf_))
.map(|res| ("WAL remover".to_owned(), res));
tasks_handles.push(Box::pin(wal_remover_handle));
set_build_info_metric(GIT_VERSION, BUILD_TAG);
// TODO: update tokio-stream, convert to real async Stream with

View File

@@ -2,7 +2,7 @@
use anyhow::{bail, ensure, Context, Result};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use camino::{Utf8Path, Utf8PathBuf};
use camino::Utf8PathBuf;
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
use utils::crashsafe::durable_rename;
@@ -12,9 +12,9 @@ use std::ops::Deref;
use std::path::Path;
use std::time::Instant;
use crate::control_file_upgrade::upgrade_control_file;
use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
use crate::state::TimelinePersistentState;
use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
use utils::{bin_ser::LeSer, id::TenantTimelineId};
use crate::SafeKeeperConf;
@@ -43,7 +43,7 @@ pub trait Storage: Deref<Target = TimelinePersistentState> {
pub struct FileStorage {
// save timeline dir to avoid reconstructing it every time
timeline_dir: Utf8PathBuf,
no_sync: bool,
conf: SafeKeeperConf,
/// Last state persisted to disk.
state: TimelinePersistentState,
@@ -54,12 +54,13 @@ pub struct FileStorage {
impl FileStorage {
/// Initialize storage by loading state from disk.
pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result<FileStorage> {
let timeline_dir = get_timeline_dir(conf, ttid);
let state = Self::load_control_file_from_dir(&timeline_dir)?;
let timeline_dir = conf.timeline_dir(ttid);
let state = Self::load_control_file_conf(conf, ttid)?;
Ok(FileStorage {
timeline_dir,
no_sync: conf.no_sync,
conf: conf.clone(),
state,
last_persist_at: Instant::now(),
})
@@ -73,7 +74,7 @@ impl FileStorage {
) -> Result<FileStorage> {
let store = FileStorage {
timeline_dir,
no_sync: conf.no_sync,
conf: conf.clone(),
state,
last_persist_at: Instant::now(),
};
@@ -101,9 +102,12 @@ impl FileStorage {
upgrade_control_file(buf, version)
}
/// Load control file from given directory.
pub fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result<TimelinePersistentState> {
let path = timeline_dir.join(CONTROL_FILE_NAME);
/// Load control file for given ttid at path specified by conf.
pub fn load_control_file_conf(
conf: &SafeKeeperConf,
ttid: &TenantTimelineId,
) -> Result<TimelinePersistentState> {
let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME);
Self::load_control_file(path)
}
@@ -199,7 +203,7 @@ impl Storage for FileStorage {
})?;
let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
durable_rename(&control_partial_path, &control_path, !self.no_sync).await?;
durable_rename(&control_partial_path, &control_path, !self.conf.no_sync).await?;
// update internal state
self.state = s.clone();
@@ -229,13 +233,12 @@ mod test {
conf: &SafeKeeperConf,
ttid: &TenantTimelineId,
) -> Result<(FileStorage, TimelinePersistentState)> {
let timeline_dir = get_timeline_dir(conf, ttid);
fs::create_dir_all(&timeline_dir)
fs::create_dir_all(conf.timeline_dir(ttid))
.await
.expect("failed to create timeline dir");
Ok((
FileStorage::restore_new(ttid, conf)?,
FileStorage::load_control_file_from_dir(&timeline_dir)?,
FileStorage::load_control_file_conf(conf, ttid)?,
))
}
@@ -243,11 +246,11 @@ mod test {
conf: &SafeKeeperConf,
ttid: &TenantTimelineId,
) -> Result<(FileStorage, TimelinePersistentState)> {
let timeline_dir = get_timeline_dir(conf, ttid);
fs::create_dir_all(&timeline_dir)
fs::create_dir_all(conf.timeline_dir(ttid))
.await
.expect("failed to create timeline dir");
let state = TimelinePersistentState::empty();
let timeline_dir = conf.timeline_dir(ttid);
let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?;
Ok((storage, state))
}
@@ -288,7 +291,7 @@ mod test {
.await
.expect("failed to persist state");
}
let control_path = get_timeline_dir(&conf, &ttid).join(CONTROL_FILE_NAME);
let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME);
let mut data = fs::read(&control_path).await.unwrap();
data[0] += 1; // change the first byte of the file to fail checksum validation
fs::write(&control_path, &data)

View File

@@ -15,10 +15,10 @@ use crate::{
control_file::{FileStorage, Storage},
pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline},
state::TimelinePersistentState,
timeline::{FullAccessTimeline, Timeline, TimelineError},
timeline::{Timeline, TimelineError},
wal_backup::copy_s3_segments,
wal_storage::{wal_file_paths, WalReader},
GlobalTimelines,
GlobalTimelines, SafeKeeperConf,
};
// we don't want to have more than 10 segments on disk after copy, because they take space
@@ -46,14 +46,12 @@ pub async fn handle_request(request: Request) -> Result<()> {
}
}
let source_tli = request.source.full_access_guard().await?;
let conf = &GlobalTimelines::get_global_config();
let ttid = request.destination_ttid;
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
let (mem_state, state) = source_tli.get_state().await;
let (mem_state, state) = request.source.get_state().await;
let start_lsn = state.timeline_start_lsn;
if start_lsn == Lsn::INVALID {
bail!("timeline is not initialized");
@@ -62,7 +60,7 @@ pub async fn handle_request(request: Request) -> Result<()> {
{
let commit_lsn = mem_state.commit_lsn;
let flush_lsn = source_tli.get_flush_lsn().await;
let flush_lsn = request.source.get_flush_lsn().await;
info!(
"collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}",
@@ -129,8 +127,10 @@ pub async fn handle_request(request: Request) -> Result<()> {
.await?;
copy_disk_segments(
&source_tli,
conf,
&state,
wal_seg_size,
&request.source.ttid,
new_backup_lsn,
request.until_lsn,
&tli_dir_path,
@@ -159,13 +159,21 @@ pub async fn handle_request(request: Request) -> Result<()> {
}
async fn copy_disk_segments(
tli: &FullAccessTimeline,
conf: &SafeKeeperConf,
persisted_state: &TimelinePersistentState,
wal_seg_size: usize,
source_ttid: &TenantTimelineId,
start_lsn: Lsn,
end_lsn: Lsn,
tli_dir_path: &Utf8PathBuf,
) -> Result<()> {
let mut wal_reader = tli.get_walreader(start_lsn).await?;
let mut wal_reader = WalReader::new(
conf.workdir.clone(),
conf.timeline_dir(source_ttid),
persisted_state,
start_lsn,
true,
)?;
let mut buf = [0u8; MAX_SEND_SIZE];

View File

@@ -10,7 +10,6 @@ use std::sync::Arc;
use anyhow::bail;
use anyhow::Result;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use chrono::{DateTime, Utc};
use postgres_ffi::XLogSegNo;
use postgres_ffi::MAX_SEND_SIZE;
@@ -27,8 +26,7 @@ use crate::safekeeper::TermHistory;
use crate::send_wal::WalSenderState;
use crate::state::TimelineMemState;
use crate::state::TimelinePersistentState;
use crate::timeline::get_timeline_dir;
use crate::timeline::FullAccessTimeline;
use crate::wal_storage::WalReader;
use crate::GlobalTimelines;
use crate::SafeKeeperConf;
@@ -70,7 +68,6 @@ pub struct Response {
pub struct TimelineDumpSer {
pub tli: Arc<crate::timeline::Timeline>,
pub args: Args,
pub timeline_dir: Utf8PathBuf,
pub runtime: Arc<tokio::runtime::Runtime>,
}
@@ -88,20 +85,14 @@ impl Serialize for TimelineDumpSer {
where
S: serde::Serializer,
{
let dump = self.runtime.block_on(build_from_tli_dump(
&self.tli,
&self.args,
&self.timeline_dir,
));
let dump = self
.runtime
.block_on(build_from_tli_dump(self.tli.clone(), self.args.clone()));
dump.serialize(serializer)
}
}
async fn build_from_tli_dump(
timeline: &Arc<crate::timeline::Timeline>,
args: &Args,
timeline_dir: &Utf8Path,
) -> Timeline {
async fn build_from_tli_dump(timeline: Arc<crate::timeline::Timeline>, args: Args) -> Timeline {
let control_file = if args.dump_control_file {
let mut state = timeline.get_state().await.1;
if !args.dump_term_history {
@@ -121,8 +112,7 @@ async fn build_from_tli_dump(
let disk_content = if args.dump_disk_content {
// build_disk_content can fail, but we don't want to fail the whole
// request because of that.
// Note: timeline can be in offloaded state, this is not a problem.
build_disk_content(timeline_dir).ok()
build_disk_content(&timeline.timeline_dir).ok()
} else {
None
};
@@ -196,7 +186,6 @@ pub struct FileInfo {
pub async fn build(args: Args) -> Result<Response> {
let start_time = Utc::now();
let timelines_count = GlobalTimelines::timelines_count();
let config = GlobalTimelines::get_global_config();
let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() {
// If both tenant_id and timeline_id are specified, we can just get the
@@ -234,11 +223,12 @@ pub async fn build(args: Args) -> Result<Response> {
timelines.push(TimelineDumpSer {
tli,
args: args.clone(),
timeline_dir: get_timeline_dir(&config, &ttid),
runtime: runtime.clone(),
});
}
let config = GlobalTimelines::get_global_config();
Ok(Response {
start_time,
finish_time: Utc::now(),
@@ -326,19 +316,27 @@ pub struct TimelineDigest {
}
pub async fn calculate_digest(
tli: &FullAccessTimeline,
tli: &Arc<crate::timeline::Timeline>,
request: TimelineDigestRequest,
) -> Result<TimelineDigest> {
if request.from_lsn > request.until_lsn {
bail!("from_lsn is greater than until_lsn");
}
let conf = GlobalTimelines::get_global_config();
let (_, persisted_state) = tli.get_state().await;
if persisted_state.timeline_start_lsn > request.from_lsn {
bail!("requested LSN is before the start of the timeline");
}
let mut wal_reader = tli.get_walreader(request.from_lsn).await?;
let mut wal_reader = WalReader::new(
conf.workdir.clone(),
tli.timeline_dir.clone(),
&persisted_state,
request.from_lsn,
true,
)?;
let mut hasher = Sha256::new();
let mut buf = [0u8; MAX_SEND_SIZE];

View File

@@ -85,11 +85,11 @@ impl From<TermSwitchApiEntry> for TermLsn {
}
}
/// Augment AcceptorState with last_log_term for convenience
/// Augment AcceptorState with epoch for convenience
#[derive(Debug, Serialize, Deserialize)]
pub struct AcceptorStateStatus {
pub term: Term,
pub epoch: Term, // aka last_log_term
pub epoch: Term,
pub term_history: Vec<TermSwitchApiEntry>,
}
@@ -130,7 +130,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
let (inmem, state) = tli.get_state().await;
let flush_lsn = tli.get_flush_lsn().await;
let last_log_term = state.acceptor_state.get_last_log_term(flush_lsn);
let epoch = state.acceptor_state.get_epoch(flush_lsn);
let term_history = state
.acceptor_state
.term_history
@@ -143,7 +143,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
.collect();
let acc_state = AcceptorStateStatus {
term: state.acceptor_state.term,
epoch: last_log_term,
epoch,
term_history,
};
@@ -249,10 +249,6 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
};
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let tli = tli
.full_access_guard()
.await
.map_err(ApiError::InternalServerError)?;
let response = debug_dump::calculate_digest(&tli, request)
.await
@@ -272,12 +268,8 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
let filename: String = parse_request_param(&request, "filename")?;
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let tli = tli
.full_access_guard()
.await
.map_err(ApiError::InternalServerError)?;
let filepath = tli.get_timeline_dir().join(filename);
let filepath = tli.timeline_dir.join(filename);
let mut file = File::open(&filepath)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
@@ -295,7 +287,7 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
.map_err(|e| ApiError::InternalServerError(e.into()))
}
/// Force persist control file.
/// Force persist control file and remove old WAL.
async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
@@ -305,13 +297,13 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
);
let tli = GlobalTimelines::get(ttid)?;
tli.write_shared_state()
.await
.sk
.state
.flush()
tli.maybe_persist_control_file(true)
.await
.map_err(ApiError::InternalServerError)?;
tli.remove_old_wal()
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}

View File

@@ -6,6 +6,8 @@
//! modifications in tests.
//!
use std::sync::Arc;
use anyhow::Context;
use bytes::Bytes;
use postgres_backend::QueryError;
@@ -21,7 +23,7 @@ use crate::safekeeper::{
};
use crate::safekeeper::{Term, TermHistory, TermLsn};
use crate::state::TimelinePersistentState;
use crate::timeline::FullAccessTimeline;
use crate::timeline::Timeline;
use crate::GlobalTimelines;
use postgres_backend::PostgresBackend;
use postgres_ffi::encode_logical_message;
@@ -102,8 +104,8 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
async fn prepare_safekeeper(
ttid: TenantTimelineId,
pg_version: u32,
) -> anyhow::Result<FullAccessTimeline> {
let tli = GlobalTimelines::create(
) -> anyhow::Result<Arc<Timeline>> {
GlobalTimelines::create(
ttid,
ServerInfo {
pg_version,
@@ -113,16 +115,10 @@ async fn prepare_safekeeper(
Lsn::INVALID,
Lsn::INVALID,
)
.await?;
tli.full_access_guard().await
.await
}
async fn send_proposer_elected(
tli: &FullAccessTimeline,
term: Term,
lsn: Lsn,
) -> anyhow::Result<()> {
async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
// add new term to existing history
let history = tli.get_state().await.1.acceptor_state.term_history;
let history = history.up_to(lsn.checked_sub(1u64).unwrap());
@@ -151,7 +147,7 @@ pub struct InsertedWAL {
/// Extend local WAL with new LogicalMessage record. To do that,
/// create AppendRequest with new WAL and pass it to safekeeper.
pub async fn append_logical_message(
tli: &FullAccessTimeline,
tli: &Arc<Timeline>,
msg: &AppendLogicalMessage,
) -> anyhow::Result<InsertedWAL> {
let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message);
@@ -169,7 +165,7 @@ pub async fn append_logical_message(
let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest {
h: AppendRequestHeader {
term: msg.term,
term_start_lsn: begin_lsn,
epoch_start_lsn: begin_lsn,
begin_lsn,
end_lsn,
commit_lsn,

View File

@@ -7,7 +7,10 @@ use tokio::runtime::Runtime;
use std::time::Duration;
use storage_broker::Uri;
use utils::{auth::SwappableJwtAuth, id::NodeId};
use utils::{
auth::SwappableJwtAuth,
id::{NodeId, TenantId, TenantTimelineId},
};
mod auth;
pub mod broker;
@@ -86,6 +89,15 @@ pub struct SafeKeeperConf {
}
impl SafeKeeperConf {
pub fn tenant_dir(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.workdir.join(tenant_id.to_string())
}
pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> Utf8PathBuf {
self.tenant_dir(&ttid.tenant_id)
.join(ttid.timeline_id.to_string())
}
pub fn is_wal_backup_enabled(&self) -> bool {
self.remote_storage.is_some() && self.wal_backup_enabled
}

View File

@@ -17,7 +17,7 @@ use utils::{
use crate::{
control_file, debug_dump,
http::routes::TimelineStatus,
timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError},
timeline::{Timeline, TimelineError},
wal_storage::{self, Storage},
GlobalTimelines, SafeKeeperConf,
};
@@ -283,13 +283,13 @@ pub async fn load_temp_timeline(
}
// Move timeline dir to the correct location
let timeline_path = get_timeline_dir(conf, &ttid);
let timeline_path = conf.timeline_dir(&ttid);
info!(
"moving timeline {} from {} to {}",
ttid, tmp_path, timeline_path
);
tokio::fs::create_dir_all(get_tenant_dir(conf, &ttid.tenant_id)).await?;
tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
tokio::fs::rename(tmp_path, &timeline_path).await?;
let tli = GlobalTimelines::load_timeline(&guard, ttid)

View File

@@ -6,7 +6,7 @@ use crate::handler::SafekeeperPostgresHandler;
use crate::safekeeper::AcceptorProposerMessage;
use crate::safekeeper::ProposerAcceptorMessage;
use crate::safekeeper::ServerInfo;
use crate::timeline::FullAccessTimeline;
use crate::timeline::Timeline;
use crate::wal_service::ConnectionId;
use crate::GlobalTimelines;
use anyhow::{anyhow, Context};
@@ -213,7 +213,7 @@ impl SafekeeperPostgresHandler {
&mut self,
pgb: &mut PostgresBackend<IO>,
) -> Result<(), QueryError> {
let mut tli: Option<FullAccessTimeline> = None;
let mut tli: Option<Arc<Timeline>> = None;
if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await {
// Log the result and probably send it to the client, closing the stream.
let handle_end_fut = pgb.handle_copy_stream_end(end);
@@ -233,7 +233,7 @@ impl SafekeeperPostgresHandler {
pub async fn handle_start_wal_push_guts<IO: AsyncRead + AsyncWrite + Unpin>(
&mut self,
pgb: &mut PostgresBackend<IO>,
tli: &mut Option<FullAccessTimeline>,
tli: &mut Option<Arc<Timeline>>,
) -> Result<(), CopyStreamHandlerEnd> {
// Notify the libpq client that it's allowed to send `CopyData` messages
pgb.write_message(&BeMessage::CopyBothResponse).await?;
@@ -323,7 +323,7 @@ struct NetworkReader<'a, IO> {
impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
async fn read_first_message(
&mut self,
) -> Result<(FullAccessTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
) -> Result<(Arc<Timeline>, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
// Receive information about server to create timeline, if not yet.
let next_msg = read_message(self.pgb_reader).await?;
let tli = match next_msg {
@@ -337,10 +337,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
system_id: greeting.system_id,
wal_seg_size: greeting.wal_seg_size,
};
let tli =
GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
.await?;
tli.full_access_guard().await?
GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID).await?
}
_ => {
return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!(
@@ -356,7 +353,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
msg_tx: Sender<ProposerAcceptorMessage>,
msg_rx: Receiver<ProposerAcceptorMessage>,
reply_tx: Sender<AcceptorProposerMessage>,
tli: FullAccessTimeline,
tli: Arc<Timeline>,
next_msg: ProposerAcceptorMessage,
) -> Result<(), CopyStreamHandlerEnd> {
*self.acceptor_handle = Some(WalAcceptor::spawn(
@@ -451,7 +448,7 @@ const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
/// replies to reply_tx; reading from socket and writing to disk in parallel is
/// beneficial for performance, this struct provides writing to disk part.
pub struct WalAcceptor {
tli: FullAccessTimeline,
tli: Arc<Timeline>,
msg_rx: Receiver<ProposerAcceptorMessage>,
reply_tx: Sender<AcceptorProposerMessage>,
conn_id: Option<ConnectionId>,
@@ -464,7 +461,7 @@ impl WalAcceptor {
///
/// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper.
pub fn spawn(
tli: FullAccessTimeline,
tli: Arc<Timeline>,
msg_rx: Receiver<ProposerAcceptorMessage>,
reply_tx: Sender<AcceptorProposerMessage>,
conn_id: Option<ConnectionId>,

View File

@@ -2,7 +2,7 @@
//! provide it, i.e. safekeeper lags too much.
use std::time::SystemTime;
use std::{fmt, pin::pin};
use std::{fmt, pin::pin, sync::Arc};
use anyhow::{bail, Context};
use futures::StreamExt;
@@ -21,7 +21,6 @@ use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config}
use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
use crate::safekeeper::{AppendRequest, AppendRequestHeader};
use crate::timeline::FullAccessTimeline;
use crate::{
http::routes::TimelineStatus,
receive_wal::MSG_QUEUE_SIZE,
@@ -29,14 +28,14 @@ use crate::{
AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory,
TermLsn, VoteRequest,
},
timeline::PeerInfo,
timeline::{PeerInfo, Timeline},
SafeKeeperConf,
};
/// Entrypoint for per timeline task which always runs, checking whether
/// recovery for this safekeeper is needed and starting it if so.
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) {
pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
info!("started");
let cancel = tli.cancel.clone();
@@ -48,87 +47,6 @@ pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) {
}
}
/// Should we start fetching WAL from a peer safekeeper, and if yes, from
/// which? Answer is yes, i.e. .donors is not empty if 1) there is something
/// to fetch, and we can do that without running elections; 2) there is no
/// actively streaming compute, as we don't want to compete with it.
///
/// If donor(s) are chosen, their last_log_term is guaranteed to be equal
/// to its last_log_term, so we are sure such a leader was actually elected.
///
/// All possible donors are returned so that we could keep connection to the
/// current one if it is good even if it slightly lags behind.
///
/// Note that the term conditions above might not be met while safekeepers are
/// still not aligned on last flush_lsn. Generally in this case until
/// elections are run it is not possible to say which safekeeper should
/// recover from which one -- history which would be committed is different
/// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
/// Thus we don't try to predict it here.
async fn recovery_needed(
tli: &FullAccessTimeline,
heartbeat_timeout: Duration,
) -> RecoveryNeededInfo {
let ss = tli.read_shared_state().await;
let term = ss.sk.state.acceptor_state.term;
let last_log_term = ss.sk.get_last_log_term();
let flush_lsn = ss.sk.flush_lsn();
// note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
let mut peers = ss.get_peers(heartbeat_timeout);
// Sort by <last log term, lsn> pairs.
peers.sort_by(|p1, p2| {
let tl1 = TermLsn {
term: p1.last_log_term,
lsn: p1.flush_lsn,
};
let tl2 = TermLsn {
term: p2.last_log_term,
lsn: p2.flush_lsn,
};
tl2.cmp(&tl1) // desc
});
let num_streaming_computes = tli.get_walreceivers().get_num_streaming();
let donors = if num_streaming_computes > 0 {
vec![] // If there is a streaming compute, don't try to recover to not intervene.
} else {
peers
.iter()
.filter_map(|candidate| {
// Are we interested in this candidate?
let candidate_tl = TermLsn {
term: candidate.last_log_term,
lsn: candidate.flush_lsn,
};
let my_tl = TermLsn {
term: last_log_term,
lsn: flush_lsn,
};
if my_tl < candidate_tl {
// Yes, we are interested. Can we pull from it without
// (re)running elections? It is possible if 1) his term
// is equal to his last_log_term so we could act on
// behalf of leader of this term (we must be sure he was
// ever elected) and 2) our term is not higher, or we'll refuse data.
if candidate.term == candidate.last_log_term && candidate.term >= term {
Some(Donor::from(candidate))
} else {
None
}
} else {
None
}
})
.collect()
};
RecoveryNeededInfo {
term,
last_log_term,
flush_lsn,
peers,
num_streaming_computes,
donors,
}
}
/// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and
/// fields to explain the choice.
#[derive(Debug)]
@@ -195,10 +113,10 @@ impl From<&PeerInfo> for Donor {
const CHECK_INTERVAL_MS: u64 = 2000;
/// Check regularly whether we need to start recovery.
async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) {
async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {
let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
loop {
let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
match recovery_needed_info.donors.first() {
Some(donor) => {
info!(
@@ -228,7 +146,7 @@ async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) {
/// Recover from the specified donor. Returns message explaining normal finish
/// reason or error.
async fn recover(
tli: FullAccessTimeline,
tli: Arc<Timeline>,
donor: &Donor,
conf: &SafeKeeperConf,
) -> anyhow::Result<String> {
@@ -314,7 +232,7 @@ async fn recover(
// Pull WAL from donor, assuming handshake is already done.
async fn recovery_stream(
tli: FullAccessTimeline,
tli: Arc<Timeline>,
donor: &Donor,
start_streaming_at: Lsn,
conf: &SafeKeeperConf,
@@ -398,7 +316,7 @@ async fn network_io(
physical_stream: ReplicationStream,
msg_tx: Sender<ProposerAcceptorMessage>,
donor: Donor,
tli: FullAccessTimeline,
tli: Arc<Timeline>,
conf: SafeKeeperConf,
) -> anyhow::Result<Option<String>> {
let mut physical_stream = pin!(physical_stream);
@@ -419,7 +337,7 @@ async fn network_io(
ReplicationMessage::XLogData(xlog_data) => {
let ar_hdr = AppendRequestHeader {
term: donor.term,
term_start_lsn: Lsn::INVALID, // unused
epoch_start_lsn: Lsn::INVALID, // unused
begin_lsn: Lsn(xlog_data.wal_start()),
end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
@@ -447,7 +365,7 @@ async fn network_io(
}
ReplicationMessage::PrimaryKeepAlive(_) => {
// keepalive means nothing is being streamed for a while. Check whether we need to stop.
let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
// do current donors still contain one we currently connected to?
if !recovery_needed_info
.donors

View File

@@ -1,25 +1,41 @@
use utils::lsn::Lsn;
//! Thread removing old WAL.
use crate::timeline_manager::StateSnapshot;
use std::time::Duration;
/// Get oldest LSN we still need to keep. We hold WAL till it is consumed
/// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3
/// offloading.
/// While it is safe to use inmem values for determining horizon,
/// we use persistent values so that the possible normal states are less surprising.
/// All segments covering LSNs before horizon_lsn can be removed.
pub fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option<Lsn>) -> Lsn {
use std::cmp::min;
use tokio::time::sleep;
use tracing::*;
let mut horizon_lsn = min(
state.cfile_remote_consistent_lsn,
state.cfile_peer_horizon_lsn,
);
// we don't want to remove WAL that is not yet offloaded to s3
horizon_lsn = min(horizon_lsn, state.cfile_backup_lsn);
if let Some(extra_horizon_lsn) = extra_horizon_lsn {
horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
use crate::{GlobalTimelines, SafeKeeperConf};
pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> {
let wal_removal_interval = Duration::from_millis(5000);
loop {
let now = tokio::time::Instant::now();
let tlis = GlobalTimelines::get_all();
for tli in &tlis {
let ttid = tli.ttid;
async {
if let Err(e) = tli.maybe_persist_control_file(false).await {
warn!("failed to persist control file: {e}");
}
if let Err(e) = tli.remove_old_wal().await {
error!("failed to remove WAL: {}", e);
}
}
.instrument(info_span!("WAL removal", ttid = %ttid))
.await;
}
let elapsed = now.elapsed();
let total_timelines = tlis.len();
if elapsed > wal_removal_interval {
info!(
"WAL removal is too long, processed {} timelines in {:?}",
total_timelines, elapsed
);
}
sleep(wal_removal_interval).await;
}
horizon_lsn
}
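
A worked example of the horizon computation described above, as a self-contained sketch: plain u64 values stand in for Lsn, and a 16 MiB segment size is assumed rather than taken from the control file.

// Horizon = the minimum of everything that still needs the WAL:
// pageserver progress, peer progress, s3 offload, plus an optional extra horizon.
fn calc_horizon(
    remote_consistent_lsn: u64,
    peer_horizon_lsn: u64,
    backup_lsn: u64,
    extra_horizon_lsn: Option<u64>,
) -> u64 {
    let mut horizon = remote_consistent_lsn.min(peer_horizon_lsn).min(backup_lsn);
    if let Some(extra) = extra_horizon_lsn {
        horizon = horizon.min(extra);
    }
    horizon
}

fn main() {
    let wal_seg_size: u64 = 16 * 1024 * 1024; // assumed segment size
    let horizon = calc_horizon(0x5_00_0000, 0x3_00_0000, 0x4_00_0000, None);
    // The peers are the bottleneck here; segments below segno 3 can be removed.
    assert_eq!(horizon, 0x3_00_0000);
    assert_eq!(horizon / wal_seg_size, 3);
}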

View File

@@ -10,6 +10,7 @@ use std::cmp::max;
use std::cmp::min;
use std::fmt;
use std::io::Read;
use std::time::Duration;
use storage_broker::proto::SafekeeperTimelineInfo;
use tracing::*;
@@ -187,8 +188,8 @@ pub struct AcceptorState {
}
impl AcceptorState {
/// acceptor's last_log_term is the term of the highest entry in the log
pub fn get_last_log_term(&self, flush_lsn: Lsn) -> Term {
/// acceptor's epoch is the term of the highest entry in the log
pub fn get_epoch(&self, flush_lsn: Lsn) -> Term {
let th = self.term_history.up_to(flush_lsn);
match th.0.last() {
Some(e) => e.term,
@@ -304,9 +305,9 @@ pub struct AppendRequest {
pub struct AppendRequestHeader {
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
pub term: Term,
// TODO: remove this field from the protocol, it is unused -- LSN of term
// switch can be taken from ProposerElected (as well as from term history).
pub term_start_lsn: Lsn,
// TODO: remove this field, it is unused -- LSN of term switch can be taken
// from ProposerElected (as well as from term history).
pub epoch_start_lsn: Lsn,
/// start position of message in WAL
pub begin_lsn: Lsn,
/// end position of message in WAL
@@ -325,10 +326,9 @@ pub struct AppendResponse {
// Current term of the safekeeper; if it is higher than proposer's, the
// compute is out of date.
pub term: Term,
// Flushed end of WAL on safekeeper; one should always be mindful from which
// term history this value comes, either checking history directly or
// observing term being set to one for which WAL truncation is known to have
// happened.
// NOTE: this is physical end of wal on safekeeper; currently it doesn't
// make much sense without taking epoch into account, as history can be
// diverged.
pub flush_lsn: Lsn,
// We report back our awareness about which WAL is committed, as this is
// a criterion for walproposer --sync mode exit
@@ -482,8 +482,8 @@ impl AcceptorProposerMessage {
/// - messages from broker peers
pub struct SafeKeeper<CTRL: control_file::Storage, WAL: wal_storage::Storage> {
/// LSN since the proposer safekeeper currently talking to appends WAL;
/// determines last_log_term switch point.
pub term_start_lsn: Lsn,
/// determines epoch switch point.
pub epoch_start_lsn: Lsn,
pub state: TimelineState<CTRL>, // persistent state storage
pub wal_store: WAL,
@@ -511,7 +511,7 @@ where
}
Ok(SafeKeeper {
term_start_lsn: Lsn(0),
epoch_start_lsn: Lsn(0),
state: TimelineState::new(state),
wal_store,
node_id,
@@ -531,10 +531,8 @@ where
self.state.acceptor_state.term
}
pub fn get_last_log_term(&self) -> Term {
self.state
.acceptor_state
.get_last_log_term(self.flush_lsn())
pub fn get_epoch(&self) -> Term {
self.state.acceptor_state.get_epoch(self.flush_lsn())
}
/// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet.
@@ -715,7 +713,7 @@ where
// proceed, but to prevent commit_lsn surprisingly going down we should
// either refuse the session (simpler) or skip the part we already have
// from the stream (can be implemented).
if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at {
if msg.term == self.get_epoch() && self.flush_lsn() > msg.start_streaming_at {
bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
msg.term, self.flush_lsn(), msg.start_streaming_at)
}
@@ -790,7 +788,7 @@ where
// Cache LSN where term starts to immediately fsync control file with
// commit_lsn once we reach it -- sync-safekeepers finishes when
// persisted commit_lsn on majority of safekeepers aligns.
self.term_start_lsn = match msg.term_history.0.last() {
self.epoch_start_lsn = match msg.term_history.0.last() {
None => bail!("proposer elected with empty term history"),
Some(term_lsn_start) => term_lsn_start.lsn,
};
@@ -816,17 +814,35 @@ where
self.state.inmem.commit_lsn = commit_lsn;
// If new commit_lsn reached term switch, force sync of control
// If new commit_lsn reached epoch switch, force sync of control
// file: walproposer in sync mode is very interested when this
// happens. Note: this is for sync-safekeepers mode only, as
// otherwise commit_lsn might jump over term_start_lsn.
if commit_lsn >= self.term_start_lsn && self.state.commit_lsn < self.term_start_lsn {
// otherwise commit_lsn might jump over epoch_start_lsn.
if commit_lsn >= self.epoch_start_lsn && self.state.commit_lsn < self.epoch_start_lsn {
self.state.flush().await?;
}
Ok(())
}
/// Persist control file if there is something to save and enough time
/// passed after the last save.
pub async fn maybe_persist_inmem_control_file(&mut self, force: bool) -> Result<bool> {
const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
if !force && self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
return Ok(false);
}
let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn
|| self.state.inmem.backup_lsn > self.state.backup_lsn
|| self.state.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn
|| self.state.inmem.remote_consistent_lsn > self.state.remote_consistent_lsn;
if need_persist {
self.state.flush().await?;
trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
}
Ok(need_persist)
}
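
The same throttling idea as a self-contained sketch, with a single in-memory vs. persisted commit_lsn standing in for the full control-file state (the real code compares several LSN fields and flushes through the control_file storage):

use std::time::{Duration, Instant};

const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);

struct ControlFile {
    inmem_commit_lsn: u64,
    persisted_commit_lsn: u64,
    last_persist_at: Instant,
}

impl ControlFile {
    // Persist only if forced, or if the save interval has elapsed,
    // and only when the in-memory state actually advanced.
    fn maybe_persist(&mut self, force: bool) -> bool {
        if !force && self.last_persist_at.elapsed() < CF_SAVE_INTERVAL {
            return false;
        }
        let need_persist = self.inmem_commit_lsn > self.persisted_commit_lsn;
        if need_persist {
            self.persisted_commit_lsn = self.inmem_commit_lsn; // stands in for flush()
            self.last_persist_at = Instant::now();
        }
        need_persist
    }
}

fn main() {
    let mut cf = ControlFile {
        inmem_commit_lsn: 200,
        persisted_commit_lsn: 100,
        last_persist_at: Instant::now(),
    };
    assert!(!cf.maybe_persist(false)); // interval not elapsed yet
    assert!(cf.maybe_persist(true)); // forced save persists the advance
}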
/// Handle request to append WAL.
#[allow(clippy::comparison_chain)]
async fn handle_append_request(
@@ -917,7 +933,7 @@ where
// Note: the check is too restrictive, generally we can update local
// commit_lsn if our history matches (is part of) history of advanced
// commit_lsn provider.
if sk_info.last_log_term == self.get_last_log_term() {
if sk_info.last_log_term == self.get_epoch() {
self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?;
}
}
@@ -1063,7 +1079,7 @@ mod tests {
}
#[tokio::test]
async fn test_last_log_term_switch() {
async fn test_epoch_switch() {
let storage = InMemoryState {
persisted_state: test_sk_state(),
};
@@ -1073,7 +1089,7 @@ mod tests {
let mut ar_hdr = AppendRequestHeader {
term: 1,
term_start_lsn: Lsn(3),
epoch_start_lsn: Lsn(3),
begin_lsn: Lsn(1),
end_lsn: Lsn(2),
commit_lsn: Lsn(0),
@@ -1098,14 +1114,14 @@ mod tests {
.await
.unwrap();
// check that AppendRequest before term_start_lsn doesn't switch last_log_term.
// check that AppendRequest before epochStartLsn doesn't switch epoch
let resp = sk
.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
.await;
assert!(resp.is_ok());
assert_eq!(sk.get_last_log_term(), 0);
assert_eq!(sk.get_epoch(), 0);
// but record at term_start_lsn does the switch
// but record at epochStartLsn does the switch
ar_hdr.begin_lsn = Lsn(2);
ar_hdr.end_lsn = Lsn(3);
append_request = AppendRequest {
@@ -1117,7 +1133,7 @@ mod tests {
.await;
assert!(resp.is_ok());
sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
assert_eq!(sk.get_last_log_term(), 1);
assert_eq!(sk.get_epoch(), 1);
}
#[test]

View File

@@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler;
use crate::metrics::RECEIVED_PS_FEEDBACKS;
use crate::receive_wal::WalReceivers;
use crate::safekeeper::{Term, TermLsn};
use crate::timeline::FullAccessTimeline;
use crate::timeline::Timeline;
use crate::wal_service::ConnectionId;
use crate::wal_storage::WalReader;
use crate::GlobalTimelines;
@@ -387,10 +387,8 @@ impl SafekeeperPostgresHandler {
term: Option<Term>,
) -> Result<(), QueryError> {
let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
let full_access = tli.full_access_guard().await?;
if let Err(end) = self
.handle_start_replication_guts(pgb, start_pos, term, full_access)
.handle_start_replication_guts(pgb, start_pos, term, tli.clone())
.await
{
let info = tli.get_safekeeper_info(&self.conf).await;
@@ -407,7 +405,7 @@ impl SafekeeperPostgresHandler {
pgb: &mut PostgresBackend<IO>,
start_pos: Lsn,
term: Option<Term>,
tli: FullAccessTimeline,
tli: Arc<Timeline>,
) -> Result<(), CopyStreamHandlerEnd> {
let appname = self.appname.clone();
@@ -450,7 +448,14 @@ impl SafekeeperPostgresHandler {
// switch to copy
pgb.write_message(&BeMessage::CopyBothResponse).await?;
let wal_reader = tli.get_walreader(start_pos).await?;
let (_, persisted_state) = tli.get_state().await;
let wal_reader = WalReader::new(
self.conf.workdir.clone(),
self.conf.timeline_dir(&tli.ttid),
&persisted_state,
start_pos,
self.conf.is_wal_backup_enabled(),
)?;
// Split to concurrently receive and send data; replies are generally
// not synchronized with sends, so this avoids deadlocks.
@@ -527,7 +532,7 @@ impl EndWatch {
/// A half driving sending WAL.
struct WalSender<'a, IO> {
pgb: &'a mut PostgresBackend<IO>,
tli: FullAccessTimeline,
tli: Arc<Timeline>,
appname: Option<String>,
// Position since which we are sending next chunk.
start_pos: Lsn,
@@ -736,7 +741,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
struct ReplyReader<IO> {
reader: PostgresBackendReader<IO>,
ws_guard: Arc<WalSenderGuard>,
tli: FullAccessTimeline,
tli: Arc<Timeline>,
}
impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {

View File

@@ -3,14 +3,14 @@
use anyhow::{anyhow, bail, Result};
use camino::Utf8PathBuf;
use postgres_ffi::XLogSegNo;
use serde::{Deserialize, Serialize};
use tokio::fs;
use tokio_util::sync::CancellationToken;
use utils::id::TenantId;
use std::cmp::max;
use std::ops::{Deref, DerefMut};
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
@@ -26,6 +26,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use crate::receive_wal::WalReceivers;
use crate::recovery::{recovery_main, Donor, RecoveryNeededInfo};
use crate::safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn,
INVALID_TERM,
@@ -37,8 +38,8 @@ use crate::wal_backup::{self};
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
use crate::metrics::FullTimelineInfo;
use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
use crate::{debug_dump, timeline_manager, wal_storage};
use crate::wal_storage::Storage as wal_storage_iface;
use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage};
use crate::{GlobalTimelines, SafeKeeperConf};
/// Things safekeeper should know about timeline state on peers.
@@ -168,6 +169,7 @@ pub struct SharedState {
pub(crate) sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
/// In memory list containing state of peers sent in latest messages from them.
pub(crate) peers_info: PeersInfo,
pub(crate) last_removed_segno: XLogSegNo,
}
impl SharedState {
@@ -195,33 +197,33 @@ impl SharedState {
// We don't want to write anything to disk, because we may have existing timeline there.
// These functions should not change anything on disk.
let timeline_dir = get_timeline_dir(conf, ttid);
let control_store =
control_file::FileStorage::create_new(timeline_dir.clone(), conf, state)?;
let timeline_dir = conf.timeline_dir(ttid);
let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?;
let wal_store =
wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?;
wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;
Ok(Self {
sk,
peers_info: PeersInfo(vec![]),
last_removed_segno: 0,
})
}
/// Restore SharedState from control file. If file doesn't exist, bails out.
fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result<Self> {
let timeline_dir = get_timeline_dir(conf, ttid);
let control_store = control_file::FileStorage::restore_new(ttid, conf)?;
if control_store.server.wal_seg_size == 0 {
bail!(TimelineError::UninitializedWalSegSize(*ttid));
}
let wal_store =
wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?;
wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
Ok(Self {
sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?,
peers_info: PeersInfo(vec![]),
last_removed_segno: 0,
})
}
@@ -242,7 +244,7 @@ impl SharedState {
timeline_id: ttid.timeline_id.as_ref().to_owned(),
}),
term: self.sk.state.acceptor_state.term,
last_log_term: self.sk.get_last_log_term(),
last_log_term: self.sk.get_epoch(),
flush_lsn: self.sk.flush_lsn().0,
// note: this value is not flushed to control file yet and can be lost
commit_lsn: self.sk.state.inmem.commit_lsn.0,
@@ -273,6 +275,24 @@ impl SharedState {
.cloned()
.collect()
}
/// Get oldest segno we still need to keep. We hold WAL till it is consumed
/// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3
/// offloading.
/// While it is safe to use inmem values for determining horizon,
/// we use persistent values so that the possible normal states are less surprising.
fn get_horizon_segno(&self, extra_horizon_lsn: Option<Lsn>) -> XLogSegNo {
let state = &self.sk.state;
use std::cmp::min;
let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn);
// we don't want to remove WAL that is not yet offloaded to s3
horizon_lsn = min(horizon_lsn, state.backup_lsn);
if let Some(extra_horizon_lsn) = extra_horizon_lsn {
horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
}
horizon_lsn.segment_number(state.server.wal_seg_size as usize)
}
}
#[derive(Debug, thiserror::Error)]
@@ -329,15 +349,22 @@ pub struct Timeline {
mutex: RwLock<SharedState>,
walsenders: Arc<WalSenders>,
walreceivers: Arc<WalReceivers>,
timeline_dir: Utf8PathBuf,
/// Delete/cancel will trigger this, background tasks should drop out as soon as it fires
pub(crate) cancel: CancellationToken,
/// Directory where timeline state is stored.
pub timeline_dir: Utf8PathBuf,
/// Should we keep WAL on disk for active replication connections.
/// Especially useful for sharding, when different shards process WAL
/// with different speed.
// TODO: add `Arc<SafeKeeperConf>` here instead of adding each field separately.
walsenders_keep_horizon: bool,
// timeline_manager controlled state
pub(crate) broker_active: AtomicBool,
pub(crate) wal_backup_active: AtomicBool,
pub(crate) last_removed_segno: AtomicU64,
}
impl Timeline {
@@ -367,10 +394,10 @@ impl Timeline {
walsenders: WalSenders::new(walreceivers.clone()),
walreceivers,
cancel: CancellationToken::default(),
timeline_dir: get_timeline_dir(conf, &ttid),
timeline_dir: conf.timeline_dir(&ttid),
walsenders_keep_horizon: conf.walsenders_keep_horizon,
broker_active: AtomicBool::new(false),
wal_backup_active: AtomicBool::new(false),
last_removed_segno: AtomicU64::new(0),
})
}
@@ -403,10 +430,10 @@ impl Timeline {
walsenders: WalSenders::new(walreceivers.clone()),
walreceivers,
cancel: CancellationToken::default(),
timeline_dir: get_timeline_dir(conf, &ttid),
timeline_dir: conf.timeline_dir(&ttid),
walsenders_keep_horizon: conf.walsenders_keep_horizon,
broker_active: AtomicBool::new(false),
wal_backup_active: AtomicBool::new(false),
last_removed_segno: AtomicU64::new(0),
})
}
@@ -467,6 +494,15 @@ impl Timeline {
conf.clone(),
broker_active_set,
));
// Start recovery task which always runs on the timeline.
if conf.peer_recovery_enabled {
tokio::spawn(recovery_main(self.clone(), conf.clone()));
}
// TODO: migrate to timeline_manager
if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone()));
}
}
/// Delete timeline from disk completely, by removing timeline directory.
@@ -519,6 +555,36 @@ impl Timeline {
self.mutex.read().await
}
/// Returns true if walsender should stop sending WAL to pageserver. We
/// terminate it if remote_consistent_lsn reached commit_lsn and there is no
/// computes. While there might be nothing to stream already, we learn about
/// remote_consistent_lsn update through replication feedback, and we want
/// to stop pushing to the broker if pageserver is fully caught up.
pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
if self.is_cancelled() {
return true;
}
let shared_state = self.read_shared_state().await;
if self.walreceivers.get_num() == 0 {
return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet
reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn;
}
false
}
/// Ensure that current term is t, erroring otherwise, and lock the state.
pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
let ss = self.read_shared_state().await;
if ss.sk.state.acceptor_state.term != t {
bail!(
"failed to acquire term {}, current term {}",
t,
ss.sk.state.acceptor_state.term
);
}
Ok(ss)
}
/// Returns commit_lsn watch channel.
pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver<Lsn> {
self.commit_lsn_watch_rx.clone()
@@ -534,6 +600,28 @@ impl Timeline {
self.shared_state_version_rx.clone()
}
/// Pass arrived message to the safekeeper.
pub async fn process_msg(
self: &Arc<Self>,
msg: &ProposerAcceptorMessage,
) -> Result<Option<AcceptorProposerMessage>> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
let mut rmsg: Option<AcceptorProposerMessage>;
{
let mut shared_state = self.write_shared_state().await;
rmsg = shared_state.sk.process_msg(msg).await?;
// if this is AppendResponse, fill in proper hot standby feedback.
if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
}
}
Ok(rmsg)
}
/// Returns wal_seg_size.
pub async fn get_wal_seg_size(&self) -> usize {
self.read_shared_state().await.get_wal_seg_size()
@@ -584,11 +672,97 @@ impl Timeline {
Ok(())
}
/// Update in memory remote consistent lsn.
pub async fn update_remote_consistent_lsn(self: &Arc<Self>, candidate: Lsn) {
let mut shared_state = self.write_shared_state().await;
shared_state.sk.state.inmem.remote_consistent_lsn =
max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate);
}
pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
let shared_state = self.read_shared_state().await;
shared_state.get_peers(conf.heartbeat_timeout)
}
/// Should we start fetching WAL from a peer safekeeper, and if yes, from
/// which? Answer is yes, i.e. .donors is not empty if 1) there is something
/// to fetch, and we can do that without running elections; 2) there is no
/// actively streaming compute, as we don't want to compete with it.
///
/// If donor(s) are chosen, their last_log_term is guaranteed to be equal
/// to its last_log_term, so we are sure such a leader was actually elected.
///
/// All possible donors are returned so that we could keep connection to the
/// current one if it is good even if it slightly lags behind.
///
/// Note that the term conditions above might not be met while safekeepers are
/// still not aligned on last flush_lsn. Generally in this case until
/// elections are run it is not possible to say which safekeeper should
/// recover from which one -- history which would be committed is different
/// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
/// Thus we don't try to predict it here.
pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo {
let ss = self.read_shared_state().await;
let term = ss.sk.state.acceptor_state.term;
let last_log_term = ss.sk.get_epoch();
let flush_lsn = ss.sk.flush_lsn();
// note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
let mut peers = ss.get_peers(heartbeat_timeout);
// Sort by <last log term, lsn> pairs.
peers.sort_by(|p1, p2| {
let tl1 = TermLsn {
term: p1.last_log_term,
lsn: p1.flush_lsn,
};
let tl2 = TermLsn {
term: p2.last_log_term,
lsn: p2.flush_lsn,
};
tl2.cmp(&tl1) // desc
});
let num_streaming_computes = self.walreceivers.get_num_streaming();
let donors = if num_streaming_computes > 0 {
vec![] // If there is a streaming compute, don't try to recover to not intervene.
} else {
peers
.iter()
.filter_map(|candidate| {
// Are we interested in this candidate?
let candidate_tl = TermLsn {
term: candidate.last_log_term,
lsn: candidate.flush_lsn,
};
let my_tl = TermLsn {
term: last_log_term,
lsn: flush_lsn,
};
if my_tl < candidate_tl {
// Yes, we are interested. Can we pull from it without
// (re)running elections? It is possible if 1) his term
// is equal to his last_log_term so we could act on
// behalf of leader of this term (we must be sure he was
// ever elected) and 2) our term is not higher, or we'll refuse data.
if candidate.term == candidate.last_log_term && candidate.term >= term {
Some(Donor::from(candidate))
} else {
None
}
} else {
None
}
})
.collect()
};
RecoveryNeededInfo {
term,
last_log_term,
flush_lsn,
peers,
num_streaming_computes,
donors,
}
}
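
A compact worked example of that donor selection, with (term, last_log_term, flush_lsn) tuples standing in for PeerInfo/TermLsn and all values hypothetical:

// Peers are (term, last_log_term, flush_lsn); returns donors in preference order.
fn pick_donors(
    my_term: u64,
    my_last_log_term: u64,
    my_flush_lsn: u64,
    peers: &[(u64, u64, u64)],
) -> Vec<(u64, u64, u64)> {
    let mut peers: Vec<_> = peers.to_vec();
    // Sort by (last_log_term, flush_lsn) descending, as recovery_needed does.
    peers.sort_by(|a, b| (b.1, b.2).cmp(&(a.1, a.2)));
    peers
        .into_iter()
        .filter(|&(term, last_log_term, flush_lsn)| {
            let ahead = (my_last_log_term, my_flush_lsn) < (last_log_term, flush_lsn);
            // A donor must be ahead of us, have term == last_log_term (so its
            // leader was really elected), and a term we will not refuse.
            ahead && term == last_log_term && term >= my_term
        })
        .collect()
}

fn main() {
    let peers = [(3, 3, 0x200), (2, 2, 0x400), (3, 2, 0x500)];
    // We are at term 2, last_log_term 2, flush_lsn 0x100.
    let donors = pick_donors(2, 2, 0x100, &peers);
    // (3,2,0x500) is excluded because its term != last_log_term.
    assert_eq!(donors, vec![(3, 3, 0x200), (2, 2, 0x400)]);
}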
pub fn get_walsenders(&self) -> &Arc<WalSenders> {
&self.walsenders
}
@@ -602,6 +776,58 @@ impl Timeline {
self.read_shared_state().await.sk.wal_store.flush_lsn()
}
/// Delete WAL segments from disk that are no longer needed. This is determined
/// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn.
pub async fn remove_old_wal(self: &Arc<Self>) -> Result<()> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
// If enabled, we use LSN of the most lagging walsender as a WAL removal horizon.
// This allows pageservers that are lagging behind to get better read speed,
// at the cost of keeping more WAL on disk.
let replication_horizon_lsn = if self.walsenders_keep_horizon {
self.walsenders.laggard_lsn()
} else {
None
};
let horizon_segno: XLogSegNo;
let remover = {
let shared_state = self.read_shared_state().await;
horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn);
if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
return Ok(()); // nothing to do
}
// release the lock before removing
shared_state.sk.wal_store.remove_up_to(horizon_segno - 1)
};
// delete old WAL files
remover.await?;
// update last_removed_segno
let mut shared_state = self.write_shared_state().await;
if shared_state.last_removed_segno != horizon_segno {
shared_state.last_removed_segno = horizon_segno;
} else {
shared_state.skip_update = true;
}
Ok(())
}
/// Persist control file if there is something to save and enough time
/// passed after the last save. This helps to keep remote_consistent_lsn up
/// to date so that storage nodes restart doesn't cause many pageserver ->
/// safekeeper reconnections.
pub async fn maybe_persist_control_file(self: &Arc<Self>, force: bool) -> Result<()> {
let mut guard = self.write_shared_state().await;
let changed = guard.sk.maybe_persist_inmem_control_file(force).await?;
guard.skip_update = !changed;
Ok(())
}
/// Gather timeline data for metrics.
pub async fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
if self.is_cancelled() {
@@ -617,8 +843,8 @@ impl Timeline {
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
timeline_is_active: self.broker_active.load(Ordering::Relaxed),
num_computes: self.walreceivers.get_num() as u32,
last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed),
epoch_start_lsn: state.sk.term_start_lsn,
last_removed_segno: state.last_removed_segno,
epoch_start_lsn: state.sk.epoch_start_lsn,
mem_state: state.sk.state.inmem.clone(),
persisted_state: state.sk.state.clone(),
flush_lsn: state.sk.wal_store.flush_lsn(),
@@ -640,8 +866,8 @@ impl Timeline {
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
active: self.broker_active.load(Ordering::Relaxed),
num_computes: self.walreceivers.get_num() as u32,
last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed),
epoch_start_lsn: state.sk.term_start_lsn,
last_removed_segno: state.last_removed_segno,
epoch_start_lsn: state.sk.epoch_start_lsn,
mem_state: state.sk.state.inmem.clone(),
write_lsn,
write_record_lsn,
@@ -663,110 +889,6 @@ impl Timeline {
state.sk.state.finish_change(&persistent_state).await?;
Ok(res)
}
/// Get the timeline guard for reading/writing WAL files.
/// TODO: if WAL files are not present on disk (evicted), they will be
/// downloaded from S3. Also there will logic for preventing eviction
/// while someone is holding FullAccessTimeline guard.
pub async fn full_access_guard(self: &Arc<Self>) -> Result<FullAccessTimeline> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
Ok(FullAccessTimeline { tli: self.clone() })
}
}
/// This is a guard that allows to read/write disk timeline state.
/// All tasks that are using the disk should use this guard.
#[derive(Clone)]
pub struct FullAccessTimeline {
pub tli: Arc<Timeline>,
}
impl Deref for FullAccessTimeline {
type Target = Arc<Timeline>;
fn deref(&self) -> &Self::Target {
&self.tli
}
}
impl FullAccessTimeline {
/// Returns true if walsender should stop sending WAL to pageserver. We
/// terminate it if remote_consistent_lsn reached commit_lsn and there is no
/// computes. While there might be nothing to stream already, we learn about
/// remote_consistent_lsn update through replication feedback, and we want
/// to stop pushing to the broker if pageserver is fully caught up.
pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
if self.is_cancelled() {
return true;
}
let shared_state = self.read_shared_state().await;
if self.walreceivers.get_num() == 0 {
return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet
reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn;
}
false
}
/// Ensure that current term is t, erroring otherwise, and lock the state.
pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
let ss = self.read_shared_state().await;
if ss.sk.state.acceptor_state.term != t {
bail!(
"failed to acquire term {}, current term {}",
t,
ss.sk.state.acceptor_state.term
);
}
Ok(ss)
}
/// Pass arrived message to the safekeeper.
pub async fn process_msg(
&self,
msg: &ProposerAcceptorMessage,
) -> Result<Option<AcceptorProposerMessage>> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
let mut rmsg: Option<AcceptorProposerMessage>;
{
let mut shared_state = self.write_shared_state().await;
rmsg = shared_state.sk.process_msg(msg).await?;
// if this is AppendResponse, fill in proper hot standby feedback.
if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
}
}
Ok(rmsg)
}
pub async fn get_walreader(&self, start_lsn: Lsn) -> Result<WalReader> {
let (_, persisted_state) = self.get_state().await;
let enable_remote_read = GlobalTimelines::get_global_config().is_wal_backup_enabled();
WalReader::new(
&self.ttid,
self.timeline_dir.clone(),
&persisted_state,
start_lsn,
enable_remote_read,
)
}
pub fn get_timeline_dir(&self) -> Utf8PathBuf {
self.timeline_dir.clone()
}
/// Update in memory remote consistent lsn.
pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) {
let mut shared_state = self.write_shared_state().await;
shared_state.sk.state.inmem.remote_consistent_lsn =
max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate);
}
}
/// Deletes directory and it's contents. Returns false if directory does not exist.
@@ -777,16 +899,3 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result<bool> {
Err(e) => Err(e.into()),
}
}
/// Get a path to the tenant directory. If you just need to get a timeline directory,
/// use FullAccessTimeline::get_timeline_dir instead.
pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf {
conf.workdir.join(tenant_id.to_string())
}
/// Get a path to the timeline directory. If you need to read WAL files from disk,
/// use FullAccessTimeline::get_timeline_dir instead. This function does not check
/// timeline eviction status and WAL files might not be present on disk.
pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf {
get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string())
}

View File

@@ -3,42 +3,23 @@
//! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
//! It also can manage some reactive state, like should the timeline be active for broker pushes or not.
use std::{
sync::Arc,
time::{Duration, Instant},
};
use std::{sync::Arc, time::Duration};
use postgres_ffi::XLogSegNo;
use tokio::task::{JoinError, JoinHandle};
use tracing::{info, info_span, instrument, warn, Instrument};
use tracing::{info, instrument, warn};
use utils::lsn::Lsn;
use crate::{
control_file::Storage,
metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL},
recovery::recovery_main,
remove_wal::calc_horizon_lsn,
send_wal::WalSenders,
timeline::{PeerInfo, ReadGuardSharedState, Timeline},
timelines_set::{TimelineSetGuard, TimelinesSet},
timelines_set::TimelinesSet,
wal_backup::{self, WalBackupTaskHandle},
wal_backup_partial, SafeKeeperConf,
SafeKeeperConf,
};
pub struct StateSnapshot {
// inmem values
pub commit_lsn: Lsn,
pub backup_lsn: Lsn,
pub remote_consistent_lsn: Lsn,
// persistent control file values
pub cfile_peer_horizon_lsn: Lsn,
pub cfile_remote_consistent_lsn: Lsn,
pub cfile_backup_lsn: Lsn,
// misc
pub cfile_last_persist_at: Instant,
pub inmem_flush_pending: bool,
pub peers: Vec<PeerInfo>,
}
@@ -49,34 +30,17 @@ impl StateSnapshot {
commit_lsn: read_guard.sk.state.inmem.commit_lsn,
backup_lsn: read_guard.sk.state.inmem.backup_lsn,
remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn,
cfile_peer_horizon_lsn: read_guard.sk.state.peer_horizon_lsn,
cfile_remote_consistent_lsn: read_guard.sk.state.remote_consistent_lsn,
cfile_backup_lsn: read_guard.sk.state.backup_lsn,
cfile_last_persist_at: read_guard.sk.state.pers.last_persist_at(),
inmem_flush_pending: Self::has_unflushed_inmem_state(&read_guard),
peers: read_guard.get_peers(heartbeat_timeout),
}
}
fn has_unflushed_inmem_state(read_guard: &ReadGuardSharedState) -> bool {
let state = &read_guard.sk.state;
state.inmem.commit_lsn > state.commit_lsn
|| state.inmem.backup_lsn > state.backup_lsn
|| state.inmem.peer_horizon_lsn > state.peer_horizon_lsn
|| state.inmem.remote_consistent_lsn > state.remote_consistent_lsn
}
}
/// Control how often the manager task should wake up to check updates.
/// There is no need to check for updates more often than this.
const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
/// How often to save the control file if there is no other activity.
const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
/// This task gets spawned alongside each timeline and is responsible for managing the timeline's
/// background tasks.
/// Be careful, this task is not respawned on panic, so it should not panic.
#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))]
pub async fn main_task(
tli: Arc<Timeline>,
@@ -91,50 +55,20 @@ pub async fn main_task(
}
};
// configuration & dependencies
// sets whether timeline is active for broker pushes or not
let mut tli_broker_active = broker_active_set.guard(tli.clone());
let ttid = tli.ttid;
let wal_seg_size = tli.get_wal_seg_size().await;
let heartbeat_timeout = conf.heartbeat_timeout;
let walsenders = tli.get_walsenders();
let walreceivers = tli.get_walreceivers();
// current state
let mut state_version_rx = tli.get_state_version_rx();
let walreceivers = tli.get_walreceivers();
let mut num_computes_rx = walreceivers.get_num_rx();
let mut tli_broker_active = broker_active_set.guard(tli.clone());
let mut last_removed_segno = 0 as XLogSegNo;
// list of background tasks
let mut backup_task: Option<WalBackupTaskHandle> = None;
let mut recovery_task: Option<JoinHandle<()>> = None;
let mut partial_backup_task: Option<JoinHandle<()>> = None;
let mut wal_removal_task: Option<JoinHandle<anyhow::Result<u64>>> = None;
// Start recovery task which always runs on the timeline.
if conf.peer_recovery_enabled {
match tli.full_access_guard().await {
Ok(tli) => {
recovery_task = Some(tokio::spawn(recovery_main(tli, conf.clone())));
}
Err(e) => {
warn!("failed to start recovery task: {:?}", e);
}
}
}
// Start partial backup task which always runs on the timeline.
if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
match tli.full_access_guard().await {
Ok(tli) => {
partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task(
tli,
conf.clone(),
)));
}
Err(e) => {
warn!("failed to start partial backup task: {:?}", e);
}
}
}
let last_state = 'outer: loop {
MANAGER_ITERATIONS_TOTAL.inc();
@@ -142,36 +76,47 @@ pub async fn main_task(
let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout);
let num_computes = *num_computes_rx.borrow();
let is_wal_backup_required = update_backup(
&conf,
&tli,
wal_seg_size,
num_computes,
&state_snapshot,
&mut backup_task,
)
.await;
let is_wal_backup_required =
wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot);
let _is_active = update_is_active(
is_wal_backup_required,
num_computes,
&state_snapshot,
&mut tli_broker_active,
&tli,
);
if conf.is_wal_backup_enabled() {
wal_backup::update_task(
&conf,
ttid,
is_wal_backup_required,
&state_snapshot,
&mut backup_task,
)
.await;
}
let next_cfile_save = update_control_file_save(&state_snapshot, &tli).await;
let is_active = is_wal_backup_required
|| num_computes > 0
|| state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn;
update_wal_removal(
&conf,
walsenders,
&tli,
wal_seg_size,
&state_snapshot,
last_removed_segno,
&mut wal_removal_task,
)
.await;
// update the broker timeline set
if tli_broker_active.set(is_active) {
// write log if state has changed
info!(
"timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn,
);
MANAGER_ACTIVE_CHANGES.inc();
if !is_active {
// TODO: maybe use tokio::spawn?
if let Err(e) = tli.maybe_persist_control_file(false).await {
warn!("control file save in update_status failed: {:?}", e);
}
}
}
// update the state in Arc<Timeline>
tli.wal_backup_active
.store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed);
tli.broker_active
.store(is_active, std::sync::atomic::Ordering::Relaxed);
// wait until something changes. tx channels are stored under Arc, so they will not be
// dropped until the manager task is finished.
@@ -190,192 +135,11 @@ pub async fn main_task(
_ = num_computes_rx.changed() => {
// number of connected computes was updated
}
_ = async {
if let Some(timeout) = next_cfile_save {
tokio::time::sleep_until(timeout).await
} else {
futures::future::pending().await
}
} => {
// it's time to save the control file
}
res = async {
if let Some(task) = &mut wal_removal_task {
task.await
} else {
futures::future::pending().await
}
} => {
// WAL removal task finished
wal_removal_task = None;
update_wal_removal_end(res, &tli, &mut last_removed_segno);
}
}
};
// shutdown background tasks
if conf.is_wal_backup_enabled() {
wal_backup::update_task(&conf, &tli, false, &last_state, &mut backup_task).await;
}
if let Some(recovery_task) = recovery_task {
if let Err(e) = recovery_task.await {
warn!("recovery task failed: {:?}", e);
}
}
if let Some(partial_backup_task) = partial_backup_task {
if let Err(e) = partial_backup_task.await {
warn!("partial backup task failed: {:?}", e);
}
}
if let Some(wal_removal_task) = wal_removal_task {
let res = wal_removal_task.await;
update_wal_removal_end(res, &tli, &mut last_removed_segno);
wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await;
}
}
/// Spawns/kills backup task and returns true if backup is required.
async fn update_backup(
conf: &SafeKeeperConf,
tli: &Arc<Timeline>,
wal_seg_size: usize,
num_computes: usize,
state: &StateSnapshot,
backup_task: &mut Option<WalBackupTaskHandle>,
) -> bool {
let is_wal_backup_required =
wal_backup::is_wal_backup_required(wal_seg_size, num_computes, state);
if conf.is_wal_backup_enabled() {
wal_backup::update_task(conf, tli, is_wal_backup_required, state, backup_task).await;
}
// update the state in Arc<Timeline>
tli.wal_backup_active
.store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed);
is_wal_backup_required
}
/// Update is_active flag and returns its value.
fn update_is_active(
is_wal_backup_required: bool,
num_computes: usize,
state: &StateSnapshot,
tli_broker_active: &mut TimelineSetGuard,
tli: &Arc<Timeline>,
) -> bool {
let is_active = is_wal_backup_required
|| num_computes > 0
|| state.remote_consistent_lsn < state.commit_lsn;
// update the broker timeline set
if tli_broker_active.set(is_active) {
// write log if state has changed
info!(
"timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
is_active, state.remote_consistent_lsn, state.commit_lsn,
);
MANAGER_ACTIVE_CHANGES.inc();
}
// update the state in Arc<Timeline>
tli.broker_active
.store(is_active, std::sync::atomic::Ordering::Relaxed);
is_active
}
/// Save control file if needed. Returns Instant if we should persist the control file in the future.
async fn update_control_file_save(
state: &StateSnapshot,
tli: &Arc<Timeline>,
) -> Option<tokio::time::Instant> {
if !state.inmem_flush_pending {
return None;
}
if state.cfile_last_persist_at.elapsed() > CF_SAVE_INTERVAL {
let mut write_guard = tli.write_shared_state().await;
// this can be done in the background because it blocks manager task, but flush() should
// be fast enough not to be a problem now
if let Err(e) = write_guard.sk.state.flush().await {
warn!("failed to save control file: {:?}", e);
}
None
} else {
// we should wait until next CF_SAVE_INTERVAL
Some((state.cfile_last_persist_at + CF_SAVE_INTERVAL).into())
}
}
/// Spawns WAL removal task if needed.
async fn update_wal_removal(
conf: &SafeKeeperConf,
walsenders: &Arc<WalSenders>,
tli: &Arc<Timeline>,
wal_seg_size: usize,
state: &StateSnapshot,
last_removed_segno: u64,
wal_removal_task: &mut Option<JoinHandle<anyhow::Result<u64>>>,
) {
if wal_removal_task.is_some() {
// WAL removal is already in progress
return;
}
// If enabled, we use LSN of the most lagging walsender as a WAL removal horizon.
// This allows pageservers that are lagging behind to get better read speed,
// at the cost of keeping more WAL on disk.
let replication_horizon_lsn = if conf.walsenders_keep_horizon {
walsenders.laggard_lsn()
} else {
None
};
let removal_horizon_lsn = calc_horizon_lsn(state, replication_horizon_lsn);
let removal_horizon_segno = removal_horizon_lsn
.segment_number(wal_seg_size)
.saturating_sub(1);
if removal_horizon_segno > last_removed_segno {
// we need to remove WAL
let remover = crate::wal_storage::Storage::remove_up_to(
&tli.read_shared_state().await.sk.wal_store,
removal_horizon_segno,
);
*wal_removal_task = Some(tokio::spawn(
async move {
remover.await?;
Ok(removal_horizon_segno)
}
.instrument(info_span!("WAL removal", ttid=%tli.ttid)),
));
}
}
/// Update the state after WAL removal task finished.
fn update_wal_removal_end(
res: Result<anyhow::Result<u64>, JoinError>,
tli: &Arc<Timeline>,
last_removed_segno: &mut u64,
) {
let new_last_removed_segno = match res {
Ok(Ok(segno)) => segno,
Err(e) => {
warn!("WAL removal task failed: {:?}", e);
return;
}
Ok(Err(e)) => {
warn!("WAL removal task failed: {:?}", e);
return;
}
};
*last_removed_segno = new_last_removed_segno;
// update the state in Arc<Timeline>
tli.last_removed_segno
.store(new_last_removed_segno, std::sync::atomic::Ordering::Relaxed);
}

View File

@@ -3,7 +3,7 @@
//! all from the disk on startup and keeping them in memory.
use crate::safekeeper::ServerInfo;
use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError};
use crate::timeline::{Timeline, TimelineError};
use crate::timelines_set::TimelinesSet;
use crate::SafeKeeperConf;
use anyhow::{bail, Context, Result};
@@ -127,7 +127,7 @@ impl GlobalTimelines {
state.get_dependencies()
};
let timelines_dir = get_tenant_dir(&conf, &tenant_id);
let timelines_dir = conf.tenant_dir(&tenant_id);
for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
.with_context(|| format!("failed to list timelines dir {}", timelines_dir))?
{
@@ -348,7 +348,11 @@ impl GlobalTimelines {
}
Err(_) => {
// Timeline is not memory, but it may still exist on disk in broken state.
let dir_path = get_timeline_dir(TIMELINES_STATE.lock().unwrap().get_conf(), ttid);
let dir_path = TIMELINES_STATE
.lock()
.unwrap()
.get_conf()
.timeline_dir(ttid);
let dir_existed = delete_dir(dir_path)?;
Ok(TimelineDeleteForceResult {
@@ -397,10 +401,13 @@ impl GlobalTimelines {
// Note that we could concurrently create new timelines while we were deleting them,
// so the directory may be not empty. In this case timelines will have bad state
// and timeline background jobs can panic.
delete_dir(get_tenant_dir(
TIMELINES_STATE.lock().unwrap().get_conf(),
tenant_id,
))?;
delete_dir(
TIMELINES_STATE
.lock()
.unwrap()
.get_conf()
.tenant_dir(tenant_id),
)?;
// FIXME: we temporarily disabled removing timelines from the map, see `delete_force`
// let tlis_after_delete = Self::get_all_for_tenant(*tenant_id);

View File

@@ -30,9 +30,9 @@ use tracing::*;
use utils::{id::TenantTimelineId, lsn::Lsn};
use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS};
use crate::timeline::{FullAccessTimeline, PeerInfo, Timeline};
use crate::timeline::{PeerInfo, Timeline};
use crate::timeline_manager::StateSnapshot;
use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME};
use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME};
use once_cell::sync::OnceCell;
@@ -63,13 +63,13 @@ pub fn is_wal_backup_required(
/// is running, kill it.
pub async fn update_task(
conf: &SafeKeeperConf,
tli: &Arc<Timeline>,
ttid: TenantTimelineId,
need_backup: bool,
state: &StateSnapshot,
entry: &mut Option<WalBackupTaskHandle>,
) {
let (offloader, election_dbg_str) =
determine_offloader(&state.peers, state.backup_lsn, tli.ttid, conf);
determine_offloader(&state.peers, state.backup_lsn, ttid, conf);
let elected_me = Some(conf.my_id) == offloader;
let should_task_run = need_backup && elected_me;
@@ -80,8 +80,15 @@ pub async fn update_task(
info!("elected for backup: {}", election_dbg_str);
let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
let timeline_dir = conf.timeline_dir(&ttid);
let async_task = backup_task_main(tli.clone(), conf.backup_parallel_jobs, shutdown_rx);
let async_task = backup_task_main(
ttid,
timeline_dir,
conf.workdir.clone(),
conf.backup_parallel_jobs,
shutdown_rx,
);
let handle = if conf.current_thread_runtime {
tokio::spawn(async_task)
@@ -191,32 +198,39 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) {
}
struct WalBackupTask {
timeline: FullAccessTimeline,
timeline: Arc<Timeline>,
timeline_dir: Utf8PathBuf,
workspace_dir: Utf8PathBuf,
wal_seg_size: usize,
parallel_jobs: usize,
commit_lsn_watch_rx: watch::Receiver<Lsn>,
}
/// Offload single timeline.
#[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))]
async fn backup_task_main(tli: Arc<Timeline>, parallel_jobs: usize, mut shutdown_rx: Receiver<()>) {
#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))]
async fn backup_task_main(
ttid: TenantTimelineId,
timeline_dir: Utf8PathBuf,
workspace_dir: Utf8PathBuf,
parallel_jobs: usize,
mut shutdown_rx: Receiver<()>,
) {
let _guard = WAL_BACKUP_TASKS.guard();
let tli = match tli.full_access_guard().await {
Ok(tli) => tli,
Err(e) => {
error!("backup error: {}", e);
return;
}
};
info!("started");
let res = GlobalTimelines::get(ttid);
if let Err(e) = res {
error!("backup error: {}", e);
return;
}
let tli = res.unwrap();
let mut wb = WalBackupTask {
wal_seg_size: tli.get_wal_seg_size().await,
commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
timeline_dir: tli.get_timeline_dir(),
timeline: tli,
timeline_dir,
workspace_dir,
parallel_jobs,
};
@@ -283,6 +297,7 @@ impl WalBackupTask {
commit_lsn,
self.wal_seg_size,
&self.timeline_dir,
&self.workspace_dir,
self.parallel_jobs,
)
.await
@@ -304,18 +319,18 @@ impl WalBackupTask {
}
async fn backup_lsn_range(
timeline: &FullAccessTimeline,
timeline: &Arc<Timeline>,
backup_lsn: &mut Lsn,
end_lsn: Lsn,
wal_seg_size: usize,
timeline_dir: &Utf8Path,
workspace_dir: &Utf8Path,
parallel_jobs: usize,
) -> Result<()> {
if parallel_jobs < 1 {
anyhow::bail!("parallel_jobs must be >= 1");
}
let remote_timeline_path = remote_timeline_path(&timeline.ttid)?;
let start_lsn = *backup_lsn;
let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
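
One way to picture what the segment enumeration has to produce here -- segment-aligned chunks of [backup_lsn, end_lsn) -- as a self-contained sketch with plain u64 LSNs and an assumed 16 MiB segment size (not the crate's get_segments):

// Returns (segno, start_lsn, end_lsn) for every segment overlapping [start, end).
fn get_segments(start: u64, end: u64, seg_size: u64) -> Vec<(u64, u64, u64)> {
    let first_seg = start / seg_size;
    let last_seg = end / seg_size;
    (first_seg..last_seg)
        .map(|segno| {
            let seg_start = (segno * seg_size).max(start);
            let seg_end = ((segno + 1) * seg_size).min(end);
            (segno, seg_start, seg_end)
        })
        .collect()
}

fn main() {
    let seg = 16 * 1024 * 1024u64;
    // backup_lsn in the middle of segment 1, end_lsn at the start of segment 3:
    let chunks = get_segments(seg + 100, 3 * seg, seg);
    assert_eq!(chunks, vec![(1, seg + 100, 2 * seg), (2, 2 * seg, 3 * seg)]);
}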
@@ -328,11 +343,7 @@ async fn backup_lsn_range(
loop {
let added_task = match iter.next() {
Some(s) => {
uploads.push_back(backup_single_segment(
s,
timeline_dir,
&remote_timeline_path,
));
uploads.push_back(backup_single_segment(s, timeline_dir, workspace_dir));
true
}
None => false,
@@ -370,10 +381,18 @@ async fn backup_lsn_range(
async fn backup_single_segment(
seg: &Segment,
timeline_dir: &Utf8Path,
remote_timeline_path: &RemotePath,
workspace_dir: &Utf8Path,
) -> Result<Segment> {
let segment_file_path = seg.file_path(timeline_dir)?;
let remote_segment_path = seg.remote_path(remote_timeline_path);
let remote_segment_path = segment_file_path
.strip_prefix(workspace_dir)
.context("Failed to strip workspace dir prefix")
.and_then(RemotePath::new)
.with_context(|| {
format!(
"Failed to resolve remote part of path {segment_file_path:?} for base {workspace_dir:?}",
)
})?;
let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await;
if res.is_ok() {
@@ -411,10 +430,6 @@ impl Segment {
Ok(timeline_dir.join(self.object_name()))
}
pub fn remote_path(self, remote_timeline_path: &RemotePath) -> RemotePath {
remote_timeline_path.join(self.object_name())
}
pub fn size(self) -> usize {
(u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize
}
@@ -515,7 +530,8 @@ pub async fn read_object(
/// when called.
pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
let storage = get_configured_remote_storage();
let remote_path = remote_timeline_path(ttid)?;
let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string());
let remote_path = RemotePath::new(&ttid_path)?;
// see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
// const Option unwrap is not stable, otherwise it would be const.
@@ -597,17 +613,15 @@ pub async fn copy_s3_segments(
.as_ref()
.unwrap();
let remote_dst_path = remote_timeline_path(dst_ttid)?;
let relative_dst_path =
Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string());
let remote_path = RemotePath::new(&relative_dst_path)?;
let cancel = CancellationToken::new();
let files = storage
.list(
Some(&remote_dst_path),
ListingMode::NoDelimiter,
None,
&cancel,
)
.list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel)
.await?
.keys;
@@ -621,6 +635,9 @@ pub async fn copy_s3_segments(
uploaded_segments
);
let relative_src_path =
Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string());
for segno in from_segment..to_segment {
if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 {
info!("copied all segments from {} until {}", from_segment, segno);
@@ -632,8 +649,8 @@ pub async fn copy_s3_segments(
}
debug!("copying segment {}", segment_name);
let from = remote_timeline_path(src_ttid)?.join(&segment_name);
let to = remote_dst_path.join(&segment_name);
let from = RemotePath::new(&relative_src_path.join(&segment_name))?;
let to = RemotePath::new(&relative_dst_path.join(&segment_name))?;
storage.copy_object(&from, &to, &cancel).await?;
}
@@ -644,8 +661,3 @@ pub async fn copy_s3_segments(
);
Ok(())
}
/// Get S3 (remote_storage) prefix path used for timeline files.
pub fn remote_timeline_path(ttid: &TenantTimelineId) -> Result<RemotePath> {
RemotePath::new(&Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string()))
}

View File

@@ -18,21 +18,22 @@
//! This way control file stores information about all potentially existing
//! remote partial segments and can clean them up after uploading a newer version.
use std::sync::Arc;
use camino::Utf8PathBuf;
use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
use rand::Rng;
use remote_storage::RemotePath;
use serde::{Deserialize, Serialize};
use tracing::{debug, error, info, instrument, warn};
use tracing::{debug, error, info, instrument};
use utils::lsn::Lsn;
use crate::{
metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
safekeeper::Term,
timeline::FullAccessTimeline,
wal_backup::{self, remote_timeline_path},
SafeKeeperConf,
timeline::Timeline,
wal_backup, SafeKeeperConf,
};
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
@@ -82,10 +83,10 @@ impl State {
struct PartialBackup {
wal_seg_size: usize,
tli: FullAccessTimeline,
tli: Arc<Timeline>,
conf: SafeKeeperConf,
local_prefix: Utf8PathBuf,
remote_timeline_path: RemotePath,
remote_prefix: Utf8PathBuf,
state: State,
}
@@ -152,7 +153,7 @@ impl PartialBackup {
let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size);
let local_path = self.local_prefix.join(self.local_segment_name(segno));
let remote_path = self.remote_timeline_path.join(&prepared.name);
let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?;
// Upload first `backup_bytes` bytes of the segment to the remote storage.
wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?;
@@ -252,7 +253,7 @@ impl PartialBackup {
info!("deleting objects: {:?}", segments_to_delete);
let mut objects_to_delete = vec![];
for seg in segments_to_delete.iter() {
let remote_path = self.remote_timeline_path.join(seg);
let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?;
objects_to_delete.push(remote_path);
}
@@ -272,7 +273,7 @@ impl PartialBackup {
}
#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))]
pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) {
pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
debug!("started");
let await_duration = conf.partial_backup_timeout;
@@ -288,11 +289,11 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) {
let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
let wal_seg_size = tli.get_wal_seg_size().await;
let local_prefix = tli.get_timeline_dir();
let remote_timeline_path = match remote_timeline_path(&tli.ttid) {
Ok(path) => path,
let local_prefix = tli.timeline_dir.clone();
let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) {
Ok(path) => path.to_owned(),
Err(e) => {
error!("failed to create remote path: {:?}", e);
error!("failed to strip workspace dir prefix: {:?}", e);
return;
}
};
@@ -303,28 +304,12 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) {
state: persistent_state.partial_backup,
conf,
local_prefix,
remote_timeline_path,
remote_prefix,
};
debug!("state: {:?}", backup.state);
// The general idea is that each safekeeper keeps only one partial segment
// both in remote storage and in local state. If this is not true, something
// went wrong.
const MAX_SIMULTANEOUS_SEGMENTS: usize = 10;
'outer: loop {
if backup.state.segments.len() > MAX_SIMULTANEOUS_SEGMENTS {
warn!(
"too many segments in control_file state, running gc: {}",
backup.state.segments.len()
);
backup.gc().await.unwrap_or_else(|e| {
error!("failed to run gc: {:#}", e);
});
}
// wait until we have something to upload
let uploaded_segment = backup.state.uploaded_segment();
if let Some(seg) = &uploaded_segment {

View File

@@ -25,7 +25,7 @@ use utils::crashsafe::durable_rename;
use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
use crate::state::TimelinePersistentState;
use crate::wal_backup::{read_object, remote_timeline_path};
use crate::wal_backup::read_object;
use crate::SafeKeeperConf;
use postgres_ffi::waldecoder::WalStreamDecoder;
use postgres_ffi::XLogFileName;
@@ -536,7 +536,7 @@ async fn remove_segments_from_disk(
}
pub struct WalReader {
remote_path: RemotePath,
workdir: Utf8PathBuf,
timeline_dir: Utf8PathBuf,
wal_seg_size: usize,
pos: Lsn,
@@ -558,7 +558,7 @@ pub struct WalReader {
impl WalReader {
pub fn new(
ttid: &TenantTimelineId,
workdir: Utf8PathBuf,
timeline_dir: Utf8PathBuf,
state: &TimelinePersistentState,
start_pos: Lsn,
@@ -586,7 +586,7 @@ impl WalReader {
}
Ok(Self {
remote_path: remote_timeline_path(ttid)?,
workdir,
timeline_dir,
wal_seg_size: state.server.wal_seg_size as usize,
pos: start_pos,
@@ -684,7 +684,7 @@ impl WalReader {
let xlogoff = self.pos.segment_offset(self.wal_seg_size);
let segno = self.pos.segment_number(self.wal_seg_size);
let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size);
let wal_file_path = self.timeline_dir.join(&wal_file_name);
let wal_file_path = self.timeline_dir.join(wal_file_name);
// Try to open local file, if we may have WAL locally
if self.pos >= self.local_start_lsn {
@@ -712,7 +712,16 @@ impl WalReader {
// Try to open remote file, if remote reads are enabled
if self.enable_remote_read {
let remote_wal_file_path = self.remote_path.join(&wal_file_name);
let remote_wal_file_path = wal_file_path
.strip_prefix(&self.workdir)
.context("Failed to strip workdir prefix")
.and_then(RemotePath::new)
.with_context(|| {
format!(
"Failed to resolve remote part of path {:?} for base {:?}",
wal_file_path, self.workdir,
)
})?;
return read_object(&remote_wal_file_path, xlogoff as u64).await;
}
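The `WalReader` change applies the same idea on the read path: serve the WAL block from the local segment file when it is still on disk, and otherwise, if remote reads are enabled, derive the remote key from the local path relative to the workdir. A rough Python sketch of that fallback — names are illustrative, not the safekeeper's API:

```python
# Illustrative local-then-remote WAL read fallback; read_object stands in for
# the remote-storage read and is an assumed callable, not a real API.
from pathlib import PurePosixPath

def read_wal_block(wal_file_path: str, workdir: str, offset: int,
                   have_local_wal: bool, enable_remote_read: bool, read_object):
    if have_local_wal:
        # The requested position is still covered by the local segment.
        with open(wal_file_path, "rb") as f:
            f.seek(offset)
            return f.read()
    if enable_remote_read:
        # Remote key mirrors the on-disk layout below the workdir.
        remote_key = PurePosixPath(wal_file_path).relative_to(workdir)
        return read_object(remote_key, offset)
    raise FileNotFoundError(f"{wal_file_path}: not available locally or remotely")
```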

View File

@@ -72,18 +72,6 @@ class Lsn:
def segment_lsn(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> "Lsn":
return Lsn(self.lsn_int - (self.lsn_int % seg_sz))
def segno(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> int:
return self.lsn_int // seg_sz
def segment_name(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> str:
segno = self.segno(seg_sz)
# The filename format is 00000001XXXXXXXX000000YY, where XXXXXXXXYY is segno in hex.
# XXXXXXXX is the higher 8 hex digits of segno
high_bits = segno >> 8
# YY is the lower 2 hex digits of segno
low_bits = segno & 0xFF
return f"00000001{high_bits:08X}000000{low_bits:02X}"
@dataclass(frozen=True)
class Key:
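The removed `Lsn.segment_name` helper above encodes how a WAL segment number maps onto the `00000001XXXXXXXX000000YY` file name described in its comment. A small worked example of that formula — illustration only, assuming 16 MiB segments and timeline 1:

```python
# How an LSN maps to a WAL segment file name (per the removed helper's comment):
# XXXXXXXX is the upper 8 hex digits of segno, YY the lower 2 hex digits.
DEFAULT_WAL_SEG_SIZE = 16 * 1024 * 1024

def segment_name(lsn: int, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> str:
    segno = lsn // seg_sz
    high_bits = segno >> 8   # upper 8 hex digits
    low_bits = segno & 0xFF  # lower 2 hex digits
    return f"00000001{high_bits:08X}000000{low_bits:02X}"

# LSN 0/2000000 (0x2000000 bytes) falls in segment number 2:
assert segment_name(0x2000000) == "000000010000000000000002"
```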

View File

@@ -973,9 +973,6 @@ class NeonEnvBuilder:
for pageserver in self.env.pageservers:
pageserver.assert_no_errors()
for safekeeper in self.env.safekeepers:
safekeeper.assert_no_errors()
self.env.storage_controller.assert_no_errors()
try:
@@ -3816,9 +3813,6 @@ class Safekeeper(LogUtils):
self.running = False
return self
def assert_no_errors(self):
assert not self.log_contains("manager task finished prematurely")
def append_logical_message(
self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any]
) -> Dict[str, Any]:
@@ -3904,15 +3898,6 @@ class Safekeeper(LogUtils):
"""
cli = self.http_client()
target_segment_file = lsn.segment_name()
def are_segments_removed():
segments = self.list_segments(tenant_id, timeline_id)
log.info(
f"waiting for all segments before {target_segment_file} to be removed from sk {self.id}, current segments: {segments}"
)
assert all(target_segment_file <= s for s in segments)
def are_lsns_advanced():
stat = cli.timeline_status(tenant_id, timeline_id)
log.info(
@@ -3924,7 +3909,6 @@ class Safekeeper(LogUtils):
# pageserver to this safekeeper
wait_until(30, 1, are_lsns_advanced)
cli.checkpoint(tenant_id, timeline_id)
wait_until(30, 1, are_segments_removed)
def wait_until_paused(self, failpoint: str):
msg = f"at failpoint {failpoint}"

View File

@@ -66,7 +66,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress
".*task iteration took longer than the configured period.*",
# these can happen anytime we do compactions from background task and shutdown pageserver
".*could not compact.*cancelled.*",
r".*ERROR.*ancestor timeline \S+ is being stopped",
# this is expected given our collaborative shutdown approach for the UploadQueue
".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
".*Compaction failed.*, retrying in .*: ShuttingDown",

View File

@@ -19,8 +19,7 @@ class Walreceiver:
@dataclass
class SafekeeperTimelineStatus:
term: int
last_log_term: int
acceptor_epoch: int
pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
flush_lsn: Lsn
commit_lsn: Lsn
@@ -157,8 +156,7 @@ class SafekeeperHttpClient(requests.Session):
resj = res.json()
walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
return SafekeeperTimelineStatus(
term=resj["acceptor_state"]["term"],
last_log_term=resj["acceptor_state"]["epoch"],
acceptor_epoch=resj["acceptor_state"]["epoch"],
pg_version=resj["pg_info"]["pg_version"],
flush_lsn=Lsn(resj["flush_lsn"]),
commit_lsn=Lsn(resj["commit_lsn"]),

View File

@@ -1,47 +0,0 @@
\set ECHO queries
\timing
-- prepare test table
DROP TABLE IF EXISTS hnsw_test_table;
CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;
INSERT INTO hnsw_test_table SELECT * FROM documents;
CREATE INDEX ON hnsw_test_table (_id); -- needed later for random tuple queries
-- tune index build params
SET max_parallel_maintenance_workers = 7;
SET maintenance_work_mem = '8GB';
-- create HNSW index for the supported distance metrics
CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops);
CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops);
CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);
CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);
CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);
-- note: in a second psql session we can monitor the progress of the index build phases using
-- the following query:
-- SELECT phase, round(100.0 * blocks_done / nullif(blocks_total, 0), 1) AS "%" FROM pg_stat_progress_create_index;
-- show all indexes built on the table
SELECT
idx.relname AS index_name,
tbl.relname AS table_name,
am.amname AS access_method,
a.attname AS column_name,
opc.opcname AS operator_class
FROM
pg_index i
JOIN
pg_class idx ON idx.oid = i.indexrelid
JOIN
pg_class tbl ON tbl.oid = i.indrelid
JOIN
pg_am am ON am.oid = idx.relam
JOIN
pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey)
JOIN
pg_opclass opc ON opc.oid = i.indclass[0]
WHERE
tbl.relname = 'hnsw_test_table'
AND a.attname = 'embeddings';
-- show table sizes
\dt+

View File

@@ -1,52 +0,0 @@
\set ECHO queries
\timing
-- prepare test table
DROP TABLE IF EXISTS ivfflat_test_table;
CREATE TABLE ivfflat_test_table AS TABLE documents WITH NO DATA;
INSERT INTO ivfflat_test_table SELECT * FROM documents;
CREATE INDEX ON ivfflat_test_table (_id); -- needed later for random tuple queries
-- tune index build params
SET max_parallel_maintenance_workers = 7;
SET maintenance_work_mem = '8GB';
-- create ivfflat index for the supported distance metrics
-- the formula for lists is # rows / 1000 or sqrt(# rows) if # rows > 1 million
-- we have 1 million embeddings of vector size 1536 in column embeddings of table documents
-- so we use 1000 lists
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_l2_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_ip_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_cosine_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings::halfvec(1536) halfvec_l2_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table
USING ivfflat ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops) WITH (lists = 1000);
\d ivfflat_test_table
-- show all indexes built on the table
SELECT
idx.relname AS index_name,
tbl.relname AS table_name,
am.amname AS access_method,
a.attname AS column_name,
opc.opcname AS operator_class
FROM
pg_index i
JOIN
pg_class idx ON idx.oid = i.indexrelid
JOIN
pg_class tbl ON tbl.oid = i.indrelid
JOIN
pg_am am ON am.oid = idx.relam
JOIN
pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey)
JOIN
pg_opclass opc ON opc.oid = i.indclass[0]
WHERE
tbl.relname = 'ivfflat_test_table'
AND a.attname = 'embeddings';
-- show table sizes
\dt+
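The comment in the removed ivfflat script states the usual pgvector sizing rule for `lists`: rows / 1000, switching to sqrt(rows) once the table exceeds one million rows. A tiny helper showing the arithmetic, illustrative only:

```python
# Sizing rule quoted from the removed script's comment:
# lists = rows / 1000, or sqrt(rows) when the table has more than 1M rows.
from math import isqrt

def ivfflat_lists(rows: int) -> int:
    return isqrt(rows) if rows > 1_000_000 else max(rows // 1000, 1)

print(ivfflat_lists(1_000_000))  # 1000 -> matches "so we use 1000 lists"
print(ivfflat_lists(4_000_000))  # 2000
```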

View File

@@ -1,55 +0,0 @@
# Source of the dataset for pgvector tests
This readme was copied from https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M
## Download the parquet files
```bash
brew install git-lfs
git-lfs clone https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M
```
## Load into postgres:
see loaddata.py in this directory
## Rest of dataset card as on huggingface
---
dataset_info:
features:
- name: _id
dtype: string
- name: title
dtype: string
- name: text
dtype: string
- name: text-embedding-3-large-1536-embedding
sequence: float64
splits:
- name: train
num_bytes: 12679725776
num_examples: 1000000
download_size: 9551862565
dataset_size: 12679725776
configs:
- config_name: default
data_files:
- split: train
path: data/train-*
license: mit
task_categories:
- feature-extraction
language:
- en
size_categories:
- 1M<n<10M
---
1M OpenAI Embeddings: text-embedding-3-large 1536 dimensions
- Created: February 2024.
- Text used for Embedding: title (string) + text (string)
- Embedding Model: OpenAI text-embedding-3-large
- This dataset was generated from the first 1M entries of https://huggingface.co/datasets/BeIR/dbpedia-entity, extracted by @KShivendu_

View File

@@ -1,72 +0,0 @@
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import psycopg2
from pgvector.psycopg2 import register_vector
from psycopg2.extras import execute_values
def print_usage():
print("Usage: loaddata.py <CONNSTR> <DATADIR>")
def main(conn_str, directory_path):
# Connection to PostgreSQL
with psycopg2.connect(conn_str) as conn:
with conn.cursor() as cursor:
# Run SQL statements
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")
register_vector(conn)
cursor.execute("DROP TABLE IF EXISTS documents;")
cursor.execute(
"""
CREATE TABLE documents (
_id TEXT PRIMARY KEY,
title TEXT,
text TEXT,
embeddings vector(1536) -- text-embedding-3-large-1536-embedding (OpenAI)
);
"""
)
conn.commit()
# List and sort Parquet files
parquet_files = sorted(Path(directory_path).glob("*.parquet"))
for file in parquet_files:
print(f"Loading {file} into PostgreSQL")
df = pd.read_parquet(file)
print(df.head())
data_list = [
(
row["_id"],
row["title"],
row["text"],
np.array(row["text-embedding-3-large-1536-embedding"]),
)
for index, row in df.iterrows()
]
# Use execute_values to perform batch insertion
execute_values(
cursor,
"INSERT INTO documents (_id, title, text, embeddings) VALUES %s",
data_list,
)
# Commit after we insert all embeddings
conn.commit()
print(f"Loaded {file} into PostgreSQL")
if __name__ == "__main__":
if len(sys.argv) != 3:
print_usage()
sys.exit(1)
conn_str = sys.argv[1]
directory_path = sys.argv[2]
main(conn_str, directory_path)

View File

@@ -1,10 +0,0 @@
with x (x) as (
select "embeddings" as x
from hnsw_test_table
TABLESAMPLE SYSTEM (1)
LIMIT 1
)
SELECT title, "embeddings" <=> (select x from x) as distance
FROM hnsw_test_table
ORDER BY 2
LIMIT 30;

Some files were not shown because too many files have changed in this diff