mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-30 18:50:36 +00:00
Compare commits
38 Commits
mx_offset_
...
al/support
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
12f0c0ec8f | ||
|
|
13bd44a1f0 | ||
|
|
cda148d40d | ||
|
|
1aad8918e1 | ||
|
|
966213f429 | ||
|
|
35e73759f5 | ||
|
|
48936d44f8 | ||
|
|
2eae0a1fe5 | ||
|
|
53470ad12a | ||
|
|
edccef4514 | ||
|
|
982fce1e72 | ||
|
|
e767ced8d0 | ||
|
|
1309571f5d | ||
|
|
9a69b6cb94 | ||
|
|
cc82cd1b07 | ||
|
|
c76b74c50d | ||
|
|
ed938885ff | ||
|
|
db4d094afa | ||
|
|
0626e0bfd3 | ||
|
|
444d6e337f | ||
|
|
3a1be9b246 | ||
|
|
664d32eb7f | ||
|
|
ed845b644b | ||
|
|
87dd37a2f2 | ||
|
|
1355bd0ac5 | ||
|
|
a1d6b1a4af | ||
|
|
92aee7e07f | ||
|
|
5e2f29491f | ||
|
|
618d36ee6d | ||
|
|
33c2d94ba6 | ||
|
|
08bfe1c826 | ||
|
|
65ff256bb8 | ||
|
|
5177c1e4b1 | ||
|
|
49efcc3773 | ||
|
|
76b1cdc17e | ||
|
|
1f151d03d8 | ||
|
|
ac758e4f51 | ||
|
|
4f280c2953 |
@@ -12,6 +12,11 @@ opt-level = 3
|
|||||||
# Turn on a small amount of optimization in Development mode.
|
# Turn on a small amount of optimization in Development mode.
|
||||||
opt-level = 1
|
opt-level = 1
|
||||||
|
|
||||||
|
[build]
|
||||||
|
# This is only present for local builds, as it will be overridden
|
||||||
|
# by the RUSTDOCFLAGS env var in CI.
|
||||||
|
rustdocflags = ["-Arustdoc::private_intra_doc_links"]
|
||||||
|
|
||||||
[alias]
|
[alias]
|
||||||
build_testing = ["build", "--features", "testing"]
|
build_testing = ["build", "--features", "testing"]
|
||||||
neon = ["run", "--bin", "neon_local"]
|
neon = ["run", "--bin", "neon_local"]
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
!trace/
|
!trace/
|
||||||
!vendor/postgres-v14/
|
!vendor/postgres-v14/
|
||||||
!vendor/postgres-v15/
|
!vendor/postgres-v15/
|
||||||
|
!vendor/postgres-v16/
|
||||||
!workspace_hack/
|
!workspace_hack/
|
||||||
!neon_local/
|
!neon_local/
|
||||||
!scripts/ninstall.sh
|
!scripts/ninstall.sh
|
||||||
|
|||||||
@@ -105,7 +105,7 @@ runs:
|
|||||||
# Get previously uploaded data for this run
|
# Get previously uploaded data for this run
|
||||||
ZSTD_NBTHREADS=0
|
ZSTD_NBTHREADS=0
|
||||||
|
|
||||||
S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[].Key')
|
S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[]?.Key')
|
||||||
if [ -z "$S3_FILEPATHS" ]; then
|
if [ -z "$S3_FILEPATHS" ]; then
|
||||||
# There's no previously uploaded data for this $GITHUB_RUN_ID
|
# There's no previously uploaded data for this $GITHUB_RUN_ID
|
||||||
exit 0
|
exit 0
|
||||||
|
|||||||
55
.github/workflows/approved-for-ci-run.yml
vendored
Normal file
55
.github/workflows/approved-for-ci-run.yml
vendored
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
name: Handle `approved-for-ci-run` label
|
||||||
|
# This workflow helps to run CI pipeline for PRs made by external contributors (from forks).
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
types:
|
||||||
|
# Default types that triggers a workflow ([1]):
|
||||||
|
# - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
|
||||||
|
- opened
|
||||||
|
- synchronize
|
||||||
|
- reopened
|
||||||
|
# Types that we wand to handle in addition to keep labels tidy:
|
||||||
|
- closed
|
||||||
|
# Actual magic happens here:
|
||||||
|
- labeled
|
||||||
|
|
||||||
|
env:
|
||||||
|
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
remove-label:
|
||||||
|
# Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
|
||||||
|
# The PR should be reviewed and labelled manually again.
|
||||||
|
|
||||||
|
runs-on: [ ubuntu-latest ]
|
||||||
|
|
||||||
|
if: |
|
||||||
|
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
|
||||||
|
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||||
|
|
||||||
|
create-branch:
|
||||||
|
# Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
|
||||||
|
|
||||||
|
runs-on: [ ubuntu-latest ]
|
||||||
|
|
||||||
|
if: |
|
||||||
|
github.event.action == 'labeled' &&
|
||||||
|
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||||
|
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
ref: main
|
||||||
|
|
||||||
|
- run: gh pr checkout "${PR_NUMBER}"
|
||||||
|
|
||||||
|
- run: git checkout -b "ci-run/pr-${PR_NUMBER}"
|
||||||
|
|
||||||
|
- run: git push --force origin "ci-run/pr-${PR_NUMBER}"
|
||||||
39
.github/workflows/build_and_test.yml
vendored
39
.github/workflows/build_and_test.yml
vendored
@@ -5,6 +5,7 @@ on:
|
|||||||
branches:
|
branches:
|
||||||
- main
|
- main
|
||||||
- release
|
- release
|
||||||
|
- ci-run/pr-*
|
||||||
pull_request:
|
pull_request:
|
||||||
|
|
||||||
defaults:
|
defaults:
|
||||||
@@ -127,6 +128,11 @@ jobs:
|
|||||||
- name: Run cargo clippy (release)
|
- name: Run cargo clippy (release)
|
||||||
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
|
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
|
||||||
|
|
||||||
|
- name: Check documentation generation
|
||||||
|
run: cargo doc --workspace --no-deps --document-private-items
|
||||||
|
env:
|
||||||
|
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
|
||||||
|
|
||||||
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
|
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
|
||||||
- name: Check formatting
|
- name: Check formatting
|
||||||
if: ${{ !cancelled() }}
|
if: ${{ !cancelled() }}
|
||||||
@@ -155,7 +161,7 @@ jobs:
|
|||||||
build_type: [ debug, release ]
|
build_type: [ debug, release ]
|
||||||
env:
|
env:
|
||||||
BUILD_TYPE: ${{ matrix.build_type }}
|
BUILD_TYPE: ${{ matrix.build_type }}
|
||||||
GIT_VERSION: ${{ github.sha }}
|
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Fix git ownership
|
- name: Fix git ownership
|
||||||
@@ -174,6 +180,27 @@ jobs:
|
|||||||
submodules: true
|
submodules: true
|
||||||
fetch-depth: 1
|
fetch-depth: 1
|
||||||
|
|
||||||
|
- name: Check Postgres submodules revision
|
||||||
|
shell: bash -euo pipefail {0}
|
||||||
|
run: |
|
||||||
|
# This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally).
|
||||||
|
# Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603
|
||||||
|
|
||||||
|
FAILED=false
|
||||||
|
for postgres in postgres-v14 postgres-v15; do
|
||||||
|
expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
|
||||||
|
actual=$(git rev-parse "HEAD:vendor/${postgres}")
|
||||||
|
if [ "${expected}" != "${actual}" ]; then
|
||||||
|
echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'"
|
||||||
|
FAILED=true
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ "${FAILED}" = "true" ]; then
|
||||||
|
echo >&2 "Please update vendors/revisions.json if these changes are intentional"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
- name: Set pg 14 revision for caching
|
- name: Set pg 14 revision for caching
|
||||||
id: pg_v14_rev
|
id: pg_v14_rev
|
||||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
||||||
@@ -614,7 +641,7 @@ jobs:
|
|||||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||||
--context .
|
--context .
|
||||||
--build-arg GIT_VERSION=${{ github.sha }}
|
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||||
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||||
@@ -658,7 +685,7 @@ jobs:
|
|||||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||||
--context .
|
--context .
|
||||||
--build-arg GIT_VERSION=${{ github.sha }}
|
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||||
--dockerfile Dockerfile.compute-tools
|
--dockerfile Dockerfile.compute-tools
|
||||||
@@ -715,7 +742,7 @@ jobs:
|
|||||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||||
--context .
|
--context .
|
||||||
--build-arg GIT_VERSION=${{ github.sha }}
|
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||||
--build-arg PG_VERSION=${{ matrix.version }}
|
--build-arg PG_VERSION=${{ matrix.version }}
|
||||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||||
@@ -742,7 +769,7 @@ jobs:
|
|||||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
|
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
|
||||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
|
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
|
||||||
--context . \
|
--context . \
|
||||||
--build-arg GIT_VERSION=${{ github.sha }} \
|
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} \
|
||||||
--build-arg PG_VERSION=${{ matrix.version }} \
|
--build-arg PG_VERSION=${{ matrix.version }} \
|
||||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
|
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
|
||||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
|
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
|
||||||
@@ -767,7 +794,7 @@ jobs:
|
|||||||
run:
|
run:
|
||||||
shell: sh -eu {0}
|
shell: sh -eu {0}
|
||||||
env:
|
env:
|
||||||
VM_BUILDER_VERSION: v0.11.1
|
VM_BUILDER_VERSION: v0.13.1
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
|
|||||||
3
.github/workflows/neon_extra_builds.yml
vendored
3
.github/workflows/neon_extra_builds.yml
vendored
@@ -3,7 +3,8 @@ name: Check neon with extra platform builds
|
|||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- main
|
- main
|
||||||
|
- ci-run/pr-*
|
||||||
pull_request:
|
pull_request:
|
||||||
|
|
||||||
defaults:
|
defaults:
|
||||||
|
|||||||
4
.gitmodules
vendored
4
.gitmodules
vendored
@@ -6,3 +6,7 @@
|
|||||||
path = vendor/postgres-v15
|
path = vendor/postgres-v15
|
||||||
url = https://github.com/neondatabase/postgres.git
|
url = https://github.com/neondatabase/postgres.git
|
||||||
branch = REL_15_STABLE_neon
|
branch = REL_15_STABLE_neon
|
||||||
|
[submodule "vendor/postgres-v16"]
|
||||||
|
path = vendor/postgres-v16
|
||||||
|
url = https://github.com/neondatabase/postgres.git
|
||||||
|
branch = REL_16_STABLE_neon
|
||||||
|
|||||||
97
Cargo.lock
generated
97
Cargo.lock
generated
@@ -158,6 +158,19 @@ dependencies = [
|
|||||||
"syn 1.0.109",
|
"syn 1.0.109",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "async-compression"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11"
|
||||||
|
dependencies = [
|
||||||
|
"flate2",
|
||||||
|
"futures-core",
|
||||||
|
"memchr",
|
||||||
|
"pin-project-lite",
|
||||||
|
"tokio",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "async-stream"
|
name = "async-stream"
|
||||||
version = "0.3.5"
|
version = "0.3.5"
|
||||||
@@ -593,7 +606,7 @@ dependencies = [
|
|||||||
"cc",
|
"cc",
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"libc",
|
"libc",
|
||||||
"miniz_oxide",
|
"miniz_oxide 0.6.2",
|
||||||
"object",
|
"object",
|
||||||
"rustc-demangle",
|
"rustc-demangle",
|
||||||
]
|
]
|
||||||
@@ -882,9 +895,11 @@ name = "compute_tools"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"async-compression",
|
||||||
"chrono",
|
"chrono",
|
||||||
"clap",
|
"clap",
|
||||||
"compute_api",
|
"compute_api",
|
||||||
|
"flate2",
|
||||||
"futures",
|
"futures",
|
||||||
"hyper",
|
"hyper",
|
||||||
"notify",
|
"notify",
|
||||||
@@ -1367,6 +1382,16 @@ version = "0.4.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"
|
checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "flate2"
|
||||||
|
version = "1.0.26"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
|
||||||
|
dependencies = [
|
||||||
|
"crc32fast",
|
||||||
|
"miniz_oxide 0.7.1",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fnv"
|
name = "fnv"
|
||||||
version = "1.0.7"
|
version = "1.0.7"
|
||||||
@@ -2151,6 +2176,15 @@ dependencies = [
|
|||||||
"adler",
|
"adler",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "miniz_oxide"
|
||||||
|
version = "0.7.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
|
||||||
|
dependencies = [
|
||||||
|
"adler",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mio"
|
name = "mio"
|
||||||
version = "0.8.6"
|
version = "0.8.6"
|
||||||
@@ -2345,9 +2379,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "opentelemetry"
|
name = "opentelemetry"
|
||||||
version = "0.18.0"
|
version = "0.19.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e"
|
checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"opentelemetry_api",
|
"opentelemetry_api",
|
||||||
"opentelemetry_sdk",
|
"opentelemetry_sdk",
|
||||||
@@ -2355,9 +2389,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "opentelemetry-http"
|
name = "opentelemetry-http"
|
||||||
version = "0.7.0"
|
version = "0.8.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d"
|
checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bytes",
|
"bytes",
|
||||||
@@ -2368,9 +2402,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "opentelemetry-otlp"
|
name = "opentelemetry-otlp"
|
||||||
version = "0.11.0"
|
version = "0.12.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde"
|
checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"futures",
|
"futures",
|
||||||
@@ -2386,48 +2420,47 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "opentelemetry-proto"
|
name = "opentelemetry-proto"
|
||||||
version = "0.1.0"
|
version = "0.2.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28"
|
checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"futures",
|
"futures",
|
||||||
"futures-util",
|
"futures-util",
|
||||||
"opentelemetry",
|
"opentelemetry",
|
||||||
"prost",
|
"prost",
|
||||||
"tonic 0.8.3",
|
"tonic 0.8.3",
|
||||||
"tonic-build 0.8.4",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "opentelemetry-semantic-conventions"
|
name = "opentelemetry-semantic-conventions"
|
||||||
version = "0.10.0"
|
version = "0.11.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb"
|
checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"opentelemetry",
|
"opentelemetry",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "opentelemetry_api"
|
name = "opentelemetry_api"
|
||||||
version = "0.18.0"
|
version = "0.19.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22"
|
checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"fnv",
|
"fnv",
|
||||||
"futures-channel",
|
"futures-channel",
|
||||||
"futures-util",
|
"futures-util",
|
||||||
"indexmap",
|
"indexmap",
|
||||||
"js-sys",
|
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
"urlencoding",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "opentelemetry_sdk"
|
name = "opentelemetry_sdk"
|
||||||
version = "0.18.0"
|
version = "0.19.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113"
|
checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"crossbeam-channel",
|
"crossbeam-channel",
|
||||||
@@ -2482,6 +2515,7 @@ name = "pageserver"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"async-compression",
|
||||||
"async-stream",
|
"async-stream",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
@@ -2498,6 +2532,7 @@ dependencies = [
|
|||||||
"enum-map",
|
"enum-map",
|
||||||
"enumset",
|
"enumset",
|
||||||
"fail",
|
"fail",
|
||||||
|
"flate2",
|
||||||
"futures",
|
"futures",
|
||||||
"git-version",
|
"git-version",
|
||||||
"hex",
|
"hex",
|
||||||
@@ -2901,9 +2936,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proc-macro2"
|
name = "proc-macro2"
|
||||||
version = "1.0.58"
|
version = "1.0.64"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
|
checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"unicode-ident",
|
"unicode-ident",
|
||||||
]
|
]
|
||||||
@@ -3292,9 +3327,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "reqwest-tracing"
|
name = "reqwest-tracing"
|
||||||
version = "0.4.4"
|
version = "0.4.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "783e8130d2427ddd7897dd3f814d4a3aea31b05deb42a4fdf8c18258fe5aefd1"
|
checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
@@ -3962,7 +3997,7 @@ dependencies = [
|
|||||||
"tokio",
|
"tokio",
|
||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
"tonic 0.9.2",
|
"tonic 0.9.2",
|
||||||
"tonic-build 0.9.2",
|
"tonic-build",
|
||||||
"tracing",
|
"tracing",
|
||||||
"utils",
|
"utils",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
@@ -4480,19 +4515,6 @@ dependencies = [
|
|||||||
"tracing",
|
"tracing",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "tonic-build"
|
|
||||||
version = "0.8.4"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4"
|
|
||||||
dependencies = [
|
|
||||||
"prettyplease 0.1.25",
|
|
||||||
"proc-macro2",
|
|
||||||
"prost-build",
|
|
||||||
"quote",
|
|
||||||
"syn 1.0.109",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tonic-build"
|
name = "tonic-build"
|
||||||
version = "0.9.2"
|
version = "0.9.2"
|
||||||
@@ -4616,9 +4638,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tracing-opentelemetry"
|
name = "tracing-opentelemetry"
|
||||||
version = "0.18.0"
|
version = "0.19.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de"
|
checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"opentelemetry",
|
"opentelemetry",
|
||||||
@@ -4817,6 +4839,7 @@ dependencies = [
|
|||||||
"byteorder",
|
"byteorder",
|
||||||
"bytes",
|
"bytes",
|
||||||
"chrono",
|
"chrono",
|
||||||
|
"const_format",
|
||||||
"criterion",
|
"criterion",
|
||||||
"futures",
|
"futures",
|
||||||
"heapless",
|
"heapless",
|
||||||
|
|||||||
12
Cargo.toml
12
Cargo.toml
@@ -32,6 +32,8 @@ license = "Apache-2.0"
|
|||||||
## All dependency versions, used in the project
|
## All dependency versions, used in the project
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||||
|
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
|
||||||
|
flate2 = "1.0.26"
|
||||||
async-stream = "0.3"
|
async-stream = "0.3"
|
||||||
async-trait = "0.1"
|
async-trait = "0.1"
|
||||||
aws-config = { version = "0.55", default-features = false, features=["rustls"] }
|
aws-config = { version = "0.55", default-features = false, features=["rustls"] }
|
||||||
@@ -82,9 +84,9 @@ notify = "5.0.0"
|
|||||||
num_cpus = "1.15"
|
num_cpus = "1.15"
|
||||||
num-traits = "0.2.15"
|
num-traits = "0.2.15"
|
||||||
once_cell = "1.13"
|
once_cell = "1.13"
|
||||||
opentelemetry = "0.18.0"
|
opentelemetry = "0.19.0"
|
||||||
opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||||
opentelemetry-semantic-conventions = "0.10.0"
|
opentelemetry-semantic-conventions = "0.11.0"
|
||||||
parking_lot = "0.12"
|
parking_lot = "0.12"
|
||||||
pbkdf2 = "0.12.1"
|
pbkdf2 = "0.12.1"
|
||||||
pin-project-lite = "0.2"
|
pin-project-lite = "0.2"
|
||||||
@@ -93,7 +95,7 @@ prost = "0.11"
|
|||||||
rand = "0.8"
|
rand = "0.8"
|
||||||
regex = "1.4"
|
regex = "1.4"
|
||||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||||
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
|
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
|
||||||
reqwest-middleware = "0.2.0"
|
reqwest-middleware = "0.2.0"
|
||||||
reqwest-retry = "0.2.2"
|
reqwest-retry = "0.2.2"
|
||||||
routerify = "3"
|
routerify = "3"
|
||||||
@@ -128,7 +130,7 @@ toml_edit = "0.19"
|
|||||||
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
||||||
tracing = "0.1"
|
tracing = "0.1"
|
||||||
tracing-error = "0.2.0"
|
tracing-error = "0.2.0"
|
||||||
tracing-opentelemetry = "0.18.0"
|
tracing-opentelemetry = "0.19.0"
|
||||||
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
|
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
|
||||||
url = "2.2"
|
url = "2.2"
|
||||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ WORKDIR /home/nonroot
|
|||||||
|
|
||||||
COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
|
COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
|
||||||
COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
|
COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
|
||||||
|
COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
|
||||||
COPY --chown=nonroot pgxn pgxn
|
COPY --chown=nonroot pgxn pgxn
|
||||||
COPY --chown=nonroot Makefile Makefile
|
COPY --chown=nonroot Makefile Makefile
|
||||||
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
|
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
|
||||||
@@ -39,6 +40,7 @@ ARG CACHEPOT_BUCKET=neon-github-dev
|
|||||||
|
|
||||||
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
|
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
|
||||||
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
|
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
|
||||||
|
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
|
||||||
COPY --chown=nonroot . .
|
COPY --chown=nonroot . .
|
||||||
|
|
||||||
# Show build caching stats to check if it was used in the end.
|
# Show build caching stats to check if it was used in the end.
|
||||||
@@ -79,6 +81,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy
|
|||||||
|
|
||||||
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
||||||
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
||||||
|
COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
|
||||||
COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
|
COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
|
||||||
|
|
||||||
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
|
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
|
||||||
|
|||||||
@@ -132,10 +132,20 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.ta
|
|||||||
FROM build-deps AS h3-pg-build
|
FROM build-deps AS h3-pg-build
|
||||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
|
|
||||||
# packaged cmake is too old
|
RUN case "$(uname -m)" in \
|
||||||
RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
|
"x86_64") \
|
||||||
|
export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
|
||||||
|
;; \
|
||||||
|
"aarch64") \
|
||||||
|
export CMAKE_CHECKSUM=281b42627c9a1beed03e29706574d04c6c53fae4994472e90985ef018dd29c02 \
|
||||||
|
;; \
|
||||||
|
*) \
|
||||||
|
echo "Unsupported architecture '$(uname -m)'. Supported are x86_64 and aarch64" && exit 1 \
|
||||||
|
;; \
|
||||||
|
esac && \
|
||||||
|
wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-$(uname -m).sh \
|
||||||
-q -O /tmp/cmake-install.sh \
|
-q -O /tmp/cmake-install.sh \
|
||||||
&& echo "739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 /tmp/cmake-install.sh" | sha256sum --check \
|
&& echo "${CMAKE_CHECKSUM} /tmp/cmake-install.sh" | sha256sum --check \
|
||||||
&& chmod u+x /tmp/cmake-install.sh \
|
&& chmod u+x /tmp/cmake-install.sh \
|
||||||
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
|
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
|
||||||
&& rm /tmp/cmake-install.sh
|
&& rm /tmp/cmake-install.sh
|
||||||
|
|||||||
17
Makefile
17
Makefile
@@ -83,6 +83,8 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
|
|||||||
# I'm not sure why it wouldn't work, but this is the only place (apart from
|
# I'm not sure why it wouldn't work, but this is the only place (apart from
|
||||||
# the "build-all-versions" entry points) where direct mention of PostgreSQL
|
# the "build-all-versions" entry points) where direct mention of PostgreSQL
|
||||||
# versions is used.
|
# versions is used.
|
||||||
|
.PHONY: postgres-configure-v16
|
||||||
|
postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
|
||||||
.PHONY: postgres-configure-v15
|
.PHONY: postgres-configure-v15
|
||||||
postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
|
postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
|
||||||
.PHONY: postgres-configure-v14
|
.PHONY: postgres-configure-v14
|
||||||
@@ -165,28 +167,33 @@ neon-pg-ext-clean-%:
|
|||||||
.PHONY: neon-pg-ext
|
.PHONY: neon-pg-ext
|
||||||
neon-pg-ext: \
|
neon-pg-ext: \
|
||||||
neon-pg-ext-v14 \
|
neon-pg-ext-v14 \
|
||||||
neon-pg-ext-v15
|
neon-pg-ext-v15 \
|
||||||
|
neon-pg-ext-v16
|
||||||
|
|
||||||
.PHONY: neon-pg-ext-clean
|
.PHONY: neon-pg-ext-clean
|
||||||
neon-pg-ext-clean: \
|
neon-pg-ext-clean: \
|
||||||
neon-pg-ext-clean-v14 \
|
neon-pg-ext-clean-v14 \
|
||||||
neon-pg-ext-clean-v15
|
neon-pg-ext-clean-v15 \
|
||||||
|
neon-pg-ext-clean-v16
|
||||||
|
|
||||||
# shorthand to build all Postgres versions
|
# shorthand to build all Postgres versions
|
||||||
.PHONY: postgres
|
.PHONY: postgres
|
||||||
postgres: \
|
postgres: \
|
||||||
postgres-v14 \
|
postgres-v14 \
|
||||||
postgres-v15
|
postgres-v15 \
|
||||||
|
postgres-v16
|
||||||
|
|
||||||
.PHONY: postgres-headers
|
.PHONY: postgres-headers
|
||||||
postgres-headers: \
|
postgres-headers: \
|
||||||
postgres-headers-v14 \
|
postgres-headers-v14 \
|
||||||
postgres-headers-v15
|
postgres-headers-v15 \
|
||||||
|
postgres-headers-v16
|
||||||
|
|
||||||
.PHONY: postgres-clean
|
.PHONY: postgres-clean
|
||||||
postgres-clean: \
|
postgres-clean: \
|
||||||
postgres-clean-v14 \
|
postgres-clean-v14 \
|
||||||
postgres-clean-v15
|
postgres-clean-v15 \
|
||||||
|
postgres-clean-v16
|
||||||
|
|
||||||
# This doesn't remove the effects of 'configure'.
|
# This doesn't remove the effects of 'configure'.
|
||||||
.PHONY: clean
|
.PHONY: clean
|
||||||
|
|||||||
@@ -6,8 +6,10 @@ license.workspace = true
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
|
async-compression.workspace = true
|
||||||
chrono.workspace = true
|
chrono.workspace = true
|
||||||
clap.workspace = true
|
clap.workspace = true
|
||||||
|
flate2.workspace = true
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
hyper = { workspace = true, features = ["full"] }
|
hyper = { workspace = true, features = ["full"] }
|
||||||
notify.workspace = true
|
notify.workspace = true
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
use std::fs;
|
use std::fs;
|
||||||
|
use std::io::BufRead;
|
||||||
use std::os::unix::fs::PermissionsExt;
|
use std::os::unix::fs::PermissionsExt;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::process::{Command, Stdio};
|
use std::process::{Command, Stdio};
|
||||||
@@ -15,6 +16,7 @@ use utils::lsn::Lsn;
|
|||||||
|
|
||||||
use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
||||||
use compute_api::spec::{ComputeMode, ComputeSpec};
|
use compute_api::spec::{ComputeMode, ComputeSpec};
|
||||||
|
use utils::measured_stream::MeasuredReader;
|
||||||
|
|
||||||
use crate::config;
|
use crate::config;
|
||||||
use crate::pg_helpers::*;
|
use crate::pg_helpers::*;
|
||||||
@@ -140,14 +142,14 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
|
|||||||
.cluster
|
.cluster
|
||||||
.roles
|
.roles
|
||||||
.iter()
|
.iter()
|
||||||
.map(|r| format!("'{}'", escape_literal(&r.name)))
|
.map(|r| escape_literal(&r.name))
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
let dbs = spec
|
let dbs = spec
|
||||||
.cluster
|
.cluster
|
||||||
.databases
|
.databases
|
||||||
.iter()
|
.iter()
|
||||||
.map(|db| format!("'{}'", escape_literal(&db.name)))
|
.map(|db| escape_literal(&db.name))
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
let roles_decl = if roles.is_empty() {
|
let roles_decl = if roles.is_empty() {
|
||||||
@@ -253,20 +255,52 @@ impl ComputeNode {
|
|||||||
|
|
||||||
let mut client = config.connect(NoTls)?;
|
let mut client = config.connect(NoTls)?;
|
||||||
let basebackup_cmd = match lsn {
|
let basebackup_cmd = match lsn {
|
||||||
Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id), // First start of the compute
|
// HACK We don't use compression on first start (Lsn(0)) because there's no API for it
|
||||||
_ => format!("basebackup {} {} {}", spec.tenant_id, spec.timeline_id, lsn),
|
Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id),
|
||||||
|
_ => format!(
|
||||||
|
"basebackup {} {} {} --gzip",
|
||||||
|
spec.tenant_id, spec.timeline_id, lsn
|
||||||
|
),
|
||||||
};
|
};
|
||||||
|
|
||||||
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
||||||
|
let mut measured_reader = MeasuredReader::new(copyreader);
|
||||||
|
|
||||||
|
// Check the magic number to see if it's a gzip or not. Even though
|
||||||
|
// we might explicitly ask for gzip, an old pageserver with no implementation
|
||||||
|
// of gzip compression might send us uncompressed data. After some time
|
||||||
|
// passes we can assume all pageservers know how to compress and we can
|
||||||
|
// delete this check.
|
||||||
|
//
|
||||||
|
// If the data is not gzip, it will be tar. It will not be mistakenly
|
||||||
|
// recognized as gzip because tar starts with an ascii encoding of a filename,
|
||||||
|
// and 0x1f and 0x8b are unlikely first characters for any filename. Moreover,
|
||||||
|
// we send the "global" directory first from the pageserver, so it definitely
|
||||||
|
// won't be recognized as gzip.
|
||||||
|
let mut bufreader = std::io::BufReader::new(&mut measured_reader);
|
||||||
|
let gzip = {
|
||||||
|
let peek = bufreader.fill_buf().unwrap();
|
||||||
|
peek[0] == 0x1f && peek[1] == 0x8b
|
||||||
|
};
|
||||||
|
|
||||||
// Read the archive directly from the `CopyOutReader`
|
// Read the archive directly from the `CopyOutReader`
|
||||||
//
|
//
|
||||||
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
||||||
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
||||||
// sends an Error after finishing the tarball, we will not notice it.
|
// sends an Error after finishing the tarball, we will not notice it.
|
||||||
let mut ar = tar::Archive::new(copyreader);
|
if gzip {
|
||||||
ar.set_ignore_zeros(true);
|
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
|
||||||
ar.unpack(&self.pgdata)?;
|
ar.set_ignore_zeros(true);
|
||||||
|
ar.unpack(&self.pgdata)?;
|
||||||
|
} else {
|
||||||
|
let mut ar = tar::Archive::new(&mut bufreader);
|
||||||
|
ar.set_ignore_zeros(true);
|
||||||
|
ar.unpack(&self.pgdata)?;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Report metrics
|
||||||
|
self.state.lock().unwrap().metrics.basebackup_bytes =
|
||||||
|
measured_reader.get_byte_count() as u64;
|
||||||
self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
|
self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
|
||||||
.signed_duration_since(start_time)
|
.signed_duration_since(start_time)
|
||||||
.to_std()
|
.to_std()
|
||||||
@@ -549,6 +583,13 @@ impl ComputeNode {
|
|||||||
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
|
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Log metrics so that we can search for slow operations in logs
|
||||||
|
let metrics = {
|
||||||
|
let state = self.state.lock().unwrap();
|
||||||
|
state.metrics.clone()
|
||||||
|
};
|
||||||
|
info!(?metrics, "compute start finished");
|
||||||
|
|
||||||
Ok(pg)
|
Ok(pg)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -47,30 +47,22 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
|
|||||||
// Add options for connecting to storage
|
// Add options for connecting to storage
|
||||||
writeln!(file, "# Neon storage settings")?;
|
writeln!(file, "# Neon storage settings")?;
|
||||||
if let Some(s) = &spec.pageserver_connstring {
|
if let Some(s) = &spec.pageserver_connstring {
|
||||||
writeln!(
|
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
|
||||||
file,
|
|
||||||
"neon.pageserver_connstring='{}'",
|
|
||||||
escape_conf_value(s)
|
|
||||||
)?;
|
|
||||||
}
|
}
|
||||||
if !spec.safekeeper_connstrings.is_empty() {
|
if !spec.safekeeper_connstrings.is_empty() {
|
||||||
writeln!(
|
writeln!(
|
||||||
file,
|
file,
|
||||||
"neon.safekeepers='{}'",
|
"neon.safekeepers={}",
|
||||||
escape_conf_value(&spec.safekeeper_connstrings.join(","))
|
escape_conf_value(&spec.safekeeper_connstrings.join(","))
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
if let Some(s) = &spec.tenant_id {
|
if let Some(s) = &spec.tenant_id {
|
||||||
writeln!(
|
writeln!(file, "neon.tenant_id={}", escape_conf_value(&s.to_string()))?;
|
||||||
file,
|
|
||||||
"neon.tenant_id='{}'",
|
|
||||||
escape_conf_value(&s.to_string())
|
|
||||||
)?;
|
|
||||||
}
|
}
|
||||||
if let Some(s) = &spec.timeline_id {
|
if let Some(s) = &spec.timeline_id {
|
||||||
writeln!(
|
writeln!(
|
||||||
file,
|
file,
|
||||||
"neon.timeline_id='{}'",
|
"neon.timeline_id={}",
|
||||||
escape_conf_value(&s.to_string())
|
escape_conf_value(&s.to_string())
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,15 +16,26 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
|||||||
|
|
||||||
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
||||||
|
|
||||||
/// Escape a string for including it in a SQL literal
|
/// Escape a string for including it in a SQL literal. Wrapping the result
|
||||||
|
/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
|
||||||
|
/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
|
||||||
|
/// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
|
||||||
|
/// for the original implementation.
|
||||||
pub fn escape_literal(s: &str) -> String {
|
pub fn escape_literal(s: &str) -> String {
|
||||||
s.replace('\'', "''").replace('\\', "\\\\")
|
let res = s.replace('\'', "''").replace('\\', "\\\\");
|
||||||
|
|
||||||
|
if res.contains('\\') {
|
||||||
|
format!("E'{}'", res)
|
||||||
|
} else {
|
||||||
|
format!("'{}'", res)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Escape a string so that it can be used in postgresql.conf.
|
/// Escape a string so that it can be used in postgresql.conf. Wrapping the result
|
||||||
/// Same as escape_literal, currently.
|
/// with `'{}'` is not required, as it returns a ready-to-use config string.
|
||||||
pub fn escape_conf_value(s: &str) -> String {
|
pub fn escape_conf_value(s: &str) -> String {
|
||||||
s.replace('\'', "''").replace('\\', "\\\\")
|
let res = s.replace('\'', "''").replace('\\', "\\\\");
|
||||||
|
format!("'{}'", res)
|
||||||
}
|
}
|
||||||
|
|
||||||
trait GenericOptionExt {
|
trait GenericOptionExt {
|
||||||
@@ -37,7 +48,7 @@ impl GenericOptionExt for GenericOption {
|
|||||||
fn to_pg_option(&self) -> String {
|
fn to_pg_option(&self) -> String {
|
||||||
if let Some(val) = &self.value {
|
if let Some(val) = &self.value {
|
||||||
match self.vartype.as_ref() {
|
match self.vartype.as_ref() {
|
||||||
"string" => format!("{} '{}'", self.name, escape_literal(val)),
|
"string" => format!("{} {}", self.name, escape_literal(val)),
|
||||||
_ => format!("{} {}", self.name, val),
|
_ => format!("{} {}", self.name, val),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -49,7 +60,7 @@ impl GenericOptionExt for GenericOption {
|
|||||||
fn to_pg_setting(&self) -> String {
|
fn to_pg_setting(&self) -> String {
|
||||||
if let Some(val) = &self.value {
|
if let Some(val) = &self.value {
|
||||||
match self.vartype.as_ref() {
|
match self.vartype.as_ref() {
|
||||||
"string" => format!("{} = '{}'", self.name, escape_conf_value(val)),
|
"string" => format!("{} = {}", self.name, escape_conf_value(val)),
|
||||||
_ => format!("{} = {}", self.name, val),
|
_ => format!("{} = {}", self.name, val),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -397,10 +397,44 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
// We do not check either DB exists or not,
|
// We do not check either DB exists or not,
|
||||||
// Postgres will take care of it for us
|
// Postgres will take care of it for us
|
||||||
"delete_db" => {
|
"delete_db" => {
|
||||||
let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.pg_quote());
|
// In Postgres we can't drop a database if it is a template.
|
||||||
|
// So we need to unset the template flag first, but it could
|
||||||
|
// be a retry, so we could've already dropped the database.
|
||||||
|
// Check that database exists first to make it idempotent.
|
||||||
|
let unset_template_query: String = format!(
|
||||||
|
"
|
||||||
|
DO $$
|
||||||
|
BEGIN
|
||||||
|
IF EXISTS(
|
||||||
|
SELECT 1
|
||||||
|
FROM pg_catalog.pg_database
|
||||||
|
WHERE datname = {}
|
||||||
|
)
|
||||||
|
THEN
|
||||||
|
ALTER DATABASE {} is_template false;
|
||||||
|
END IF;
|
||||||
|
END
|
||||||
|
$$;",
|
||||||
|
escape_literal(&op.name),
|
||||||
|
&op.name.pg_quote()
|
||||||
|
);
|
||||||
|
// Use FORCE to drop database even if there are active connections.
|
||||||
|
// We run this from `cloud_admin`, so it should have enough privileges.
|
||||||
|
// NB: there could be other db states, which prevent us from dropping
|
||||||
|
// the database. For example, if db is used by any active subscription
|
||||||
|
// or replication slot.
|
||||||
|
// TODO: deal with it once we allow logical replication. Proper fix should
|
||||||
|
// involve returning an error code to the control plane, so it could
|
||||||
|
// figure out that this is a non-retryable error, return it to the user
|
||||||
|
// and fail operation permanently.
|
||||||
|
let drop_db_query: String = format!(
|
||||||
|
"DROP DATABASE IF EXISTS {} WITH (FORCE)",
|
||||||
|
&op.name.pg_quote()
|
||||||
|
);
|
||||||
|
|
||||||
warn!("deleting database '{}'", &op.name);
|
warn!("deleting database '{}'", &op.name);
|
||||||
client.execute(query.as_str(), &[])?;
|
client.execute(unset_template_query.as_str(), &[])?;
|
||||||
|
client.execute(drop_db_query.as_str(), &[])?;
|
||||||
}
|
}
|
||||||
"rename_db" => {
|
"rename_db" => {
|
||||||
let new_name = op.new_name.as_ref().unwrap();
|
let new_name = op.new_name.as_ref().unwrap();
|
||||||
|
|||||||
@@ -89,4 +89,12 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
|
|||||||
assert_eq!(none_generic_options.find("missed_value"), None);
|
assert_eq!(none_generic_options.find("missed_value"), None);
|
||||||
assert_eq!(none_generic_options.find("invalid_value"), None);
|
assert_eq!(none_generic_options.find("invalid_value"), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_escape_literal() {
|
||||||
|
assert_eq!(escape_literal("test"), "'test'");
|
||||||
|
assert_eq!(escape_literal("test'"), "'test'''");
|
||||||
|
assert_eq!(escape_literal("test\\'"), "E'test\\\\'''");
|
||||||
|
assert_eq!(escape_literal("test\\'\\'"), "E'test\\\\''\\\\'''");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,7 +10,7 @@
|
|||||||
//! (non-Neon binaries don't necessarily follow our pidfile conventions).
|
//! (non-Neon binaries don't necessarily follow our pidfile conventions).
|
||||||
//! The pid stored in the file is later used to stop the service.
|
//! The pid stored in the file is later used to stop the service.
|
||||||
//!
|
//!
|
||||||
//! See [`lock_file`] module for more info.
|
//! See the [`lock_file`](utils::lock_file) module for more info.
|
||||||
|
|
||||||
use std::ffi::OsStr;
|
use std::ffi::OsStr;
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
|
|||||||
@@ -2,8 +2,9 @@
|
|||||||
//!
|
//!
|
||||||
//! In the local test environment, the data for each safekeeper is stored in
|
//! In the local test environment, the data for each safekeeper is stored in
|
||||||
//!
|
//!
|
||||||
|
//! ```text
|
||||||
//! .neon/safekeepers/<safekeeper id>
|
//! .neon/safekeepers/<safekeeper id>
|
||||||
//!
|
//! ```
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
|
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|||||||
@@ -2,7 +2,9 @@
|
|||||||
//!
|
//!
|
||||||
//! In the local test environment, the data for each endpoint is stored in
|
//! In the local test environment, the data for each endpoint is stored in
|
||||||
//!
|
//!
|
||||||
|
//! ```text
|
||||||
//! .neon/endpoints/<endpoint id>
|
//! .neon/endpoints/<endpoint id>
|
||||||
|
//! ```
|
||||||
//!
|
//!
|
||||||
//! Some basic information about the endpoint, like the tenant and timeline IDs,
|
//! Some basic information about the endpoint, like the tenant and timeline IDs,
|
||||||
//! are stored in the `endpoint.json` file. The `endpoint.json` file is created
|
//! are stored in the `endpoint.json` file. The `endpoint.json` file is created
|
||||||
@@ -22,7 +24,7 @@
|
|||||||
//!
|
//!
|
||||||
//! Directory contents:
|
//! Directory contents:
|
||||||
//!
|
//!
|
||||||
//! ```ignore
|
//! ```text
|
||||||
//! .neon/endpoints/main/
|
//! .neon/endpoints/main/
|
||||||
//! compute.log - log output of `compute_ctl` and `postgres`
|
//! compute.log - log output of `compute_ctl` and `postgres`
|
||||||
//! endpoint.json - serialized `EndpointConf` struct
|
//! endpoint.json - serialized `EndpointConf` struct
|
||||||
|
|||||||
@@ -169,6 +169,7 @@ impl LocalEnv {
|
|||||||
match pg_version {
|
match pg_version {
|
||||||
14 => Ok(path.join(format!("v{pg_version}"))),
|
14 => Ok(path.join(format!("v{pg_version}"))),
|
||||||
15 => Ok(path.join(format!("v{pg_version}"))),
|
15 => Ok(path.join(format!("v{pg_version}"))),
|
||||||
|
16 => Ok(path.join(format!("v{pg_version}"))),
|
||||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -177,6 +178,7 @@ impl LocalEnv {
|
|||||||
match pg_version {
|
match pg_version {
|
||||||
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
||||||
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
||||||
|
16 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
||||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -184,6 +186,7 @@ impl LocalEnv {
|
|||||||
match pg_version {
|
match pg_version {
|
||||||
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
||||||
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
||||||
|
16 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
||||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,8 +2,9 @@
|
|||||||
//!
|
//!
|
||||||
//! In the local test environment, the data for each safekeeper is stored in
|
//! In the local test environment, the data for each safekeeper is stored in
|
||||||
//!
|
//!
|
||||||
|
//! ```text
|
||||||
//! .neon/safekeepers/<safekeeper id>
|
//! .neon/safekeepers/<safekeeper id>
|
||||||
//!
|
//! ```
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::process::Child;
|
use std::process::Child;
|
||||||
|
|||||||
84
docs/rfcs/024-user-mgmt.md
Normal file
84
docs/rfcs/024-user-mgmt.md
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
# Postgres user and database management
|
||||||
|
|
||||||
|
(This supersedes the previous proposal that looked too complicated and desynchronization-prone)
|
||||||
|
|
||||||
|
We've accumulated a bunch of problems with our approach to role and database management, namely:
|
||||||
|
|
||||||
|
1. we don't allow role and database creation from Postgres, and users are complaining about that
|
||||||
|
2. fine-grained role management is not possible from either Postgres or the console
|
||||||
|
|
||||||
|
Right now, we do store users and databases both in console and Postgres, and there are two main reasons for
|
||||||
|
that:
|
||||||
|
|
||||||
|
* we want to be able to authenticate users in proxy against the console without Postgres' involvement. Otherwise,
|
||||||
|
malicious brute force attempts will wake up Postgres (expensive) and may exhaust the Postgres connection limit (denial of service).
|
||||||
|
* it is handy when we can render console UI without waking up compute (e.g., show database list)
|
||||||
|
|
||||||
|
This RFC doesn't talk about giving root access to the database, which is blocked by a secure runtime setup.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
* Add a Postgres extension that sends an HTTP request each time a transaction that modifies users/databases is about to commit.
|
||||||
|
* Add user management API to internal console API. Also, the console should put a JWT token into the compute so that it can access management API.
|
||||||
|
|
||||||
|
## Postgres behavior
|
||||||
|
|
||||||
|
The default user role (@username) should have `CREATE ROLE`, `CREATE DB`, and `BYPASSRLS` privileges. We expose the Postgres port
|
||||||
|
to the open internet, so we need to check password strength. Now console generates strong passwords, so there is no risk of having dumb passwords. With user-provided passwords, such risks exist.
|
||||||
|
|
||||||
|
Since we store passwords in the console, we should also send the unencrypted password when a role is created/changed. Hence communication with the console must be encrypted. Postgres also supports creating roles using hashes; in that case, we will not be able to get a raw password. So I can see the following options here:
|
||||||
|
* roles created via SQL will *not* have raw passwords in the console
|
||||||
|
* roles created via SQL will have raw passwords in the console, except ones that were created using hashes
|
||||||
|
|
||||||
|
I'm leaning towards the second option here as it is a bit more consistent one -- if raw password storage is enabled then we store passwords in all cases where we can store them.
|
||||||
|
|
||||||
|
To send data about roles and databases from Postgres to the console we can create the following Postgres extension:
|
||||||
|
|
||||||
|
* Intercept role/database changes in `ProcessUtility_hook`. Here we have access to the query statement with the raw password. The hook handler itself should not dial the console immediately but rather stash info in some hashmap for later use.
|
||||||
|
* When the transaction is about to commit we execute collected role modifications (all as one -- console should either accept all or reject all, and hence API shouldn't be REST-like). If the console request fails we can roll back the transaction. This way if the transaction is committed we know for sure that console has this information. We can use `XACT_EVENT_PRE_COMMIT` and `XACT_EVENT_PARALLEL_PRE_COMMIT` for that.
|
||||||
|
* Extension should be mindful of the fact that it is possible to create and delete roles within the transaction.
|
||||||
|
* We also need to track who the database owner is; some extra code may be needed to get the current user when the database is created.
|
||||||
|
|
||||||
|
## Console user management API
|
||||||
|
|
||||||
|
The current public API has REST API for role management. We need to have some analog for the internal API (called mgmt API in the console code). But unlike public API here we want to have an atomic way to create several roles/databases (in cases when several roles were created in the same transaction). So something like that may work:
|
||||||
|
|
||||||
|
```
|
||||||
|
curl -X PATCH /api/v1/roles_and_databases -d '
|
||||||
|
[
|
||||||
|
{"op":"create", "type":"role", "name": "kurt", "password":"lYgT3BlbkFJ2vBZrqv"},
|
||||||
|
{"op":"drop", "type":"role", "name": "trout"},
|
||||||
|
{"op":"alter", "type":"role", "name": "kilgore", "password":"3BlbkFJ2vB"},
|
||||||
|
{"op":"create", "type":"database", "name": "db2", "owner": "eliot"},
|
||||||
|
]
|
||||||
|
'
|
||||||
|
```
|
||||||
|
|
||||||
|
Makes sense not to error out on duplicated create/delete operations (see failure modes)
|
||||||
|
|
||||||
|
## Managing users from the console
|
||||||
|
|
||||||
|
Now console puts a spec file with the list of databases/roles and delta operations in all the compute pods. `compute_ctl` then picks up that file and stubbornly executes deltas and checks that the data in the spec file is the same as in Postgres. This way if the user creates a role in the UI we restart compute with a new spec file and during the start databases/roles are created. So if Postgres sends an HTTP call each time a role is created we need to break recursion in that case. We can do that based on application_name or some GUC or user (local == no HTTP hook).
|
||||||
|
|
||||||
|
Generally, we have several options when we are creating users via console:
|
||||||
|
|
||||||
|
1. restart compute with a new spec file, execute local SQL command; cut recursion in the extension
|
||||||
|
2. "push" spec files into running compute, execute local SQL command; cut recursion in the extension
|
||||||
|
3. "push" spec files into running compute, execute local SQL command; let extension create those roles in the console
|
||||||
|
4. avoid managing roles via spec files, send SQL commands to compute; let extension create those roles in the console
|
||||||
|
|
||||||
|
The last option is the most straightforward one, but with the raw password storage opt-out, we will not have the password to establish an SQL connection. Also, we need a spec for provisioning purposes and to address potential desync (but that is quite unlikely). So I think the easiest approach would be:
|
||||||
|
|
||||||
|
1. keep role management like it is now and cut the recursion in the extension when SQL is executed by compute_ctl
|
||||||
|
2. add "push" endpoint to the compute_ctl to avoid compute restart during the `apply_config` operation -- that can be done as a follow up to avoid increasing scope too much
|
||||||
|
|
||||||
|
## Failure modes
|
||||||
|
|
||||||
|
* during role creation via SQL, the role was created in the console but the connection was dropped before Postgres got acknowledgment or some error happened after acknowledgment (out of disk space, deadlock, etc):
|
||||||
|
|
||||||
|
in that case, Postgres won't have a role that exists in the console. Compute restart will heal it (due to the spec file). Also if the console allows repeated creation/deletion user can repeat the transaction.
|
||||||
|
|
||||||
|
|
||||||
|
## Scalability
|
||||||
|
|
||||||
|
On my laptop, I can create 4200 roles per second. That corresponds to 363 million roles per day. Since each role creation ends up in the console database we can add some limit to the number of roles (could be reasonably big to not run into it often -- like 1k or 10k).
|
||||||
22
docs/tools.md
Normal file
22
docs/tools.md
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# Useful development tools
|
||||||
|
|
||||||
|
This readme contains some hints on how to set up some optional development tools.
|
||||||
|
|
||||||
|
## ccls
|
||||||
|
|
||||||
|
[ccls](https://github.com/MaskRay/ccls) is a c/c++ language server. It requires some setup
|
||||||
|
to work well. There are different ways to do it but here's what works for me:
|
||||||
|
1. Make a common parent directory for all your common neon projects. (for example, `~/src/neondatabase/`)
|
||||||
|
2. Go to `vendor/postgres-v15`
|
||||||
|
3. Run `make clean && ./configure`
|
||||||
|
4. Install [bear](https://github.com/rizsotto/Bear), and run `bear -- make -j4`
|
||||||
|
5. Copy the generated `compile_commands.json` to `~/src/neondatabase` (or equivalent)
|
||||||
|
6. Run `touch ~/src/neondatabase/.ccls-root`; this will make the `compile_commands.json` file discoverable in all subdirectories
|
||||||
|
|
||||||
|
With this setup you will get decent lsp mileage inside the postgres repo, and also any postgres extensions that you put in `~/src/neondatabase/`, like `pg_embedding`, or inside `~/src/neondatabase/neon/pgxn` as well.
|
||||||
|
|
||||||
|
Some additional tips for various IDEs:
|
||||||
|
|
||||||
|
### Emacs
|
||||||
|
|
||||||
|
To improve performance: `(setq lsp-lens-enable nil)`
|
||||||
@@ -71,6 +71,7 @@ pub struct ComputeMetrics {
|
|||||||
pub wait_for_spec_ms: u64,
|
pub wait_for_spec_ms: u64,
|
||||||
pub sync_safekeepers_ms: u64,
|
pub sync_safekeepers_ms: u64,
|
||||||
pub basebackup_ms: u64,
|
pub basebackup_ms: u64,
|
||||||
|
pub basebackup_bytes: u64,
|
||||||
pub start_postgres_ms: u64,
|
pub start_postgres_ms: u64,
|
||||||
pub config_ms: u64,
|
pub config_ms: u64,
|
||||||
pub total_startup_ms: u64,
|
pub total_startup_ms: u64,
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.
|
//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
|
||||||
|
|
||||||
use std::{future::Future, time::Instant};
|
use std::{future::Future, time::Instant};
|
||||||
|
|
||||||
|
|||||||
@@ -411,12 +411,16 @@ pub struct LayerResidenceEvent {
|
|||||||
pub reason: LayerResidenceEventReason,
|
pub reason: LayerResidenceEventReason,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The reason for recording a given [`ResidenceEvent`].
|
/// The reason for recording a given [`LayerResidenceEvent`].
|
||||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||||
pub enum LayerResidenceEventReason {
|
pub enum LayerResidenceEventReason {
|
||||||
/// The layer map is being populated, e.g. during timeline load or attach.
|
/// The layer map is being populated, e.g. during timeline load or attach.
|
||||||
/// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
|
/// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
|
||||||
/// We need to record such events because there is no persistent storage for the events.
|
/// We need to record such events because there is no persistent storage for the events.
|
||||||
|
///
|
||||||
|
// https://github.com/rust-lang/rust/issues/74481
|
||||||
|
/// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
|
||||||
|
/// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
|
||||||
LayerLoad,
|
LayerLoad,
|
||||||
/// We just created the layer (e.g., freeze_and_flush or compaction).
|
/// We just created the layer (e.g., freeze_and_flush or compaction).
|
||||||
/// Such layers are always [`LayerResidenceStatus::Resident`].
|
/// Such layers are always [`LayerResidenceStatus::Resident`].
|
||||||
|
|||||||
@@ -60,8 +60,9 @@ impl Ord for RelTag {
|
|||||||
|
|
||||||
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
|
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
|
||||||
///
|
///
|
||||||
|
/// ```text
|
||||||
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
|
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
|
||||||
///
|
/// ```
|
||||||
impl fmt::Display for RelTag {
|
impl fmt::Display for RelTag {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
if let Some(forkname) = forknumber_to_name(self.forknum) {
|
if let Some(forkname) = forknumber_to_name(self.forknum) {
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
PathBuf::from("pg_install")
|
PathBuf::from("pg_install")
|
||||||
};
|
};
|
||||||
|
|
||||||
for pg_version in &["v14", "v15"] {
|
for pg_version in &["v14", "v15", "v16"] {
|
||||||
let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
|
let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
|
||||||
if pg_install_dir_versioned.is_relative() {
|
if pg_install_dir_versioned.is_relative() {
|
||||||
let cwd = env::current_dir().context("Failed to get current_dir")?;
|
let cwd = env::current_dir().context("Failed to get current_dir")?;
|
||||||
|
|||||||
@@ -51,6 +51,7 @@ macro_rules! for_all_postgres_versions {
|
|||||||
($macro:tt) => {
|
($macro:tt) => {
|
||||||
$macro!(v14);
|
$macro!(v14);
|
||||||
$macro!(v15);
|
$macro!(v15);
|
||||||
|
$macro!(v16);
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -92,9 +93,10 @@ pub use v14::bindings::DBState_DB_SHUTDOWNED;
|
|||||||
pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
|
pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
|
||||||
match version {
|
match version {
|
||||||
14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0),
|
14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0),
|
||||||
15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0
|
15 | 16 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0
|
||||||
|| bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0
|
|| bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0
|
||||||
|| bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0),
|
|| bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0),
|
||||||
|
|
||||||
_ => anyhow::bail!("Unknown version {}", version),
|
_ => anyhow::bail!("Unknown version {}", version),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -110,6 +112,7 @@ pub fn generate_wal_segment(
|
|||||||
match pg_version {
|
match pg_version {
|
||||||
14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
|
14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
|
||||||
15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
|
15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
|
||||||
|
16 => v16::xlog_utils::generate_wal_segment(segno, system_id, lsn),
|
||||||
_ => Err(SerializeError::BadInput),
|
_ => Err(SerializeError::BadInput),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -123,6 +126,7 @@ pub fn generate_pg_control(
|
|||||||
match pg_version {
|
match pg_version {
|
||||||
14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
|
14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
|
||||||
15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
|
15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
|
||||||
|
16 => v16::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
|
||||||
_ => anyhow::bail!("Unknown version {}", pg_version),
|
_ => anyhow::bail!("Unknown version {}", pg_version),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -197,7 +201,7 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber {
|
|||||||
|
|
||||||
pub mod waldecoder {
|
pub mod waldecoder {
|
||||||
|
|
||||||
use crate::{v14, v15};
|
use crate::{v14, v15, v16};
|
||||||
use bytes::{Buf, Bytes, BytesMut};
|
use bytes::{Buf, Bytes, BytesMut};
|
||||||
use std::num::NonZeroU32;
|
use std::num::NonZeroU32;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
@@ -259,6 +263,10 @@ pub mod waldecoder {
|
|||||||
use self::v15::waldecoder_handler::WalStreamDecoderHandler;
|
use self::v15::waldecoder_handler::WalStreamDecoderHandler;
|
||||||
self.poll_decode_internal()
|
self.poll_decode_internal()
|
||||||
}
|
}
|
||||||
|
16 => {
|
||||||
|
use self::v16::waldecoder_handler::WalStreamDecoderHandler;
|
||||||
|
self.poll_decode_internal()
|
||||||
|
}
|
||||||
_ => Err(WalDecodeError {
|
_ => Err(WalDecodeError {
|
||||||
msg: format!("Unknown version {}", self.pg_version),
|
msg: format!("Unknown version {}", self.pg_version),
|
||||||
lsn: self.lsn,
|
lsn: self.lsn,
|
||||||
|
|||||||
@@ -57,9 +57,9 @@ pub fn slru_may_delete_clogsegment(segpage: u32, cutoff_page: u32) -> bool {
|
|||||||
// Multixact utils
|
// Multixact utils
|
||||||
|
|
||||||
pub fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
|
pub fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
|
||||||
((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32)
|
((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
|
||||||
% pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32
|
% pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE
|
||||||
* pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) as usize
|
* pg_constants::MULTIXACT_MEMBERGROUP_SIZE) as usize
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
|
pub fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
|
||||||
|
|||||||
1
libs/postgres_ffi/src/pg_constants_v16.rs
Normal file
1
libs/postgres_ffi/src/pg_constants_v16.rs
Normal file
@@ -0,0 +1 @@
|
|||||||
|
|
||||||
@@ -49,14 +49,16 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
|
||||||
/// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
|
/// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
|
||||||
///
|
///
|
||||||
/// Formats:
|
/// Formats:
|
||||||
|
///
|
||||||
|
/// ```text
|
||||||
/// <oid>
|
/// <oid>
|
||||||
/// <oid>_<fork name>
|
/// <oid>_<fork name>
|
||||||
/// <oid>.<segment number>
|
/// <oid>.<segment number>
|
||||||
/// <oid>_<fork name>.<segment number>
|
/// <oid>_<fork name>.<segment number>
|
||||||
|
/// ```
|
||||||
///
|
///
|
||||||
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
|
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -52,6 +52,7 @@ impl Conf {
|
|||||||
match self.pg_version {
|
match self.pg_version {
|
||||||
14 => Ok(path.join(format!("v{}", self.pg_version))),
|
14 => Ok(path.join(format!("v{}", self.pg_version))),
|
||||||
15 => Ok(path.join(format!("v{}", self.pg_version))),
|
15 => Ok(path.join(format!("v{}", self.pg_version))),
|
||||||
|
16 => Ok(path.join(format!("v{}", self.pg_version))),
|
||||||
_ => bail!("Unsupported postgres version: {}", self.pg_version),
|
_ => bail!("Unsupported postgres version: {}", self.pg_version),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,11 +5,11 @@
|
|||||||
//! It is similar to what tokio_util::codec::Framed with appropriate codec
|
//! It is similar to what tokio_util::codec::Framed with appropriate codec
|
||||||
//! provides, but `FramedReader` and `FramedWriter` read/write parts can be used
|
//! provides, but `FramedReader` and `FramedWriter` read/write parts can be used
|
||||||
//! separately without using split from futures::stream::StreamExt (which
|
//! separately without using split from futures::stream::StreamExt (which
|
||||||
//! allocates box[1] in polling internally). tokio::io::split is used for splitting
|
//! allocates a [Box] in polling internally). tokio::io::split is used for splitting
|
||||||
//! instead. Plus we customize error messages more than a single type for all io
|
//! instead. Plus we customize error messages more than a single type for all io
|
||||||
//! calls.
|
//! calls.
|
||||||
//!
|
//!
|
||||||
//! [1] https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
|
//! [Box]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
|
||||||
use bytes::{Buf, BytesMut};
|
use bytes::{Buf, BytesMut};
|
||||||
use std::{
|
use std::{
|
||||||
future::Future,
|
future::Future,
|
||||||
@@ -117,7 +117,7 @@ impl<S: AsyncWrite + Unpin> Framed<S> {
|
|||||||
impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
|
impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
|
||||||
/// Split into owned read and write parts. Beware of potential issues with
|
/// Split into owned read and write parts. Beware of potential issues with
|
||||||
/// using halves in different tasks on TLS stream:
|
/// using halves in different tasks on TLS stream:
|
||||||
/// https://github.com/tokio-rs/tls/issues/40
|
/// <https://github.com/tokio-rs/tls/issues/40>
|
||||||
pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) {
|
pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) {
|
||||||
let (read_half, write_half) = tokio::io::split(self.stream);
|
let (read_half, write_half) = tokio::io::split(self.stream);
|
||||||
let reader = FramedReader {
|
let reader = FramedReader {
|
||||||
|
|||||||
@@ -934,6 +934,15 @@ impl<'a> BeMessage<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
|
||||||
|
let mut terminated = [0; 6];
|
||||||
|
for (i, &elem) in code.iter().enumerate() {
|
||||||
|
terminated[i] = elem;
|
||||||
|
}
|
||||||
|
|
||||||
|
terminated
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
@@ -965,12 +974,3 @@ mod tests {
|
|||||||
assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
|
assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
|
|
||||||
let mut terminated = [0; 6];
|
|
||||||
for (i, &elem) in code.iter().enumerate() {
|
|
||||||
terminated[i] = elem;
|
|
||||||
}
|
|
||||||
|
|
||||||
terminated
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -34,12 +34,12 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
|
|||||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||||
/// ~200 RPS for IAM services
|
/// ~200 RPS for IAM services
|
||||||
/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
|
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
|
||||||
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
||||||
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
|
/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
|
||||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||||
/// No limits on the client side, which currently means 1000 for AWS S3.
|
/// No limits on the client side, which currently means 1000 for AWS S3.
|
||||||
/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax
|
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
|
||||||
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
||||||
|
|
||||||
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||||
@@ -50,6 +50,12 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
|||||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
pub struct RemotePath(PathBuf);
|
pub struct RemotePath(PathBuf);
|
||||||
|
|
||||||
|
impl std::fmt::Display for RemotePath {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "{}", self.0.display())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl RemotePath {
|
impl RemotePath {
|
||||||
pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
|
pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
|
||||||
anyhow::ensure!(
|
anyhow::ensure!(
|
||||||
|
|||||||
@@ -7,6 +7,7 @@
|
|||||||
use std::{
|
use std::{
|
||||||
borrow::Cow,
|
borrow::Cow,
|
||||||
future::Future,
|
future::Future,
|
||||||
|
io::ErrorKind,
|
||||||
path::{Path, PathBuf},
|
path::{Path, PathBuf},
|
||||||
pin::Pin,
|
pin::Pin,
|
||||||
};
|
};
|
||||||
@@ -150,10 +151,7 @@ impl RemoteStorage for LocalFs {
|
|||||||
let mut files = vec![];
|
let mut files = vec![];
|
||||||
let mut directory_queue = vec![full_path.clone()];
|
let mut directory_queue = vec![full_path.clone()];
|
||||||
|
|
||||||
while !directory_queue.is_empty() {
|
while let Some(cur_folder) = directory_queue.pop() {
|
||||||
let cur_folder = directory_queue
|
|
||||||
.pop()
|
|
||||||
.expect("queue cannot be empty: we just checked");
|
|
||||||
let mut entries = fs::read_dir(cur_folder.clone()).await?;
|
let mut entries = fs::read_dir(cur_folder.clone()).await?;
|
||||||
while let Some(entry) = entries.next_entry().await? {
|
while let Some(entry) = entries.next_entry().await? {
|
||||||
let file_name: PathBuf = entry.file_name().into();
|
let file_name: PathBuf = entry.file_name().into();
|
||||||
@@ -343,18 +341,14 @@ impl RemoteStorage for LocalFs {
|
|||||||
|
|
||||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||||
let file_path = path.with_base(&self.storage_root);
|
let file_path = path.with_base(&self.storage_root);
|
||||||
if !file_path.exists() {
|
match fs::remove_file(&file_path).await {
|
||||||
|
Ok(()) => Ok(()),
|
||||||
|
// The file doesn't exist. This shouldn't yield an error to mirror S3's behaviour.
|
||||||
// See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html
|
// See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html
|
||||||
// > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
|
// > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
|
||||||
return Ok(());
|
Err(e) if e.kind() == ErrorKind::NotFound => Ok(()),
|
||||||
|
Err(e) => Err(anyhow::anyhow!(e)),
|
||||||
}
|
}
|
||||||
|
|
||||||
if !file_path.is_file() {
|
|
||||||
anyhow::bail!("{file_path:?} is not a file");
|
|
||||||
}
|
|
||||||
Ok(fs::remove_file(file_path)
|
|
||||||
.await
|
|
||||||
.map_err(|e| anyhow::anyhow!(e))?)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
|
|||||||
// 2. D+C+a+b
|
// 2. D+C+a+b
|
||||||
// 3. D+A+B
|
// 3. D+A+B
|
||||||
|
|
||||||
/// [`Segment`] which has had it's size calculated.
|
/// `Segment` which has had its size calculated.
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
struct SegmentSize {
|
struct SegmentSize {
|
||||||
method: SegmentMethod,
|
method: SegmentMethod,
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ pub enum OtelName<'a> {
|
|||||||
/// directly into HTTP servers. However, I couldn't find one for Hyper,
|
/// directly into HTTP servers. However, I couldn't find one for Hyper,
|
||||||
/// so I had to write our own. OpenTelemetry website has a registry of
|
/// so I had to write our own. OpenTelemetry website has a registry of
|
||||||
/// instrumentation libraries at:
|
/// instrumentation libraries at:
|
||||||
/// https://opentelemetry.io/registry/?language=rust&component=instrumentation
|
/// <https://opentelemetry.io/registry/?language=rust&component=instrumentation>
|
||||||
/// If a Hyper crate appears, consider switching to that.
|
/// If a Hyper crate appears, consider switching to that.
|
||||||
pub async fn tracing_handler<F, R>(
|
pub async fn tracing_handler<F, R>(
|
||||||
req: Request<Body>,
|
req: Request<Body>,
|
||||||
|
|||||||
@@ -40,6 +40,8 @@ pq_proto.workspace = true
|
|||||||
metrics.workspace = true
|
metrics.workspace = true
|
||||||
workspace_hack.workspace = true
|
workspace_hack.workspace = true
|
||||||
|
|
||||||
|
const_format.workspace = true
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
byteorder.workspace = true
|
byteorder.workspace = true
|
||||||
bytes.workspace = true
|
bytes.workspace = true
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
|
|||||||
.map_err(ApiError::BadRequest)
|
.map_err(ApiError::BadRequest)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Will be removed as part of https://github.com/neondatabase/neon/issues/4282
|
/// Will be removed as part of <https://github.com/neondatabase/neon/issues/4282>
|
||||||
pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
|
pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
|
||||||
request: &mut Request<Body>,
|
request: &mut Request<Body>,
|
||||||
) -> Result<Option<T>, ApiError> {
|
) -> Result<Option<T>, ApiError> {
|
||||||
|
|||||||
@@ -109,10 +109,16 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
|
|||||||
/// * building in docker (either in CI or locally)
|
/// * building in docker (either in CI or locally)
|
||||||
///
|
///
|
||||||
/// One thing to note is that .git is not available in docker (and it is bad to include it there).
|
/// One thing to note is that .git is not available in docker (and it is bad to include it there).
|
||||||
/// So everything becides docker build is covered by git_version crate, and docker uses a `GIT_VERSION` argument to get the value required.
|
/// When building locally, the `git_version` is used to query .git. When building on CI and docker,
|
||||||
/// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro.
|
/// we don't build the actual PR branch commits, but always a "phantom" would be merge commit to
|
||||||
/// Git version received from environment variable used as a fallback in git_version invocation.
|
/// the target branch -- the actual PR commit from which we build is supplied as GIT_VERSION
|
||||||
/// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option.
|
/// environment variable.
|
||||||
|
///
|
||||||
|
/// We ended up with this compromise between phantom would be merge commits vs. pull request branch
|
||||||
|
/// heads due to old logs becoming more reliable (github could gc the phantom merge commit
|
||||||
|
/// anytime) in #4641.
|
||||||
|
///
|
||||||
|
/// To avoid running buildscript every recompilation, we use rerun-if-env-changed option.
|
||||||
/// So the build script will be run only when GIT_VERSION envvar has changed.
|
/// So the build script will be run only when GIT_VERSION envvar has changed.
|
||||||
///
|
///
|
||||||
/// Why not to use buildscript to get git commit sha directly without procmacro from different crate?
|
/// Why not to use buildscript to get git commit sha directly without procmacro from different crate?
|
||||||
@@ -124,25 +130,36 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
|
|||||||
/// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
|
/// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
|
||||||
///
|
///
|
||||||
/// #############################################################################################
|
/// #############################################################################################
|
||||||
/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
|
/// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details.
|
||||||
/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
|
/// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
|
||||||
/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
|
/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
|
||||||
/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
|
/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
|
||||||
/// The problem needs further investigation and regular `const` declaration instead of a macro.
|
/// The problem needs further investigation and regular `const` declaration instead of a macro.
|
||||||
#[macro_export]
|
#[macro_export]
|
||||||
macro_rules! project_git_version {
|
macro_rules! project_git_version {
|
||||||
($const_identifier:ident) => {
|
($const_identifier:ident) => {
|
||||||
const $const_identifier: &str = git_version::git_version!(
|
// this should try GIT_VERSION first only then git_version::git_version!
|
||||||
prefix = "git:",
|
const $const_identifier: &::core::primitive::str = {
|
||||||
fallback = concat!(
|
const __COMMIT_FROM_GIT: &::core::primitive::str = git_version::git_version! {
|
||||||
"git-env:",
|
prefix = "",
|
||||||
env!("GIT_VERSION", "Missing GIT_VERSION envvar")
|
fallback = "unknown",
|
||||||
),
|
args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
|
||||||
args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
|
};
|
||||||
);
|
|
||||||
|
const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("GIT_VERSION") {
|
||||||
|
::core::option::Option::Some(x) => ["git-env:", x],
|
||||||
|
::core::option::Option::None => ["git:", __COMMIT_FROM_GIT],
|
||||||
|
};
|
||||||
|
|
||||||
|
$crate::__const_format::concatcp!(__ARG[0], __ARG[1])
|
||||||
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Re-export for `project_git_version` macro
|
||||||
|
#[doc(hidden)]
|
||||||
|
pub use const_format as __const_format;
|
||||||
|
|
||||||
/// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime.
|
/// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime.
|
||||||
#[macro_export]
|
#[macro_export]
|
||||||
macro_rules! const_assert {
|
macro_rules! const_assert {
|
||||||
|
|||||||
@@ -1,9 +1,10 @@
|
|||||||
//! A module to create and read lock files.
|
//! A module to create and read lock files.
|
||||||
//!
|
//!
|
||||||
//! File locking is done using [`fcntl::flock`] exclusive locks.
|
//! File locking is done using [`fcntl::flock`] exclusive locks.
|
||||||
//! The only consumer of this module is currently [`pid_file`].
|
//! The only consumer of this module is currently
|
||||||
//! See the module-level comment there for potential pitfalls
|
//! [`pid_file`](crate::pid_file). See the module-level comment
|
||||||
//! with lock files that are used to store PIDs (pidfiles).
|
//! there for potential pitfalls with lock files that are used
|
||||||
|
//! to store PIDs (pidfiles).
|
||||||
|
|
||||||
use std::{
|
use std::{
|
||||||
fs,
|
fs,
|
||||||
@@ -81,7 +82,7 @@ pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFi
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Returned by [`read_and_hold_lock_file`].
|
/// Returned by [`read_and_hold_lock_file`].
|
||||||
/// Check out the [`pid_file`] module for what the variants mean
|
/// Check out the [`pid_file`](crate::pid_file) module for what the variants mean
|
||||||
/// and potential caveats with lock files that are used to store PIDs.
|
/// and potential caveats with lock files that are used to store PIDs.
|
||||||
pub enum LockFileRead {
|
pub enum LockFileRead {
|
||||||
/// No file exists at the given path.
|
/// No file exists at the given path.
|
||||||
|
|||||||
@@ -112,7 +112,7 @@ pub fn init(
|
|||||||
///
|
///
|
||||||
/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
|
/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
|
||||||
/// If the assumptions about the initialization order are not held, use
|
/// If the assumptions about the initialization order are not held, use
|
||||||
/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
|
/// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be
|
||||||
/// lost.
|
/// lost.
|
||||||
#[must_use]
|
#[must_use]
|
||||||
pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
|
pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
use pin_project_lite::pin_project;
|
use pin_project_lite::pin_project;
|
||||||
|
use std::io::Read;
|
||||||
use std::pin::Pin;
|
use std::pin::Pin;
|
||||||
use std::{io, task};
|
use std::{io, task};
|
||||||
use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
|
use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
|
||||||
@@ -75,3 +76,34 @@ impl<S: AsyncWrite + Unpin, R, W: FnMut(usize)> AsyncWrite for MeasuredStream<S,
|
|||||||
self.project().stream.poll_shutdown(context)
|
self.project().stream.poll_shutdown(context)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Wrapper for a reader that counts bytes read.
|
||||||
|
///
|
||||||
|
/// Similar to MeasuredStream but it's one way and it's sync
|
||||||
|
pub struct MeasuredReader<R: Read> {
|
||||||
|
inner: R,
|
||||||
|
byte_count: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: Read> MeasuredReader<R> {
|
||||||
|
pub fn new(reader: R) -> Self {
|
||||||
|
Self {
|
||||||
|
inner: reader,
|
||||||
|
byte_count: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_byte_count(&self) -> usize {
|
||||||
|
self.byte_count
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: Read> Read for MeasuredReader<R> {
|
||||||
|
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
||||||
|
let result = self.inner.read(buf);
|
||||||
|
if let Ok(n_bytes) = result {
|
||||||
|
self.byte_count += n_bytes
|
||||||
|
}
|
||||||
|
result
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -23,9 +23,9 @@ pub enum SeqWaitError {
|
|||||||
|
|
||||||
/// Monotonically increasing value
|
/// Monotonically increasing value
|
||||||
///
|
///
|
||||||
/// It is handy to store some other fields under the same mutex in SeqWait<S>
|
/// It is handy to store some other fields under the same mutex in `SeqWait<S>`
|
||||||
/// (e.g. store prev_record_lsn). So we allow SeqWait to be parametrized with
|
/// (e.g. store prev_record_lsn). So we allow SeqWait to be parametrized with
|
||||||
/// any type that can expose counter. <V> is the type of exposed counter.
|
/// any type that can expose counter. `V` is the type of exposed counter.
|
||||||
pub trait MonotonicCounter<V> {
|
pub trait MonotonicCounter<V> {
|
||||||
/// Bump counter value and check that it goes forward
|
/// Bump counter value and check that it goes forward
|
||||||
/// N.B.: new_val is an actual new value, not a difference.
|
/// N.B.: new_val is an actual new value, not a difference.
|
||||||
@@ -90,7 +90,7 @@ impl<T: Ord> Eq for Waiter<T> {}
|
|||||||
/// [`wait_for`]: SeqWait::wait_for
|
/// [`wait_for`]: SeqWait::wait_for
|
||||||
/// [`advance`]: SeqWait::advance
|
/// [`advance`]: SeqWait::advance
|
||||||
///
|
///
|
||||||
/// <S> means Storage, <V> is type of counter that this storage exposes.
|
/// `S` means Storage, `V` is type of counter that this storage exposes.
|
||||||
///
|
///
|
||||||
pub struct SeqWait<S, V>
|
pub struct SeqWait<S, V>
|
||||||
where
|
where
|
||||||
|
|||||||
@@ -1,8 +1,15 @@
|
|||||||
//! Assert that the current [`tracing::Span`] has a given set of fields.
|
//! Assert that the current [`tracing::Span`] has a given set of fields.
|
||||||
//!
|
//!
|
||||||
|
//! Can only produce meaningful positive results when tracing has been configured as in example.
|
||||||
|
//! Absence of `tracing_error::ErrorLayer` is not detected yet.
|
||||||
|
//!
|
||||||
|
//! `#[cfg(test)]` code will get a pass when using the `check_fields_present` macro in case tracing
|
||||||
|
//! is completely unconfigured.
|
||||||
|
//!
|
||||||
//! # Usage
|
//! # Usage
|
||||||
//!
|
//!
|
||||||
//! ```
|
//! ```rust
|
||||||
|
//! # fn main() {
|
||||||
//! use tracing_subscriber::prelude::*;
|
//! use tracing_subscriber::prelude::*;
|
||||||
//! let registry = tracing_subscriber::registry()
|
//! let registry = tracing_subscriber::registry()
|
||||||
//! .with(tracing_error::ErrorLayer::default());
|
//! .with(tracing_error::ErrorLayer::default());
|
||||||
@@ -20,23 +27,18 @@
|
|||||||
//!
|
//!
|
||||||
//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
|
//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
|
||||||
//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
|
//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
|
||||||
//! match check_fields_present([&extractor]) {
|
//! if let Err(missing) = check_fields_present!([&extractor]) {
|
||||||
//! Ok(()) => {},
|
//! // if you copypaste this to a custom assert method, remember to add #[track_caller]
|
||||||
//! Err(missing) => {
|
//! // to get the "user" code location for the panic.
|
||||||
//! panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::<Vec<_>>());
|
//! panic!("Missing fields: {missing:?}");
|
||||||
//! }
|
|
||||||
//! }
|
//! }
|
||||||
|
//! # }
|
||||||
//! ```
|
//! ```
|
||||||
//!
|
//!
|
||||||
//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
|
//! Recommended reading: <https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering>
|
||||||
//!
|
//!
|
||||||
|
|
||||||
use std::{
|
#[derive(Debug)]
|
||||||
collections::HashSet,
|
|
||||||
fmt::{self},
|
|
||||||
hash::{Hash, Hasher},
|
|
||||||
};
|
|
||||||
|
|
||||||
pub enum ExtractionResult {
|
pub enum ExtractionResult {
|
||||||
Present,
|
Present,
|
||||||
Absent,
|
Absent,
|
||||||
@@ -71,51 +73,105 @@ impl<const L: usize> Extractor for MultiNameExtractor<L> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct MemoryIdentity<'a>(&'a dyn Extractor);
|
/// Checks that the given extractors are satisfied with the current span hierarchy.
|
||||||
|
///
|
||||||
impl<'a> MemoryIdentity<'a> {
|
/// This should not be called directly, but used through [`check_fields_present`] which allows
|
||||||
fn as_ptr(&self) -> *const () {
|
/// `Summary::Unconfigured` only when the calling crate is being `#[cfg(test)]` as a conservative default.
|
||||||
self.0 as *const _ as *const ()
|
#[doc(hidden)]
|
||||||
}
|
pub fn check_fields_present0<const L: usize>(
|
||||||
}
|
|
||||||
impl<'a> PartialEq for MemoryIdentity<'a> {
|
|
||||||
fn eq(&self, other: &Self) -> bool {
|
|
||||||
self.as_ptr() == other.as_ptr()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
impl<'a> Eq for MemoryIdentity<'a> {}
|
|
||||||
impl<'a> Hash for MemoryIdentity<'a> {
|
|
||||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
|
||||||
self.as_ptr().hash(state);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
impl<'a> fmt::Debug for MemoryIdentity<'a> {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The extractor names passed as keys to [`new`].
|
|
||||||
pub fn check_fields_present<const L: usize>(
|
|
||||||
must_be_present: [&dyn Extractor; L],
|
must_be_present: [&dyn Extractor; L],
|
||||||
) -> Result<(), Vec<&dyn Extractor>> {
|
) -> Result<Summary, Vec<&dyn Extractor>> {
|
||||||
let mut missing: HashSet<MemoryIdentity> =
|
let mut missing = must_be_present.into_iter().collect::<Vec<_>>();
|
||||||
HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r)));
|
|
||||||
let trace = tracing_error::SpanTrace::capture();
|
let trace = tracing_error::SpanTrace::capture();
|
||||||
trace.with_spans(|md, _formatted_fields| {
|
trace.with_spans(|md, _formatted_fields| {
|
||||||
missing.retain(|extractor| match extractor.0.extract(md.fields()) {
|
// when trying to understand the inner workings of how does the matching work, note that
|
||||||
|
// this closure might be called zero times if the span is disabled. normally it is called
|
||||||
|
// once per span hierarchy level.
|
||||||
|
missing.retain(|extractor| match extractor.extract(md.fields()) {
|
||||||
ExtractionResult::Present => false,
|
ExtractionResult::Present => false,
|
||||||
ExtractionResult::Absent => true,
|
ExtractionResult::Absent => true,
|
||||||
});
|
});
|
||||||
!missing.is_empty() // continue walking up until we've found all missing
|
|
||||||
|
// continue walking up until we've found all missing
|
||||||
|
!missing.is_empty()
|
||||||
});
|
});
|
||||||
if missing.is_empty() {
|
if missing.is_empty() {
|
||||||
Ok(())
|
Ok(Summary::FoundEverything)
|
||||||
|
} else if !tracing_subscriber_configured() {
|
||||||
|
Ok(Summary::Unconfigured)
|
||||||
} else {
|
} else {
|
||||||
Err(missing.into_iter().map(|mi| mi.0).collect())
|
// we can still hit here if a tracing subscriber has been configured but the ErrorLayer is
|
||||||
|
// missing, which can be annoying. for this case, we could probably use
|
||||||
|
// SpanTrace::status().
|
||||||
|
//
|
||||||
|
// another way to end up here is with RUST_LOG=pageserver=off while configuring the
|
||||||
|
// logging, though I guess in that case the SpanTrace::status() == EMPTY would be valid.
|
||||||
|
// this case is covered by test `not_found_if_tracing_error_subscriber_has_wrong_filter`.
|
||||||
|
Err(missing)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Checks that the given extractors are satisfied with the current span hierarchy.
|
||||||
|
///
|
||||||
|
/// The macro is the preferred way of checking if fields exist while passing checks if a test does
|
||||||
|
/// not have tracing configured.
|
||||||
|
///
|
||||||
|
/// Why mangled name? Because #[macro_export] will expose it at utils::__check_fields_present.
|
||||||
|
/// However we can game a module namespaced macro for `use` purposes by re-exporting the
|
||||||
|
/// #[macro_export] exported name with an alias (below).
|
||||||
|
#[doc(hidden)]
|
||||||
|
#[macro_export]
|
||||||
|
macro_rules! __check_fields_present {
|
||||||
|
($extractors:expr) => {{
|
||||||
|
{
|
||||||
|
use $crate::tracing_span_assert::{check_fields_present0, Summary::*, Extractor};
|
||||||
|
|
||||||
|
match check_fields_present0($extractors) {
|
||||||
|
Ok(FoundEverything) => Ok(()),
|
||||||
|
Ok(Unconfigured) if cfg!(test) => {
|
||||||
|
// allow unconfigured in tests
|
||||||
|
Ok(())
|
||||||
|
},
|
||||||
|
Ok(Unconfigured) => {
|
||||||
|
panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer")
|
||||||
|
},
|
||||||
|
Err(missing) => Err(missing)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub use crate::__check_fields_present as check_fields_present;
|
||||||
|
|
||||||
|
/// Explanation for why the check was deemed ok.
|
||||||
|
///
|
||||||
|
/// Mainly useful for testing, or configuring per-crate behaviour as in with
|
||||||
|
/// [`check_fields_present`].
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub enum Summary {
|
||||||
|
/// All extractors were found.
|
||||||
|
///
|
||||||
|
/// Should only happen when tracing is properly configured.
|
||||||
|
FoundEverything,
|
||||||
|
|
||||||
|
/// Tracing has not been configured at all. This is ok for tests running without tracing set
|
||||||
|
/// up.
|
||||||
|
Unconfigured,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn tracing_subscriber_configured() -> bool {
|
||||||
|
let mut noop_configured = false;
|
||||||
|
tracing::dispatcher::get_default(|d| {
|
||||||
|
// it is possible that this closure will not be invoked, but the current implementation
|
||||||
|
// always invokes it
|
||||||
|
noop_configured = d
|
||||||
|
.downcast_ref::<tracing::subscriber::NoSubscriber>()
|
||||||
|
.is_some();
|
||||||
|
});
|
||||||
|
|
||||||
|
!noop_configured
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
@@ -123,6 +179,36 @@ mod tests {
|
|||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
|
use std::{
|
||||||
|
collections::HashSet,
|
||||||
|
fmt::{self},
|
||||||
|
hash::{Hash, Hasher},
|
||||||
|
};
|
||||||
|
|
||||||
|
struct MemoryIdentity<'a>(&'a dyn Extractor);
|
||||||
|
|
||||||
|
impl<'a> MemoryIdentity<'a> {
|
||||||
|
fn as_ptr(&self) -> *const () {
|
||||||
|
self.0 as *const _ as *const ()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl<'a> PartialEq for MemoryIdentity<'a> {
|
||||||
|
fn eq(&self, other: &Self) -> bool {
|
||||||
|
self.as_ptr() == other.as_ptr()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl<'a> Eq for MemoryIdentity<'a> {}
|
||||||
|
impl<'a> Hash for MemoryIdentity<'a> {
|
||||||
|
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||||
|
self.as_ptr().hash(state);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl<'a> fmt::Debug for MemoryIdentity<'a> {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct Setup {
|
struct Setup {
|
||||||
_current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
|
_current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
|
||||||
tenant_extractor: MultiNameExtractor<2>,
|
tenant_extractor: MultiNameExtractor<2>,
|
||||||
@@ -159,7 +245,8 @@ mod tests {
|
|||||||
let setup = setup_current_thread();
|
let setup = setup_current_thread();
|
||||||
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
|
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
|
||||||
let _guard = span.enter();
|
let _guard = span.enter();
|
||||||
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
|
let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
|
||||||
|
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -167,8 +254,8 @@ mod tests {
|
|||||||
let setup = setup_current_thread();
|
let setup = setup_current_thread();
|
||||||
let span = tracing::info_span!("root", timeline_id = "timeline-1");
|
let span = tracing::info_span!("root", timeline_id = "timeline-1");
|
||||||
let _guard = span.enter();
|
let _guard = span.enter();
|
||||||
let missing =
|
let missing = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor])
|
||||||
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err();
|
.unwrap_err();
|
||||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -185,7 +272,8 @@ mod tests {
|
|||||||
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
|
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
|
||||||
let _guard = span.enter();
|
let _guard = span.enter();
|
||||||
|
|
||||||
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
|
let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
|
||||||
|
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -198,7 +286,7 @@ mod tests {
|
|||||||
let span = tracing::info_span!("child", timeline_id = "timeline-1");
|
let span = tracing::info_span!("child", timeline_id = "timeline-1");
|
||||||
let _guard = span.enter();
|
let _guard = span.enter();
|
||||||
|
|
||||||
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
|
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
|
||||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -207,7 +295,8 @@ mod tests {
|
|||||||
let setup = setup_current_thread();
|
let setup = setup_current_thread();
|
||||||
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
|
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
|
||||||
let _guard = span.enter();
|
let _guard = span.enter();
|
||||||
check_fields_present([&setup.tenant_extractor]).unwrap();
|
let res = check_fields_present0([&setup.tenant_extractor]);
|
||||||
|
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -223,7 +312,8 @@ mod tests {
|
|||||||
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
|
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
|
||||||
let _guard = span.enter();
|
let _guard = span.enter();
|
||||||
|
|
||||||
check_fields_present([&setup.tenant_extractor]).unwrap();
|
let res = check_fields_present0([&setup.tenant_extractor]);
|
||||||
|
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -231,7 +321,7 @@ mod tests {
|
|||||||
let setup = setup_current_thread();
|
let setup = setup_current_thread();
|
||||||
let span = tracing::info_span!("root", timeline_id = "timeline-1");
|
let span = tracing::info_span!("root", timeline_id = "timeline-1");
|
||||||
let _guard = span.enter();
|
let _guard = span.enter();
|
||||||
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
|
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
|
||||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -245,43 +335,107 @@ mod tests {
|
|||||||
let span = tracing::info_span!("child", timeline_id = "timeline-1");
|
let span = tracing::info_span!("child", timeline_id = "timeline-1");
|
||||||
let _guard = span.enter();
|
let _guard = span.enter();
|
||||||
|
|
||||||
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
|
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
|
||||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn tracing_error_subscriber_not_set_up() {
|
fn tracing_error_subscriber_not_set_up_straight_line() {
|
||||||
// no setup
|
// no setup
|
||||||
|
|
||||||
let span = tracing::info_span!("foo", e = "some value");
|
let span = tracing::info_span!("foo", e = "some value");
|
||||||
let _guard = span.enter();
|
let _guard = span.enter();
|
||||||
|
|
||||||
let extractor = MultiNameExtractor::new("E", ["e"]);
|
let extractor = MultiNameExtractor::new("E", ["e"]);
|
||||||
let missing = check_fields_present([&extractor]).unwrap_err();
|
let res = check_fields_present0([&extractor]);
|
||||||
assert_missing(missing, vec![&extractor]);
|
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||||
|
|
||||||
|
// similarly for a not found key
|
||||||
|
let extractor = MultiNameExtractor::new("F", ["foobar"]);
|
||||||
|
let res = check_fields_present0([&extractor]);
|
||||||
|
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
#[should_panic]
|
fn tracing_error_subscriber_not_set_up_with_instrument() {
|
||||||
fn panics_if_tracing_error_subscriber_has_wrong_filter() {
|
// no setup
|
||||||
|
|
||||||
|
// demo a case where span entering is used to establish a parent child connection, but
|
||||||
|
// when we re-enter the subspan SpanTrace::with_spans iterates over nothing.
|
||||||
|
let span = tracing::info_span!("foo", e = "some value");
|
||||||
|
let _guard = span.enter();
|
||||||
|
|
||||||
|
let subspan = tracing::info_span!("bar", f = "foobar");
|
||||||
|
drop(_guard);
|
||||||
|
|
||||||
|
// normally this would work, but without any tracing-subscriber configured, both
|
||||||
|
// check_field_present find nothing
|
||||||
|
let _guard = subspan.enter();
|
||||||
|
let extractors: [&dyn Extractor; 2] = [
|
||||||
|
&MultiNameExtractor::new("E", ["e"]),
|
||||||
|
&MultiNameExtractor::new("F", ["f"]),
|
||||||
|
];
|
||||||
|
|
||||||
|
let res = check_fields_present0(extractors);
|
||||||
|
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||||
|
|
||||||
|
// similarly for a not found key
|
||||||
|
let extractor = MultiNameExtractor::new("G", ["g"]);
|
||||||
|
let res = check_fields_present0([&extractor]);
|
||||||
|
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn tracing_subscriber_configured() {
|
||||||
|
// this will fail if any utils::logging::init callers appear, but let's hope they do not
|
||||||
|
// appear.
|
||||||
|
assert!(!super::tracing_subscriber_configured());
|
||||||
|
|
||||||
|
let _g = setup_current_thread();
|
||||||
|
|
||||||
|
assert!(super::tracing_subscriber_configured());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn not_found_when_disabled_by_filter() {
|
||||||
let r = tracing_subscriber::registry().with({
|
let r = tracing_subscriber::registry().with({
|
||||||
tracing_error::ErrorLayer::default().with_filter(
|
tracing_error::ErrorLayer::default().with_filter(tracing_subscriber::filter::filter_fn(
|
||||||
tracing_subscriber::filter::dynamic_filter_fn(|md, _| {
|
|md| !(md.is_span() && *md.level() == tracing::Level::INFO),
|
||||||
if md.is_span() && *md.level() == tracing::Level::INFO {
|
))
|
||||||
return false;
|
|
||||||
}
|
|
||||||
true
|
|
||||||
}),
|
|
||||||
)
|
|
||||||
});
|
});
|
||||||
|
|
||||||
let _guard = tracing::subscriber::set_default(r);
|
let _guard = tracing::subscriber::set_default(r);
|
||||||
|
|
||||||
|
// this test is a rather tricky one, it has a number of possible outcomes depending on the
|
||||||
|
// execution order when executed with other tests even if no test sets the global default
|
||||||
|
// subscriber.
|
||||||
|
|
||||||
let span = tracing::info_span!("foo", e = "some value");
|
let span = tracing::info_span!("foo", e = "some value");
|
||||||
let _guard = span.enter();
|
let _guard = span.enter();
|
||||||
|
|
||||||
let extractor = MultiNameExtractor::new("E", ["e"]);
|
let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])];
|
||||||
let missing = check_fields_present([&extractor]).unwrap_err();
|
|
||||||
assert_missing(missing, vec![&extractor]);
|
if span.is_disabled() {
|
||||||
|
// the tests are running single threaded, or we got lucky and no other tests subscriber
|
||||||
|
// was got to register their per-CALLSITE::META interest between `set_default` and
|
||||||
|
// creation of the span, thus the filter got to apply and registered interest of Never,
|
||||||
|
// so the span was never created.
|
||||||
|
//
|
||||||
|
// as the span is disabled, no keys were recorded to it, leading check_fields_present0
|
||||||
|
// to find an error.
|
||||||
|
|
||||||
|
let missing = check_fields_present0(extractors).unwrap_err();
|
||||||
|
assert_missing(missing, vec![extractors[0]]);
|
||||||
|
} else {
|
||||||
|
// when the span is enabled, it is because some other test is running at the same time,
|
||||||
|
// and that tests registry has filters which are interested in our above span.
|
||||||
|
//
|
||||||
|
// because the span is now enabled, all keys will be found for it. the
|
||||||
|
// tracing_error::SpanTrace does not consider layer filters during the span hierarchy
|
||||||
|
// walk (SpanTrace::with_spans), nor is the SpanTrace::status a reliable indicator in
|
||||||
|
// this test-induced issue.
|
||||||
|
|
||||||
|
let res = check_fields_present0(extractors);
|
||||||
|
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ testing = ["fail/failpoints"]
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
|
async-compression.workspace = true
|
||||||
async-stream.workspace = true
|
async-stream.workspace = true
|
||||||
async-trait.workspace = true
|
async-trait.workspace = true
|
||||||
byteorder.workspace = true
|
byteorder.workspace = true
|
||||||
@@ -24,6 +25,7 @@ consumption_metrics.workspace = true
|
|||||||
crc32c.workspace = true
|
crc32c.workspace = true
|
||||||
crossbeam-utils.workspace = true
|
crossbeam-utils.workspace = true
|
||||||
either.workspace = true
|
either.workspace = true
|
||||||
|
flate2.workspace = true
|
||||||
fail.workspace = true
|
fail.workspace = true
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
git-version.workspace = true
|
git-version.workspace = true
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
use pageserver::keyspace::{KeyPartitioning, KeySpace};
|
use pageserver::keyspace::{KeyPartitioning, KeySpace};
|
||||||
use pageserver::repository::Key;
|
use pageserver::repository::Key;
|
||||||
use pageserver::tenant::layer_map::LayerMap;
|
use pageserver::tenant::layer_map::LayerMap;
|
||||||
use pageserver::tenant::storage_layer::{tests::LayerDescriptor, Layer, LayerFileName};
|
use pageserver::tenant::storage_layer::LayerFileName;
|
||||||
use pageserver::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};
|
use pageserver::tenant::storage_layer::PersistentLayerDesc;
|
||||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||||
use std::cmp::{max, min};
|
use std::cmp::{max, min};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
@@ -28,13 +28,13 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
|
|||||||
for fname in filenames {
|
for fname in filenames {
|
||||||
let fname = fname.unwrap();
|
let fname = fname.unwrap();
|
||||||
let fname = LayerFileName::from_str(&fname).unwrap();
|
let fname = LayerFileName::from_str(&fname).unwrap();
|
||||||
let layer = LayerDescriptor::from(fname);
|
let layer = PersistentLayerDesc::from(fname);
|
||||||
|
|
||||||
let lsn_range = layer.get_lsn_range();
|
let lsn_range = layer.get_lsn_range();
|
||||||
min_lsn = min(min_lsn, lsn_range.start);
|
min_lsn = min(min_lsn, lsn_range.start);
|
||||||
max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
|
max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
|
||||||
|
|
||||||
updates.insert_historic(layer.layer_desc().clone());
|
updates.insert_historic(layer);
|
||||||
}
|
}
|
||||||
|
|
||||||
println!("min: {min_lsn}, max: {max_lsn}");
|
println!("min: {min_lsn}, max: {max_lsn}");
|
||||||
@@ -210,15 +210,15 @@ fn bench_sequential(c: &mut Criterion) {
|
|||||||
for i in 0..100_000 {
|
for i in 0..100_000 {
|
||||||
let i32 = (i as u32) % 100;
|
let i32 = (i as u32) % 100;
|
||||||
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
|
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
|
||||||
let layer = LayerDescriptor::from(PersistentLayerDesc::new_img(
|
let layer = PersistentLayerDesc::new_img(
|
||||||
TenantId::generate(),
|
TenantId::generate(),
|
||||||
TimelineId::generate(),
|
TimelineId::generate(),
|
||||||
zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
||||||
Lsn(i),
|
Lsn(i),
|
||||||
false,
|
false,
|
||||||
0,
|
0,
|
||||||
));
|
);
|
||||||
updates.insert_historic(layer.layer_desc().clone());
|
updates.insert_historic(layer);
|
||||||
}
|
}
|
||||||
updates.flush();
|
updates.flush();
|
||||||
println!("Finished layer map init in {:?}", now.elapsed());
|
println!("Finished layer map init in {:?}", now.elapsed());
|
||||||
|
|||||||
@@ -7,10 +7,10 @@
|
|||||||
//! - The y axis represents LSN, growing upwards.
|
//! - The y axis represents LSN, growing upwards.
|
||||||
//!
|
//!
|
||||||
//! Coordinates in both axis are compressed for better readability.
|
//! Coordinates in both axis are compressed for better readability.
|
||||||
//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
|
//! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
|
||||||
//!
|
//!
|
||||||
//! Example use:
|
//! Example use:
|
||||||
//! ```
|
//! ```bash
|
||||||
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
|
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
|
||||||
//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
|
//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
|
||||||
//! $ firefox out.svg
|
//! $ firefox out.svg
|
||||||
@@ -20,7 +20,7 @@
|
|||||||
//! or from pageserver log files.
|
//! or from pageserver log files.
|
||||||
//!
|
//!
|
||||||
//! TODO Consider shipping this as a grafana panel plugin:
|
//! TODO Consider shipping this as a grafana panel plugin:
|
||||||
//! https://grafana.com/tutorials/build-a-panel-plugin/
|
//! <https://grafana.com/tutorials/build-a-panel-plugin/>
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use pageserver::repository::Key;
|
use pageserver::repository::Key;
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
@@ -117,7 +117,8 @@ pub fn main() -> Result<()> {
|
|||||||
|
|
||||||
let mut lsn_diff = (lsn_end - lsn_start) as f32;
|
let mut lsn_diff = (lsn_end - lsn_start) as f32;
|
||||||
let mut fill = Fill::None;
|
let mut fill = Fill::None;
|
||||||
let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
|
let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
|
||||||
|
let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
|
||||||
let mut lsn_offset = 0.0;
|
let mut lsn_offset = 0.0;
|
||||||
|
|
||||||
// Fill in and thicken rectangle if it's an
|
// Fill in and thicken rectangle if it's an
|
||||||
@@ -128,7 +129,7 @@ pub fn main() -> Result<()> {
|
|||||||
num_images += 1;
|
num_images += 1;
|
||||||
lsn_diff = 0.3;
|
lsn_diff = 0.3;
|
||||||
lsn_offset = -lsn_diff / 2.0;
|
lsn_offset = -lsn_diff / 2.0;
|
||||||
margin = 0.05;
|
ymargin = 0.05;
|
||||||
fill = Fill::Color(rgb(0, 0, 0));
|
fill = Fill::Color(rgb(0, 0, 0));
|
||||||
}
|
}
|
||||||
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
|
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
|
||||||
@@ -137,10 +138,10 @@ pub fn main() -> Result<()> {
|
|||||||
println!(
|
println!(
|
||||||
" {}",
|
" {}",
|
||||||
rectangle(
|
rectangle(
|
||||||
key_start as f32 + stretch * margin,
|
key_start as f32 + stretch * xmargin,
|
||||||
stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)),
|
stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
|
||||||
key_diff as f32 - stretch * 2.0 * margin,
|
key_diff as f32 - stretch * 2.0 * xmargin,
|
||||||
stretch * (lsn_diff - 2.0 * margin)
|
stretch * (lsn_diff - 2.0 * ymargin)
|
||||||
)
|
)
|
||||||
.fill(fill)
|
.fill(fill)
|
||||||
.stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
|
.stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
|
||||||
|
|||||||
@@ -171,11 +171,13 @@ pub struct PageServerConf {
|
|||||||
|
|
||||||
pub log_format: LogFormat,
|
pub log_format: LogFormat,
|
||||||
|
|
||||||
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
|
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
|
||||||
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
|
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
|
||||||
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
|
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
|
||||||
/// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
|
/// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
|
||||||
/// See the comment in `eviction_task` for details.
|
/// See the comment in `eviction_task` for details.
|
||||||
|
///
|
||||||
|
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
|
||||||
pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,
|
pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,
|
||||||
|
|
||||||
// How often to collect metrics and send them to the metrics endpoint.
|
// How often to collect metrics and send them to the metrics endpoint.
|
||||||
@@ -570,21 +572,21 @@ impl PageServerConf {
|
|||||||
.join(TENANT_ATTACHING_MARKER_FILENAME)
|
.join(TENANT_ATTACHING_MARKER_FILENAME)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn tenant_ignore_mark_file_path(&self, tenant_id: TenantId) -> PathBuf {
|
pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||||
self.tenant_path(&tenant_id).join(IGNORED_TENANT_FILE_NAME)
|
self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Points to a place in pageserver's local directory,
|
/// Points to a place in pageserver's local directory,
|
||||||
/// where certain tenant's tenantconf file should be located.
|
/// where certain tenant's tenantconf file should be located.
|
||||||
pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf {
|
pub fn tenant_config_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||||
self.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME)
|
self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf {
|
pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||||
self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
|
self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn timeline_path(&self, timeline_id: &TimelineId, tenant_id: &TenantId) -> PathBuf {
|
pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf {
|
||||||
self.timelines_path(tenant_id).join(timeline_id.to_string())
|
self.timelines_path(tenant_id).join(timeline_id.to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -594,7 +596,7 @@ impl PageServerConf {
|
|||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
) -> PathBuf {
|
) -> PathBuf {
|
||||||
path_with_suffix_extension(
|
path_with_suffix_extension(
|
||||||
self.timeline_path(&timeline_id, &tenant_id),
|
self.timeline_path(&tenant_id, &timeline_id),
|
||||||
TIMELINE_UNINIT_MARK_SUFFIX,
|
TIMELINE_UNINIT_MARK_SUFFIX,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -617,8 +619,8 @@ impl PageServerConf {
|
|||||||
|
|
||||||
/// Points to a place in pageserver's local directory,
|
/// Points to a place in pageserver's local directory,
|
||||||
/// where certain timeline's metadata file should be located.
|
/// where certain timeline's metadata file should be located.
|
||||||
pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
|
pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf {
|
||||||
self.timeline_path(&timeline_id, &tenant_id)
|
self.timeline_path(tenant_id, timeline_id)
|
||||||
.join(METADATA_FILE_NAME)
|
.join(METADATA_FILE_NAME)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -653,6 +655,7 @@ impl PageServerConf {
|
|||||||
match pg_version {
|
match pg_version {
|
||||||
14 => Ok(path.join(format!("v{pg_version}"))),
|
14 => Ok(path.join(format!("v{pg_version}"))),
|
||||||
15 => Ok(path.join(format!("v{pg_version}"))),
|
15 => Ok(path.join(format!("v{pg_version}"))),
|
||||||
|
16 => Ok(path.join(format!("v{pg_version}"))),
|
||||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -661,6 +664,7 @@ impl PageServerConf {
|
|||||||
match pg_version {
|
match pg_version {
|
||||||
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
||||||
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
||||||
|
16 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
||||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -668,6 +672,7 @@ impl PageServerConf {
|
|||||||
match pg_version {
|
match pg_version {
|
||||||
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
||||||
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
||||||
|
16 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
||||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -993,6 +998,8 @@ impl ConfigurableSemaphore {
|
|||||||
/// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
|
/// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
|
||||||
/// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
|
/// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
|
||||||
/// behave like [`futures::future::pending`], just waiting until new permits are added.
|
/// behave like [`futures::future::pending`], just waiting until new permits are added.
|
||||||
|
///
|
||||||
|
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
|
||||||
pub fn new(initial_permits: NonZeroUsize) -> Self {
|
pub fn new(initial_permits: NonZeroUsize) -> Self {
|
||||||
ConfigurableSemaphore {
|
ConfigurableSemaphore {
|
||||||
initial_permits,
|
initial_permits,
|
||||||
|
|||||||
@@ -179,6 +179,9 @@ impl RequestContext {
|
|||||||
/// a context and you are unwilling to change all callers to provide one.
|
/// a context and you are unwilling to change all callers to provide one.
|
||||||
///
|
///
|
||||||
/// Before we add cancellation, we should get rid of this method.
|
/// Before we add cancellation, we should get rid of this method.
|
||||||
|
///
|
||||||
|
/// [`attached_child`]: Self::attached_child
|
||||||
|
/// [`detached_child`]: Self::detached_child
|
||||||
pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
||||||
Self::new(task_kind, download_behavior)
|
Self::new(task_kind, download_behavior)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -305,7 +305,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
|||||||
let now = SystemTime::now();
|
let now = SystemTime::now();
|
||||||
for (i, (partition, candidate)) in candidates.iter().enumerate() {
|
for (i, (partition, candidate)) in candidates.iter().enumerate() {
|
||||||
debug!(
|
debug!(
|
||||||
"cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}",
|
"cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
|
||||||
i + 1,
|
i + 1,
|
||||||
candidates.len(),
|
candidates.len(),
|
||||||
candidate.layer.file_size(),
|
candidate.layer.file_size(),
|
||||||
|
|||||||
@@ -346,7 +346,7 @@ async fn timeline_create_handler(
|
|||||||
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
.instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
.instrument(info_span!("timeline_create", %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -381,7 +381,7 @@ async fn timeline_list_handler(
|
|||||||
}
|
}
|
||||||
Ok::<Vec<TimelineInfo>, ApiError>(response_data)
|
Ok::<Vec<TimelineInfo>, ApiError>(response_data)
|
||||||
}
|
}
|
||||||
.instrument(info_span!("timeline_list", tenant = %tenant_id))
|
.instrument(info_span!("timeline_list", %tenant_id))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, response_data)
|
json_response(StatusCode::OK, response_data)
|
||||||
@@ -418,7 +418,7 @@ async fn timeline_detail_handler(
|
|||||||
|
|
||||||
Ok::<_, ApiError>(timeline_info)
|
Ok::<_, ApiError>(timeline_info)
|
||||||
}
|
}
|
||||||
.instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id))
|
.instrument(info_span!("timeline_detail", %tenant_id, %timeline_id))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, timeline_info)
|
json_response(StatusCode::OK, timeline_info)
|
||||||
@@ -479,7 +479,7 @@ async fn tenant_attach_handler(
|
|||||||
remote_storage.clone(),
|
remote_storage.clone(),
|
||||||
&ctx,
|
&ctx,
|
||||||
)
|
)
|
||||||
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
|
.instrument(info_span!("tenant_attach", %tenant_id))
|
||||||
.await?;
|
.await?;
|
||||||
} else {
|
} else {
|
||||||
return Err(ApiError::BadRequest(anyhow!(
|
return Err(ApiError::BadRequest(anyhow!(
|
||||||
@@ -501,7 +501,7 @@ async fn timeline_delete_handler(
|
|||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||||
|
|
||||||
mgr::delete_timeline(tenant_id, timeline_id, &ctx)
|
mgr::delete_timeline(tenant_id, timeline_id, &ctx)
|
||||||
.instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
|
.instrument(info_span!("timeline_delete", %tenant_id, %timeline_id))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
// FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
|
// FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
|
||||||
@@ -519,7 +519,7 @@ async fn tenant_detach_handler(
|
|||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
let conf = state.conf;
|
let conf = state.conf;
|
||||||
mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
|
mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
|
||||||
.instrument(info_span!("tenant_detach", tenant = %tenant_id))
|
.instrument(info_span!("tenant_detach", %tenant_id))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
@@ -542,7 +542,7 @@ async fn tenant_load_handler(
|
|||||||
state.remote_storage.clone(),
|
state.remote_storage.clone(),
|
||||||
&ctx,
|
&ctx,
|
||||||
)
|
)
|
||||||
.instrument(info_span!("load", tenant = %tenant_id))
|
.instrument(info_span!("load", %tenant_id))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::ACCEPTED, ())
|
json_response(StatusCode::ACCEPTED, ())
|
||||||
@@ -558,7 +558,7 @@ async fn tenant_ignore_handler(
|
|||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
let conf = state.conf;
|
let conf = state.conf;
|
||||||
mgr::ignore_tenant(conf, tenant_id)
|
mgr::ignore_tenant(conf, tenant_id)
|
||||||
.instrument(info_span!("ignore_tenant", tenant = %tenant_id))
|
.instrument(info_span!("ignore_tenant", %tenant_id))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
@@ -611,7 +611,7 @@ async fn tenant_status(
|
|||||||
attachment_status: state.attachment_status(),
|
attachment_status: state.attachment_status(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
.instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
|
.instrument(info_span!("tenant_status_handler", %tenant_id))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, tenant_info)
|
json_response(StatusCode::OK, tenant_info)
|
||||||
@@ -850,7 +850,7 @@ async fn tenant_create_handler(
|
|||||||
state.remote_storage.clone(),
|
state.remote_storage.clone(),
|
||||||
&ctx,
|
&ctx,
|
||||||
)
|
)
|
||||||
.instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
|
.instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
// We created the tenant. Existing API semantics are that the tenant
|
// We created the tenant. Existing API semantics are that the tenant
|
||||||
@@ -912,7 +912,7 @@ async fn update_tenant_config_handler(
|
|||||||
|
|
||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
|
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||||
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
|
.instrument(info_span!("tenant_config", %tenant_id))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
@@ -1143,7 +1143,7 @@ async fn disk_usage_eviction_run(
|
|||||||
let Some(storage) = state.remote_storage.clone() else {
|
let Some(storage) = state.remote_storage.clone() else {
|
||||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||||
"remote storage not configured, cannot run eviction iteration"
|
"remote storage not configured, cannot run eviction iteration"
|
||||||
)))
|
)));
|
||||||
};
|
};
|
||||||
|
|
||||||
let state = state.disk_usage_eviction_state.clone();
|
let state = state.disk_usage_eviction_state.clone();
|
||||||
|
|||||||
@@ -385,7 +385,7 @@ pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
|||||||
.expect("failed to define a metric")
|
.expect("failed to define a metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct EvictionsWithLowResidenceDuration {
|
pub struct EvictionsWithLowResidenceDuration {
|
||||||
data_source: &'static str,
|
data_source: &'static str,
|
||||||
@@ -541,6 +541,17 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
|||||||
.expect("failed to define a metric")
|
.expect("failed to define a metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// keep in sync with control plane Go code so that we can validate
|
||||||
|
// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
|
||||||
|
static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
|
||||||
|
// Go code uses milliseconds. Variable is called `computeStartupBuckets`
|
||||||
|
[
|
||||||
|
5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000,
|
||||||
|
1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000,
|
||||||
|
]
|
||||||
|
.map(|ms| (ms as f64) / 1000.0)
|
||||||
|
});
|
||||||
|
|
||||||
pub struct BasebackupQueryTime(HistogramVec);
|
pub struct BasebackupQueryTime(HistogramVec);
|
||||||
pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||||
BasebackupQueryTime({
|
BasebackupQueryTime({
|
||||||
@@ -548,7 +559,7 @@ pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
|||||||
"pageserver_basebackup_query_seconds",
|
"pageserver_basebackup_query_seconds",
|
||||||
"Histogram of basebackup queries durations, by result type",
|
"Histogram of basebackup queries durations, by result type",
|
||||||
&["result"],
|
&["result"],
|
||||||
CRITICAL_OP_BUCKETS.into(),
|
COMPUTE_STARTUP_BUCKETS.to_vec(),
|
||||||
)
|
)
|
||||||
.expect("failed to define a metric")
|
.expect("failed to define a metric")
|
||||||
})
|
})
|
||||||
@@ -818,7 +829,7 @@ pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
});
|
});
|
||||||
|
|
||||||
/// Similar to [`prometheus::HistogramTimer`] but does not record on drop.
|
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
|
||||||
pub struct StorageTimeMetricsTimer {
|
pub struct StorageTimeMetricsTimer {
|
||||||
metrics: StorageTimeMetrics,
|
metrics: StorageTimeMetrics,
|
||||||
start: Instant,
|
start: Instant,
|
||||||
@@ -876,7 +887,7 @@ impl StorageTimeMetrics {
|
|||||||
|
|
||||||
/// Starts timing a new operation.
|
/// Starts timing a new operation.
|
||||||
///
|
///
|
||||||
/// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop.
|
/// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop.
|
||||||
pub fn start_timer(&self) -> StorageTimeMetricsTimer {
|
pub fn start_timer(&self) -> StorageTimeMetricsTimer {
|
||||||
StorageTimeMetricsTimer::new(self.clone())
|
StorageTimeMetricsTimer::new(self.clone())
|
||||||
}
|
}
|
||||||
@@ -1256,7 +1267,7 @@ impl RemoteTimelineClientMetrics {
|
|||||||
/// Update the metrics that change when a call to the remote timeline client instance starts.
|
/// Update the metrics that change when a call to the remote timeline client instance starts.
|
||||||
///
|
///
|
||||||
/// Drop the returned guard object once the operation is finished to update the corresponding metrics that track completions.
|
/// Drop the returned guard object once the operation is finished to update the corresponding metrics that track completions.
|
||||||
/// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
|
/// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
|
||||||
/// is more suitable.
|
/// is more suitable.
|
||||||
/// Never do both.
|
/// Never do both.
|
||||||
pub(crate) fn call_begin(
|
pub(crate) fn call_begin(
|
||||||
@@ -1289,7 +1300,7 @@ impl RemoteTimelineClientMetrics {
|
|||||||
|
|
||||||
/// Manually update the metrics that track completions, instead of using the guard object.
|
/// Manually update the metrics that track completions, instead of using the guard object.
|
||||||
/// Using the guard object is generally preferable.
|
/// Using the guard object is generally preferable.
|
||||||
/// See [`call_begin`] for more context.
|
/// See [`call_begin`](Self::call_begin) for more context.
|
||||||
pub(crate) fn call_end(
|
pub(crate) fn call_end(
|
||||||
&self,
|
&self,
|
||||||
file_kind: &RemoteOpFileKind,
|
file_kind: &RemoteOpFileKind,
|
||||||
|
|||||||
@@ -10,6 +10,7 @@
|
|||||||
//
|
//
|
||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
|
use async_compression::tokio::write::GzipEncoder;
|
||||||
use bytes::Buf;
|
use bytes::Buf;
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use futures::Stream;
|
use futures::Stream;
|
||||||
@@ -31,8 +32,10 @@ use std::str;
|
|||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
use tokio::io::AsyncWriteExt;
|
||||||
use tokio::io::{AsyncRead, AsyncWrite};
|
use tokio::io::{AsyncRead, AsyncWrite};
|
||||||
use tokio_util::io::StreamReader;
|
use tokio_util::io::StreamReader;
|
||||||
|
use tracing::field;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::id::ConnectionId;
|
use utils::id::ConnectionId;
|
||||||
use utils::{
|
use utils::{
|
||||||
@@ -51,6 +54,7 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
|
|||||||
use crate::task_mgr;
|
use crate::task_mgr;
|
||||||
use crate::task_mgr::TaskKind;
|
use crate::task_mgr::TaskKind;
|
||||||
use crate::tenant;
|
use crate::tenant;
|
||||||
|
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||||
use crate::tenant::mgr;
|
use crate::tenant::mgr;
|
||||||
use crate::tenant::mgr::GetTenantError;
|
use crate::tenant::mgr::GetTenantError;
|
||||||
use crate::tenant::{Tenant, Timeline};
|
use crate::tenant::{Tenant, Timeline};
|
||||||
@@ -238,6 +242,7 @@ pub async fn libpq_listener_main(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[instrument(skip_all, fields(peer_addr))]
|
||||||
async fn page_service_conn_main(
|
async fn page_service_conn_main(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
broker_client: storage_broker::BrokerClientChannel,
|
broker_client: storage_broker::BrokerClientChannel,
|
||||||
@@ -260,6 +265,7 @@ async fn page_service_conn_main(
|
|||||||
.context("could not set TCP_NODELAY")?;
|
.context("could not set TCP_NODELAY")?;
|
||||||
|
|
||||||
let peer_addr = socket.peer_addr().context("get peer address")?;
|
let peer_addr = socket.peer_addr().context("get peer address")?;
|
||||||
|
tracing::Span::current().record("peer_addr", field::display(peer_addr));
|
||||||
|
|
||||||
// setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements:
|
// setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements:
|
||||||
// - long enough for most valid compute connections
|
// - long enough for most valid compute connections
|
||||||
@@ -362,7 +368,7 @@ impl PageServerHandler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip(self, pgb, ctx))]
|
#[instrument(skip_all)]
|
||||||
async fn handle_pagerequests<IO>(
|
async fn handle_pagerequests<IO>(
|
||||||
&self,
|
&self,
|
||||||
pgb: &mut PostgresBackend<IO>,
|
pgb: &mut PostgresBackend<IO>,
|
||||||
@@ -373,6 +379,8 @@ impl PageServerHandler {
|
|||||||
where
|
where
|
||||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||||
{
|
{
|
||||||
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
// NOTE: pagerequests handler exits when connection is closed,
|
// NOTE: pagerequests handler exits when connection is closed,
|
||||||
// so there is no need to reset the association
|
// so there is no need to reset the association
|
||||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||||
@@ -473,7 +481,7 @@ impl PageServerHandler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
#[instrument(skip(self, pgb, ctx))]
|
#[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))]
|
||||||
async fn handle_import_basebackup<IO>(
|
async fn handle_import_basebackup<IO>(
|
||||||
&self,
|
&self,
|
||||||
pgb: &mut PostgresBackend<IO>,
|
pgb: &mut PostgresBackend<IO>,
|
||||||
@@ -487,6 +495,8 @@ impl PageServerHandler {
|
|||||||
where
|
where
|
||||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||||
{
|
{
|
||||||
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||||
// Create empty timeline
|
// Create empty timeline
|
||||||
info!("creating new timeline");
|
info!("creating new timeline");
|
||||||
@@ -531,7 +541,7 @@ impl PageServerHandler {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip(self, pgb, ctx))]
|
#[instrument(skip_all, fields(%start_lsn, %end_lsn))]
|
||||||
async fn handle_import_wal<IO>(
|
async fn handle_import_wal<IO>(
|
||||||
&self,
|
&self,
|
||||||
pgb: &mut PostgresBackend<IO>,
|
pgb: &mut PostgresBackend<IO>,
|
||||||
@@ -544,6 +554,7 @@ impl PageServerHandler {
|
|||||||
where
|
where
|
||||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||||
{
|
{
|
||||||
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||||
|
|
||||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
||||||
@@ -738,7 +749,7 @@ impl PageServerHandler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
#[instrument(skip(self, pgb, ctx))]
|
#[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
|
||||||
async fn handle_basebackup_request<IO>(
|
async fn handle_basebackup_request<IO>(
|
||||||
&mut self,
|
&mut self,
|
||||||
pgb: &mut PostgresBackend<IO>,
|
pgb: &mut PostgresBackend<IO>,
|
||||||
@@ -747,11 +758,14 @@ impl PageServerHandler {
|
|||||||
lsn: Option<Lsn>,
|
lsn: Option<Lsn>,
|
||||||
prev_lsn: Option<Lsn>,
|
prev_lsn: Option<Lsn>,
|
||||||
full_backup: bool,
|
full_backup: bool,
|
||||||
|
gzip: bool,
|
||||||
ctx: RequestContext,
|
ctx: RequestContext,
|
||||||
) -> anyhow::Result<()>
|
) -> anyhow::Result<()>
|
||||||
where
|
where
|
||||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||||
{
|
{
|
||||||
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
let started = std::time::Instant::now();
|
let started = std::time::Instant::now();
|
||||||
|
|
||||||
// check that the timeline exists
|
// check that the timeline exists
|
||||||
@@ -772,8 +786,9 @@ impl PageServerHandler {
|
|||||||
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
|
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
|
||||||
pgb.flush().await?;
|
pgb.flush().await?;
|
||||||
|
|
||||||
// Send a tarball of the latest layer on the timeline
|
// Send a tarball of the latest layer on the timeline. Compress if not
|
||||||
{
|
// fullbackup. TODO Compress in that case too (tests need to be updated)
|
||||||
|
if full_backup {
|
||||||
let mut writer = pgb.copyout_writer();
|
let mut writer = pgb.copyout_writer();
|
||||||
basebackup::send_basebackup_tarball(
|
basebackup::send_basebackup_tarball(
|
||||||
&mut writer,
|
&mut writer,
|
||||||
@@ -784,6 +799,40 @@ impl PageServerHandler {
|
|||||||
&ctx,
|
&ctx,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
} else {
|
||||||
|
let mut writer = pgb.copyout_writer();
|
||||||
|
if gzip {
|
||||||
|
let mut encoder = GzipEncoder::with_quality(
|
||||||
|
writer,
|
||||||
|
// NOTE using fast compression because it's on the critical path
|
||||||
|
// for compute startup. For an empty database, we get
|
||||||
|
// <100KB with this method. The Level::Best compression method
|
||||||
|
// gives us <20KB, but maybe we should add basebackup caching
|
||||||
|
// on compute shutdown first.
|
||||||
|
async_compression::Level::Fastest,
|
||||||
|
);
|
||||||
|
basebackup::send_basebackup_tarball(
|
||||||
|
&mut encoder,
|
||||||
|
&timeline,
|
||||||
|
lsn,
|
||||||
|
prev_lsn,
|
||||||
|
full_backup,
|
||||||
|
&ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
// shutdown the encoder to ensure the gzip footer is written
|
||||||
|
encoder.shutdown().await?;
|
||||||
|
} else {
|
||||||
|
basebackup::send_basebackup_tarball(
|
||||||
|
&mut writer,
|
||||||
|
&timeline,
|
||||||
|
lsn,
|
||||||
|
prev_lsn,
|
||||||
|
full_backup,
|
||||||
|
&ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pgb.write_message_noflush(&BeMessage::CopyDone)?;
|
pgb.write_message_noflush(&BeMessage::CopyDone)?;
|
||||||
@@ -862,6 +911,7 @@ where
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[instrument(skip_all, fields(tenant_id, timeline_id))]
|
||||||
async fn process_query(
|
async fn process_query(
|
||||||
&mut self,
|
&mut self,
|
||||||
pgb: &mut PostgresBackend<IO>,
|
pgb: &mut PostgresBackend<IO>,
|
||||||
@@ -883,6 +933,10 @@ where
|
|||||||
let timeline_id = TimelineId::from_str(params[1])
|
let timeline_id = TimelineId::from_str(params[1])
|
||||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||||
|
|
||||||
|
tracing::Span::current()
|
||||||
|
.record("tenant_id", field::display(tenant_id))
|
||||||
|
.record("timeline_id", field::display(timeline_id));
|
||||||
|
|
||||||
self.check_permission(Some(tenant_id))?;
|
self.check_permission(Some(tenant_id))?;
|
||||||
|
|
||||||
self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx)
|
self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx)
|
||||||
@@ -902,6 +956,10 @@ where
|
|||||||
let timeline_id = TimelineId::from_str(params[1])
|
let timeline_id = TimelineId::from_str(params[1])
|
||||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||||
|
|
||||||
|
tracing::Span::current()
|
||||||
|
.record("tenant_id", field::display(tenant_id))
|
||||||
|
.record("timeline_id", field::display(timeline_id));
|
||||||
|
|
||||||
self.check_permission(Some(tenant_id))?;
|
self.check_permission(Some(tenant_id))?;
|
||||||
|
|
||||||
let lsn = if params.len() >= 3 {
|
let lsn = if params.len() >= 3 {
|
||||||
@@ -913,6 +971,19 @@ where
|
|||||||
None
|
None
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let gzip = if params.len() >= 4 {
|
||||||
|
if params[3] == "--gzip" {
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
return Err(QueryError::Other(anyhow::anyhow!(
|
||||||
|
"Parameter in position 3 unknown {}",
|
||||||
|
params[3],
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
};
|
||||||
|
|
||||||
metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
||||||
&*crate::metrics::BASEBACKUP_QUERY_TIME,
|
&*crate::metrics::BASEBACKUP_QUERY_TIME,
|
||||||
async move {
|
async move {
|
||||||
@@ -923,6 +994,7 @@ where
|
|||||||
lsn,
|
lsn,
|
||||||
None,
|
None,
|
||||||
false,
|
false,
|
||||||
|
gzip,
|
||||||
ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
@@ -948,6 +1020,10 @@ where
|
|||||||
let timeline_id = TimelineId::from_str(params[1])
|
let timeline_id = TimelineId::from_str(params[1])
|
||||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||||
|
|
||||||
|
tracing::Span::current()
|
||||||
|
.record("tenant_id", field::display(tenant_id))
|
||||||
|
.record("timeline_id", field::display(timeline_id));
|
||||||
|
|
||||||
self.check_permission(Some(tenant_id))?;
|
self.check_permission(Some(tenant_id))?;
|
||||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
||||||
|
|
||||||
@@ -979,6 +1055,10 @@ where
|
|||||||
let timeline_id = TimelineId::from_str(params[1])
|
let timeline_id = TimelineId::from_str(params[1])
|
||||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||||
|
|
||||||
|
tracing::Span::current()
|
||||||
|
.record("tenant_id", field::display(tenant_id))
|
||||||
|
.record("timeline_id", field::display(timeline_id));
|
||||||
|
|
||||||
// The caller is responsible for providing correct lsn and prev_lsn.
|
// The caller is responsible for providing correct lsn and prev_lsn.
|
||||||
let lsn = if params.len() > 2 {
|
let lsn = if params.len() > 2 {
|
||||||
Some(
|
Some(
|
||||||
@@ -1000,8 +1080,17 @@ where
|
|||||||
self.check_permission(Some(tenant_id))?;
|
self.check_permission(Some(tenant_id))?;
|
||||||
|
|
||||||
// Check that the timeline exists
|
// Check that the timeline exists
|
||||||
self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx)
|
self.handle_basebackup_request(
|
||||||
.await?;
|
pgb,
|
||||||
|
tenant_id,
|
||||||
|
timeline_id,
|
||||||
|
lsn,
|
||||||
|
prev_lsn,
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||||
} else if query_string.starts_with("import basebackup ") {
|
} else if query_string.starts_with("import basebackup ") {
|
||||||
// Import the `base` section (everything but the wal) of a basebackup.
|
// Import the `base` section (everything but the wal) of a basebackup.
|
||||||
@@ -1033,6 +1122,10 @@ where
|
|||||||
let pg_version = u32::from_str(params[4])
|
let pg_version = u32::from_str(params[4])
|
||||||
.with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
|
.with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
|
||||||
|
|
||||||
|
tracing::Span::current()
|
||||||
|
.record("tenant_id", field::display(tenant_id))
|
||||||
|
.record("timeline_id", field::display(timeline_id));
|
||||||
|
|
||||||
self.check_permission(Some(tenant_id))?;
|
self.check_permission(Some(tenant_id))?;
|
||||||
|
|
||||||
match self
|
match self
|
||||||
@@ -1077,6 +1170,10 @@ where
|
|||||||
let end_lsn = Lsn::from_str(params[3])
|
let end_lsn = Lsn::from_str(params[3])
|
||||||
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
|
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
|
||||||
|
|
||||||
|
tracing::Span::current()
|
||||||
|
.record("tenant_id", field::display(tenant_id))
|
||||||
|
.record("timeline_id", field::display(timeline_id));
|
||||||
|
|
||||||
self.check_permission(Some(tenant_id))?;
|
self.check_permission(Some(tenant_id))?;
|
||||||
|
|
||||||
match self
|
match self
|
||||||
@@ -1108,6 +1205,8 @@ where
|
|||||||
let tenant_id = TenantId::from_str(params[0])
|
let tenant_id = TenantId::from_str(params[0])
|
||||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||||
|
|
||||||
|
tracing::Span::current().record("tenant_id", field::display(tenant_id));
|
||||||
|
|
||||||
self.check_permission(Some(tenant_id))?;
|
self.check_permission(Some(tenant_id))?;
|
||||||
|
|
||||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
||||||
|
|||||||
@@ -1131,7 +1131,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
/// context, breaking the atomicity is OK. If the import is interrupted, the
|
/// context, breaking the atomicity is OK. If the import is interrupted, the
|
||||||
/// whole import fails and the timeline will be deleted anyway.
|
/// whole import fails and the timeline will be deleted anyway.
|
||||||
/// (Or to be precise, it will be left behind for debugging purposes and
|
/// (Or to be precise, it will be left behind for debugging purposes and
|
||||||
/// ignored, see https://github.com/neondatabase/neon/pull/1809)
|
/// ignored, see <https://github.com/neondatabase/neon/pull/1809>)
|
||||||
///
|
///
|
||||||
/// Note: A consequence of flushing the pending operations is that they
|
/// Note: A consequence of flushing the pending operations is that they
|
||||||
/// won't be visible to subsequent operations until `commit`. The function
|
/// won't be visible to subsequent operations until `commit`. The function
|
||||||
|
|||||||
@@ -205,7 +205,7 @@ pub enum TaskKind {
|
|||||||
///
|
///
|
||||||
/// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
|
/// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
|
||||||
/// That abstraction doesn't use `task_mgr`.
|
/// That abstraction doesn't use `task_mgr`.
|
||||||
/// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task.
|
/// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
|
||||||
/// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
|
/// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
|
||||||
///
|
///
|
||||||
/// Once the connection is established, the `TaskHandle` task creates a
|
/// Once the connection is established, the `TaskHandle` task creates a
|
||||||
@@ -213,16 +213,21 @@ pub enum TaskKind {
|
|||||||
/// the `Connection` object.
|
/// the `Connection` object.
|
||||||
/// A `CancellationToken` created by the `TaskHandle` task ensures
|
/// A `CancellationToken` created by the `TaskHandle` task ensures
|
||||||
/// that the [`WalReceiverConnectionPoller`] task will cancel soon after the `TaskHandle` is dropped.
|
/// that the [`WalReceiverConnectionPoller`] task will cancel soon after the `TaskHandle` is dropped.
|
||||||
|
///
|
||||||
|
/// [`WalReceiverConnectionHandler`]: Self::WalReceiverConnectionHandler
|
||||||
|
/// [`WalReceiverConnectionPoller`]: Self::WalReceiverConnectionPoller
|
||||||
WalReceiverManager,
|
WalReceiverManager,
|
||||||
|
|
||||||
/// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`].
|
/// The `TaskHandle` task that executes `handle_walreceiver_connection`.
|
||||||
/// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
|
/// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
|
||||||
/// See the comment on [`WalReceiverManager`].
|
/// See the comment on [`WalReceiverManager`].
|
||||||
|
///
|
||||||
|
/// [`WalReceiverManager`]: Self::WalReceiverManager
|
||||||
WalReceiverConnectionHandler,
|
WalReceiverConnectionHandler,
|
||||||
|
|
||||||
/// The task that polls the `tokio-postgres::Connection` object.
|
/// The task that polls the `tokio-postgres::Connection` object.
|
||||||
/// Spawned by task [`WalReceiverConnectionHandler`].
|
/// Spawned by task [`WalReceiverConnectionHandler`](Self::WalReceiverConnectionHandler).
|
||||||
/// See the comment on [`WalReceiverManager`].
|
/// See the comment on [`WalReceiverManager`](Self::WalReceiverManager).
|
||||||
WalReceiverConnectionPoller,
|
WalReceiverConnectionPoller,
|
||||||
|
|
||||||
// Garbage collection worker. One per tenant
|
// Garbage collection worker. One per tenant
|
||||||
|
|||||||
@@ -84,6 +84,25 @@ use utils::{
|
|||||||
lsn::{Lsn, RecordLsn},
|
lsn::{Lsn, RecordLsn},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Declare a failpoint that can use the `pause` failpoint action.
|
||||||
|
/// We don't want to block the executor thread, hence, spawn_blocking + await.
|
||||||
|
macro_rules! pausable_failpoint {
|
||||||
|
($name:literal) => {
|
||||||
|
if cfg!(feature = "testing") {
|
||||||
|
tokio::task::spawn_blocking({
|
||||||
|
let current = tracing::Span::current();
|
||||||
|
move || {
|
||||||
|
let _entered = current.entered();
|
||||||
|
tracing::info!("at failpoint {}", $name);
|
||||||
|
fail::fail_point!($name);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.expect("spawn_blocking");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
pub mod blob_io;
|
pub mod blob_io;
|
||||||
pub mod block_io;
|
pub mod block_io;
|
||||||
pub mod disk_btree;
|
pub mod disk_btree;
|
||||||
@@ -114,7 +133,7 @@ pub use timeline::{
|
|||||||
// re-export this function so that page_cache.rs can use it.
|
// re-export this function so that page_cache.rs can use it.
|
||||||
pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
|
pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
|
||||||
|
|
||||||
// re-export for use in storage_sync.rs
|
// re-export for use in remote_timeline_client.rs
|
||||||
pub use crate::tenant::metadata::save_metadata;
|
pub use crate::tenant::metadata::save_metadata;
|
||||||
|
|
||||||
// re-export for use in walreceiver
|
// re-export for use in walreceiver
|
||||||
@@ -410,7 +429,7 @@ impl Tenant {
|
|||||||
.layers
|
.layers
|
||||||
.read()
|
.read()
|
||||||
.await
|
.await
|
||||||
.0
|
.layer_map()
|
||||||
.iter_historic_layers()
|
.iter_historic_layers()
|
||||||
.next()
|
.next()
|
||||||
.is_some(),
|
.is_some(),
|
||||||
@@ -421,8 +440,8 @@ impl Tenant {
|
|||||||
if !picked_local {
|
if !picked_local {
|
||||||
save_metadata(
|
save_metadata(
|
||||||
self.conf,
|
self.conf,
|
||||||
timeline_id,
|
&tenant_id,
|
||||||
tenant_id,
|
&timeline_id,
|
||||||
up_to_date_metadata,
|
up_to_date_metadata,
|
||||||
first_save,
|
first_save,
|
||||||
)
|
)
|
||||||
@@ -451,7 +470,7 @@ impl Tenant {
|
|||||||
) -> anyhow::Result<Arc<Tenant>> {
|
) -> anyhow::Result<Arc<Tenant>> {
|
||||||
// TODO dedup with spawn_load
|
// TODO dedup with spawn_load
|
||||||
let tenant_conf =
|
let tenant_conf =
|
||||||
Self::load_tenant_config(conf, tenant_id).context("load tenant config")?;
|
Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?;
|
||||||
|
|
||||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
|
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
|
||||||
let tenant = Arc::new(Tenant::new(
|
let tenant = Arc::new(Tenant::new(
|
||||||
@@ -560,7 +579,7 @@ impl Tenant {
|
|||||||
.map(move |res| {
|
.map(move |res| {
|
||||||
res.with_context(|| format!("download index part for timeline {timeline_id}"))
|
res.with_context(|| format!("download index part for timeline {timeline_id}"))
|
||||||
})
|
})
|
||||||
.instrument(info_span!("download_index_part", timeline=%timeline_id)),
|
.instrument(info_span!("download_index_part", %timeline_id)),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
// Wait for all the download tasks to complete & collect results.
|
// Wait for all the download tasks to complete & collect results.
|
||||||
@@ -646,7 +665,7 @@ impl Tenant {
|
|||||||
span::debug_assert_current_span_has_tenant_id();
|
span::debug_assert_current_span_has_tenant_id();
|
||||||
|
|
||||||
info!("downloading index file for timeline {}", timeline_id);
|
info!("downloading index file for timeline {}", timeline_id);
|
||||||
tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id))
|
tokio::fs::create_dir_all(self.conf.timeline_path(&self.tenant_id, &timeline_id))
|
||||||
.await
|
.await
|
||||||
.context("Failed to create new timeline directory")?;
|
.context("Failed to create new timeline directory")?;
|
||||||
|
|
||||||
@@ -724,7 +743,7 @@ impl Tenant {
|
|||||||
) -> Arc<Tenant> {
|
) -> Arc<Tenant> {
|
||||||
span::debug_assert_current_span_has_tenant_id();
|
span::debug_assert_current_span_has_tenant_id();
|
||||||
|
|
||||||
let tenant_conf = match Self::load_tenant_config(conf, tenant_id) {
|
let tenant_conf = match Self::load_tenant_config(conf, &tenant_id) {
|
||||||
Ok(conf) => conf,
|
Ok(conf) => conf,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!("load tenant config failed: {:?}", e);
|
error!("load tenant config failed: {:?}", e);
|
||||||
@@ -835,7 +854,7 @@ impl Tenant {
|
|||||||
timeline_uninit_mark_file.display()
|
timeline_uninit_mark_file.display()
|
||||||
)
|
)
|
||||||
})?;
|
})?;
|
||||||
let timeline_dir = self.conf.timeline_path(&timeline_id, &self.tenant_id);
|
let timeline_dir = self.conf.timeline_path(&self.tenant_id, &timeline_id);
|
||||||
if let Err(e) =
|
if let Err(e) =
|
||||||
remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
|
remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
|
||||||
{
|
{
|
||||||
@@ -880,7 +899,7 @@ impl Tenant {
|
|||||||
if let Ok(timeline_id) =
|
if let Ok(timeline_id) =
|
||||||
file_name.to_str().unwrap_or_default().parse::<TimelineId>()
|
file_name.to_str().unwrap_or_default().parse::<TimelineId>()
|
||||||
{
|
{
|
||||||
let metadata = load_metadata(self.conf, timeline_id, self.tenant_id)
|
let metadata = load_metadata(self.conf, &self.tenant_id, &timeline_id)
|
||||||
.context("failed to load metadata")?;
|
.context("failed to load metadata")?;
|
||||||
timelines_to_load.insert(timeline_id, metadata);
|
timelines_to_load.insert(timeline_id, metadata);
|
||||||
} else {
|
} else {
|
||||||
@@ -1349,7 +1368,7 @@ impl Tenant {
|
|||||||
for (timeline_id, timeline) in &timelines_to_compact {
|
for (timeline_id, timeline) in &timelines_to_compact {
|
||||||
timeline
|
timeline
|
||||||
.compact(ctx)
|
.compact(ctx)
|
||||||
.instrument(info_span!("compact_timeline", timeline = %timeline_id))
|
.instrument(info_span!("compact_timeline", %timeline_id))
|
||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1440,12 +1459,12 @@ impl Tenant {
|
|||||||
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
|
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
|
||||||
info!("got layer_removal_cs.lock(), deleting layer files");
|
info!("got layer_removal_cs.lock(), deleting layer files");
|
||||||
|
|
||||||
// NB: storage_sync upload tasks that reference these layers have been cancelled
|
// NB: remote_timeline_client upload tasks that reference these layers have been cancelled
|
||||||
// by the caller.
|
// by the caller.
|
||||||
|
|
||||||
let local_timeline_directory = self
|
let local_timeline_directory = self
|
||||||
.conf
|
.conf
|
||||||
.timeline_path(&timeline.timeline_id, &self.tenant_id);
|
.timeline_path(&self.tenant_id, &timeline.timeline_id);
|
||||||
|
|
||||||
fail::fail_point!("timeline-delete-before-rm", |_| {
|
fail::fail_point!("timeline-delete-before-rm", |_| {
|
||||||
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
|
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
|
||||||
@@ -1498,20 +1517,7 @@ impl Tenant {
|
|||||||
remote_client.delete_all().await.context("delete_all")?
|
remote_client.delete_all().await.context("delete_all")?
|
||||||
};
|
};
|
||||||
|
|
||||||
// Have a failpoint that can use the `pause` failpoint action.
|
pausable_failpoint!("in_progress_delete");
|
||||||
// We don't want to block the executor thread, hence, spawn_blocking + await.
|
|
||||||
if cfg!(feature = "testing") {
|
|
||||||
tokio::task::spawn_blocking({
|
|
||||||
let current = tracing::Span::current();
|
|
||||||
move || {
|
|
||||||
let _entered = current.entered();
|
|
||||||
tracing::info!("at failpoint in_progress_delete");
|
|
||||||
fail::fail_point!("in_progress_delete");
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.await
|
|
||||||
.expect("spawn_blocking");
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
{
|
||||||
// Remove the timeline from the map.
|
// Remove the timeline from the map.
|
||||||
@@ -2226,7 +2232,7 @@ impl Tenant {
|
|||||||
/// Locate and load config
|
/// Locate and load config
|
||||||
pub(super) fn load_tenant_config(
|
pub(super) fn load_tenant_config(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_id: TenantId,
|
tenant_id: &TenantId,
|
||||||
) -> anyhow::Result<TenantConfOpt> {
|
) -> anyhow::Result<TenantConfOpt> {
|
||||||
let target_config_path = conf.tenant_config_path(tenant_id);
|
let target_config_path = conf.tenant_config_path(tenant_id);
|
||||||
let target_config_display = target_config_path.display();
|
let target_config_display = target_config_path.display();
|
||||||
@@ -2813,7 +2819,7 @@ impl Tenant {
|
|||||||
timeline_struct.init_empty_layer_map(start_lsn);
|
timeline_struct.init_empty_layer_map(start_lsn);
|
||||||
|
|
||||||
if let Err(e) =
|
if let Err(e) =
|
||||||
self.create_timeline_files(&uninit_mark.timeline_path, new_timeline_id, new_metadata)
|
self.create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
|
||||||
{
|
{
|
||||||
error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}");
|
error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}");
|
||||||
cleanup_timeline_directory(uninit_mark);
|
cleanup_timeline_directory(uninit_mark);
|
||||||
@@ -2832,7 +2838,7 @@ impl Tenant {
|
|||||||
fn create_timeline_files(
|
fn create_timeline_files(
|
||||||
&self,
|
&self,
|
||||||
timeline_path: &Path,
|
timeline_path: &Path,
|
||||||
new_timeline_id: TimelineId,
|
new_timeline_id: &TimelineId,
|
||||||
new_metadata: &TimelineMetadata,
|
new_metadata: &TimelineMetadata,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
|
crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
|
||||||
@@ -2843,8 +2849,8 @@ impl Tenant {
|
|||||||
|
|
||||||
save_metadata(
|
save_metadata(
|
||||||
self.conf,
|
self.conf,
|
||||||
|
&self.tenant_id,
|
||||||
new_timeline_id,
|
new_timeline_id,
|
||||||
self.tenant_id,
|
|
||||||
new_metadata,
|
new_metadata,
|
||||||
true,
|
true,
|
||||||
)
|
)
|
||||||
@@ -2867,7 +2873,7 @@ impl Tenant {
|
|||||||
timelines.get(&timeline_id).is_none(),
|
timelines.get(&timeline_id).is_none(),
|
||||||
"Timeline {tenant_id}/{timeline_id} already exists in pageserver's memory"
|
"Timeline {tenant_id}/{timeline_id} already exists in pageserver's memory"
|
||||||
);
|
);
|
||||||
let timeline_path = self.conf.timeline_path(&timeline_id, &tenant_id);
|
let timeline_path = self.conf.timeline_path(&tenant_id, &timeline_id);
|
||||||
anyhow::ensure!(
|
anyhow::ensure!(
|
||||||
!timeline_path.exists(),
|
!timeline_path.exists(),
|
||||||
"Timeline {} already exists, cannot create its uninit mark file",
|
"Timeline {} already exists, cannot create its uninit mark file",
|
||||||
@@ -2998,10 +3004,10 @@ pub(crate) enum CreateTenantFilesMode {
|
|||||||
pub(crate) fn create_tenant_files(
|
pub(crate) fn create_tenant_files(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_conf: TenantConfOpt,
|
tenant_conf: TenantConfOpt,
|
||||||
tenant_id: TenantId,
|
tenant_id: &TenantId,
|
||||||
mode: CreateTenantFilesMode,
|
mode: CreateTenantFilesMode,
|
||||||
) -> anyhow::Result<PathBuf> {
|
) -> anyhow::Result<PathBuf> {
|
||||||
let target_tenant_directory = conf.tenant_path(&tenant_id);
|
let target_tenant_directory = conf.tenant_path(tenant_id);
|
||||||
anyhow::ensure!(
|
anyhow::ensure!(
|
||||||
!target_tenant_directory
|
!target_tenant_directory
|
||||||
.try_exists()
|
.try_exists()
|
||||||
@@ -3052,7 +3058,7 @@ pub(crate) fn create_tenant_files(
|
|||||||
fn try_create_target_tenant_dir(
|
fn try_create_target_tenant_dir(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_conf: TenantConfOpt,
|
tenant_conf: TenantConfOpt,
|
||||||
tenant_id: TenantId,
|
tenant_id: &TenantId,
|
||||||
mode: CreateTenantFilesMode,
|
mode: CreateTenantFilesMode,
|
||||||
temporary_tenant_dir: &Path,
|
temporary_tenant_dir: &Path,
|
||||||
target_tenant_directory: &Path,
|
target_tenant_directory: &Path,
|
||||||
@@ -3076,7 +3082,7 @@ fn try_create_target_tenant_dir(
|
|||||||
}
|
}
|
||||||
|
|
||||||
let temporary_tenant_timelines_dir = rebase_directory(
|
let temporary_tenant_timelines_dir = rebase_directory(
|
||||||
&conf.timelines_path(&tenant_id),
|
&conf.timelines_path(tenant_id),
|
||||||
target_tenant_directory,
|
target_tenant_directory,
|
||||||
temporary_tenant_dir,
|
temporary_tenant_dir,
|
||||||
)
|
)
|
||||||
@@ -3088,7 +3094,7 @@ fn try_create_target_tenant_dir(
|
|||||||
)
|
)
|
||||||
.with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;
|
.with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;
|
||||||
|
|
||||||
Tenant::persist_tenant_config(&tenant_id, &temporary_tenant_config_path, tenant_conf, true)?;
|
Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf, true)?;
|
||||||
|
|
||||||
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
|
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
|
||||||
format!(
|
format!(
|
||||||
@@ -3376,7 +3382,7 @@ pub mod harness {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn timeline_path(&self, timeline_id: &TimelineId) -> PathBuf {
|
pub fn timeline_path(&self, timeline_id: &TimelineId) -> PathBuf {
|
||||||
self.conf.timeline_path(timeline_id, &self.tenant_id)
|
self.conf.timeline_path(&self.tenant_id, timeline_id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -4329,13 +4335,13 @@ mod tests {
|
|||||||
// assert freeze_and_flush exercised the initdb optimization
|
// assert freeze_and_flush exercised the initdb optimization
|
||||||
{
|
{
|
||||||
let state = tline.flush_loop_state.lock().unwrap();
|
let state = tline.flush_loop_state.lock().unwrap();
|
||||||
let
|
let timeline::FlushLoopState::Running {
|
||||||
timeline::FlushLoopState::Running {
|
expect_initdb_optimization,
|
||||||
expect_initdb_optimization,
|
initdb_optimization_count,
|
||||||
initdb_optimization_count,
|
} = *state
|
||||||
} = *state else {
|
else {
|
||||||
panic!("unexpected state: {:?}", *state);
|
panic!("unexpected state: {:?}", *state);
|
||||||
};
|
};
|
||||||
assert!(expect_initdb_optimization);
|
assert!(expect_initdb_optimization);
|
||||||
assert!(initdb_optimization_count > 0);
|
assert!(initdb_optimization_count > 0);
|
||||||
}
|
}
|
||||||
@@ -4370,7 +4376,7 @@ mod tests {
|
|||||||
|
|
||||||
assert!(!harness
|
assert!(!harness
|
||||||
.conf
|
.conf
|
||||||
.timeline_path(&TIMELINE_ID, &tenant.tenant_id)
|
.timeline_path(&tenant.tenant_id, &TIMELINE_ID)
|
||||||
.exists());
|
.exists());
|
||||||
|
|
||||||
assert!(!harness
|
assert!(!harness
|
||||||
|
|||||||
@@ -442,7 +442,7 @@ where
|
|||||||
writer: W,
|
writer: W,
|
||||||
|
|
||||||
///
|
///
|
||||||
/// stack[0] is the current root page, stack.last() is the leaf.
|
/// `stack[0]` is the current root page, `stack.last()` is the leaf.
|
||||||
///
|
///
|
||||||
/// We maintain the length of the stack to be always greater than zero.
|
/// We maintain the length of the stack to be always greater than zero.
|
||||||
/// Two exceptions are:
|
/// Two exceptions are:
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ impl EphemeralFile {
|
|||||||
l.next_file_id += 1;
|
l.next_file_id += 1;
|
||||||
|
|
||||||
let filename = conf
|
let filename = conf
|
||||||
.timeline_path(&timeline_id, &tenant_id)
|
.timeline_path(&tenant_id, &timeline_id)
|
||||||
.join(PathBuf::from(format!("ephemeral-{}", file_id)));
|
.join(PathBuf::from(format!("ephemeral-{}", file_id)));
|
||||||
|
|
||||||
let file = VirtualFile::open_with_options(
|
let file = VirtualFile::open_with_options(
|
||||||
@@ -346,7 +346,7 @@ mod tests {
|
|||||||
|
|
||||||
let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap();
|
let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap();
|
||||||
let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
|
let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
|
||||||
fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?;
|
fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?;
|
||||||
|
|
||||||
Ok((conf, tenant_id, timeline_id))
|
Ok((conf, tenant_id, timeline_id))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,7 +16,7 @@
|
|||||||
//! Other read methods are less critical but still impact performance of background tasks.
|
//! Other read methods are less critical but still impact performance of background tasks.
|
||||||
//!
|
//!
|
||||||
//! This data structure relies on a persistent/immutable binary search tree. See the
|
//! This data structure relies on a persistent/immutable binary search tree. See the
|
||||||
//! following lecture for an introduction https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
|
//! following lecture for an introduction <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
|
||||||
//! Summary: A persistent/immutable BST (and persistent data structures in general) allows
|
//! Summary: A persistent/immutable BST (and persistent data structures in general) allows
|
||||||
//! you to modify the tree in such a way that each modification creates a new "version"
|
//! you to modify the tree in such a way that each modification creates a new "version"
|
||||||
//! of the tree. When you modify it, you get a new version, but all previous versions are
|
//! of the tree. When you modify it, you get a new version, but all previous versions are
|
||||||
@@ -40,7 +40,7 @@
|
|||||||
//! afterwards. We can add layers as long as they have larger LSNs than any previous layer in
|
//! afterwards. We can add layers as long as they have larger LSNs than any previous layer in
|
||||||
//! the map, but if we need to remove a layer, or insert anything with an older LSN, we need
|
//! the map, but if we need to remove a layer, or insert anything with an older LSN, we need
|
||||||
//! to throw away most of the persistent BST and build a new one, starting from the oldest
|
//! to throw away most of the persistent BST and build a new one, starting from the oldest
|
||||||
//! LSN. See `LayerMap::flush_updates()`.
|
//! LSN. See [`LayerMap::flush_updates()`].
|
||||||
//!
|
//!
|
||||||
|
|
||||||
mod historic_layer_coverage;
|
mod historic_layer_coverage;
|
||||||
@@ -60,7 +60,6 @@ use utils::lsn::Lsn;
|
|||||||
use historic_layer_coverage::BufferedHistoricLayerCoverage;
|
use historic_layer_coverage::BufferedHistoricLayerCoverage;
|
||||||
pub use historic_layer_coverage::LayerKey;
|
pub use historic_layer_coverage::LayerKey;
|
||||||
|
|
||||||
use super::storage_layer::range_eq;
|
|
||||||
use super::storage_layer::PersistentLayerDesc;
|
use super::storage_layer::PersistentLayerDesc;
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -365,7 +364,7 @@ impl LayerMap {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
|
pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
|
||||||
range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX))
|
layer.get_key_range() == (Key::MIN..Key::MAX)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This function determines which layers are counted in `count_deltas`:
|
/// This function determines which layers are counted in `count_deltas`:
|
||||||
@@ -397,7 +396,7 @@ impl LayerMap {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Case 2
|
// Case 2
|
||||||
if range_eq(partition_range, &(Key::MIN..Key::MAX)) {
|
if partition_range == &(Key::MIN..Key::MAX) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -652,19 +651,35 @@ impl LayerMap {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::LayerMap;
|
use super::LayerMap;
|
||||||
use crate::tenant::storage_layer::{tests::LayerDescriptor, LayerFileName};
|
use crate::tenant::storage_layer::LayerFileName;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
mod l0_delta_layers_updated {
|
mod l0_delta_layers_updated {
|
||||||
|
|
||||||
use crate::tenant::{
|
use crate::tenant::{
|
||||||
storage_layer::{PersistentLayer, PersistentLayerDesc},
|
storage_layer::{AsLayerDesc, PersistentLayerDesc},
|
||||||
timeline::LayerFileManager,
|
timeline::layer_manager::LayerFileManager,
|
||||||
};
|
};
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
|
struct LayerObject(PersistentLayerDesc);
|
||||||
|
|
||||||
|
impl AsLayerDesc for LayerObject {
|
||||||
|
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||||
|
&self.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LayerObject {
|
||||||
|
fn new(desc: PersistentLayerDesc) -> Self {
|
||||||
|
LayerObject(desc)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type TestLayerFileManager = LayerFileManager<LayerObject>;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn for_full_range_delta() {
|
fn for_full_range_delta() {
|
||||||
// l0_delta_layers are used by compaction, and should observe all buffered updates
|
// l0_delta_layers are used by compaction, and should observe all buffered updates
|
||||||
@@ -701,18 +716,18 @@ mod tests {
|
|||||||
|
|
||||||
let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
|
let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
|
||||||
let layer = LayerFileName::from_str(layer).unwrap();
|
let layer = LayerFileName::from_str(layer).unwrap();
|
||||||
let layer = LayerDescriptor::from(layer);
|
let layer = PersistentLayerDesc::from(layer);
|
||||||
|
|
||||||
// same skeleton construction; see scenario below
|
// same skeleton construction; see scenario below
|
||||||
let not_found = Arc::new(layer.clone());
|
let not_found = Arc::new(LayerObject::new(layer.clone()));
|
||||||
let new_version = Arc::new(layer);
|
let new_version = Arc::new(LayerObject::new(layer));
|
||||||
|
|
||||||
// after the immutable storage state refactor, the replace operation
|
// after the immutable storage state refactor, the replace operation
|
||||||
// will not use layer map any more. We keep it here for consistency in test cases
|
// will not use layer map any more. We keep it here for consistency in test cases
|
||||||
// and can remove it in the future.
|
// and can remove it in the future.
|
||||||
let _map = LayerMap::default();
|
let _map = LayerMap::default();
|
||||||
|
|
||||||
let mut mapping = LayerFileManager::new();
|
let mut mapping = TestLayerFileManager::new();
|
||||||
|
|
||||||
mapping
|
mapping
|
||||||
.replace_and_verify(not_found, new_version)
|
.replace_and_verify(not_found, new_version)
|
||||||
@@ -721,10 +736,10 @@ mod tests {
|
|||||||
|
|
||||||
fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
|
fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
|
||||||
let name = LayerFileName::from_str(layer_name).unwrap();
|
let name = LayerFileName::from_str(layer_name).unwrap();
|
||||||
let skeleton = LayerDescriptor::from(name);
|
let skeleton = PersistentLayerDesc::from(name);
|
||||||
|
|
||||||
let remote = Arc::new(skeleton.clone());
|
let remote = Arc::new(LayerObject::new(skeleton.clone()));
|
||||||
let downloaded = Arc::new(skeleton);
|
let downloaded = Arc::new(LayerObject::new(skeleton));
|
||||||
|
|
||||||
let mut map = LayerMap::default();
|
let mut map = LayerMap::default();
|
||||||
let mut mapping = LayerFileManager::new();
|
let mut mapping = LayerFileManager::new();
|
||||||
|
|||||||
@@ -122,8 +122,7 @@ impl<Value: Clone> HistoricLayerCoverage<Value> {
|
|||||||
self.head = self
|
self.head = self
|
||||||
.historic
|
.historic
|
||||||
.iter()
|
.iter()
|
||||||
.rev()
|
.next_back()
|
||||||
.next()
|
|
||||||
.map(|(_, v)| v.clone())
|
.map(|(_, v)| v.clone())
|
||||||
.unwrap_or_default();
|
.unwrap_or_default();
|
||||||
}
|
}
|
||||||
@@ -412,7 +411,7 @@ fn test_persistent_overlapping() {
|
|||||||
/// still be more critical.
|
/// still be more critical.
|
||||||
///
|
///
|
||||||
/// See this for more on persistent and retroactive techniques:
|
/// See this for more on persistent and retroactive techniques:
|
||||||
/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
|
/// <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
|
||||||
pub struct BufferedHistoricLayerCoverage<Value> {
|
pub struct BufferedHistoricLayerCoverage<Value> {
|
||||||
/// A persistent layer map that we rebuild when we need to retroactively update
|
/// A persistent layer map that we rebuild when we need to retroactively update
|
||||||
historic_coverage: HistoricLayerCoverage<Value>,
|
historic_coverage: HistoricLayerCoverage<Value>,
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ use std::ops::Range;
|
|||||||
|
|
||||||
// NOTE the `im` crate has 20x more downloads and also has
|
// NOTE the `im` crate has 20x more downloads and also has
|
||||||
// persistent/immutable BTree. But it's bugged so rpds is a
|
// persistent/immutable BTree. But it's bugged so rpds is a
|
||||||
// better choice https://github.com/neondatabase/neon/issues/3395
|
// better choice <https://github.com/neondatabase/neon/issues/3395>
|
||||||
use rpds::RedBlackTreeMapSync;
|
use rpds::RedBlackTreeMapSync;
|
||||||
|
|
||||||
/// Data structure that can efficiently:
|
/// Data structure that can efficiently:
|
||||||
@@ -11,7 +11,7 @@ use rpds::RedBlackTreeMapSync;
|
|||||||
/// - insert layers in non-decreasing lsn.start order
|
/// - insert layers in non-decreasing lsn.start order
|
||||||
///
|
///
|
||||||
/// For a detailed explanation and justification of this approach, see:
|
/// For a detailed explanation and justification of this approach, see:
|
||||||
/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing
|
/// <https://neon.tech/blog/persistent-structures-in-neons-wal-indexing>
|
||||||
///
|
///
|
||||||
/// NOTE The struct is parameterized over Value for easier
|
/// NOTE The struct is parameterized over Value for easier
|
||||||
/// testing, but in practice it's some sort of layer.
|
/// testing, but in practice it's some sort of layer.
|
||||||
@@ -113,8 +113,7 @@ impl<Value: Clone> LayerCoverage<Value> {
|
|||||||
pub fn query(&self, key: i128) -> Option<Value> {
|
pub fn query(&self, key: i128) -> Option<Value> {
|
||||||
self.nodes
|
self.nodes
|
||||||
.range(..=key)
|
.range(..=key)
|
||||||
.rev()
|
.next_back()?
|
||||||
.next()?
|
|
||||||
.1
|
.1
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.map(|(_, v)| v.clone())
|
.map(|(_, v)| v.clone())
|
||||||
|
|||||||
@@ -24,7 +24,7 @@
|
|||||||
//! Currently, this is not used in the system. Future refactors will ensure
|
//! Currently, this is not used in the system. Future refactors will ensure
|
||||||
//! the storage state will be recorded in this file, and the system can be
|
//! the storage state will be recorded in this file, and the system can be
|
||||||
//! recovered from this file. This is tracked in
|
//! recovered from this file. This is tracked in
|
||||||
//! https://github.com/neondatabase/neon/issues/4418
|
//! <https://github.com/neondatabase/neon/issues/4418>
|
||||||
|
|
||||||
use std::io::{self, Read, Write};
|
use std::io::{self, Read, Write};
|
||||||
|
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
//! Every image of a certain timeline from [`crate::tenant::Tenant`]
|
//! Every image of a certain timeline from [`crate::tenant::Tenant`]
|
||||||
//! has a metadata that needs to be stored persistently.
|
//! has a metadata that needs to be stored persistently.
|
||||||
//!
|
//!
|
||||||
//! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of
|
//! Later, the file gets used in [`remote_timeline_client`] as a part of
|
||||||
//! external storage import and export operations.
|
//! external storage import and export operations.
|
||||||
//!
|
//!
|
||||||
//! The module contains all structs and related helper methods related to timeline metadata.
|
//! The module contains all structs and related helper methods related to timeline metadata.
|
||||||
|
//!
|
||||||
|
//! [`remote_timeline_client`]: super::remote_timeline_client
|
||||||
|
|
||||||
use std::fs::{File, OpenOptions};
|
use std::fs::{File, OpenOptions};
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
@@ -232,13 +234,13 @@ impl TimelineMetadata {
|
|||||||
/// Save timeline metadata to file
|
/// Save timeline metadata to file
|
||||||
pub fn save_metadata(
|
pub fn save_metadata(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
tenant_id: &TenantId,
|
||||||
tenant_id: TenantId,
|
timeline_id: &TimelineId,
|
||||||
data: &TimelineMetadata,
|
data: &TimelineMetadata,
|
||||||
first_save: bool,
|
first_save: bool,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let _enter = info_span!("saving metadata").entered();
|
let _enter = info_span!("saving metadata").entered();
|
||||||
let path = conf.metadata_path(timeline_id, tenant_id);
|
let path = conf.metadata_path(tenant_id, timeline_id);
|
||||||
// use OpenOptions to ensure file presence is consistent with first_save
|
// use OpenOptions to ensure file presence is consistent with first_save
|
||||||
let mut file = VirtualFile::open_with_options(
|
let mut file = VirtualFile::open_with_options(
|
||||||
&path,
|
&path,
|
||||||
@@ -267,10 +269,10 @@ pub fn save_metadata(
|
|||||||
|
|
||||||
pub fn load_metadata(
|
pub fn load_metadata(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
tenant_id: &TenantId,
|
||||||
tenant_id: TenantId,
|
timeline_id: &TimelineId,
|
||||||
) -> anyhow::Result<TimelineMetadata> {
|
) -> anyhow::Result<TimelineMetadata> {
|
||||||
let metadata_path = conf.metadata_path(timeline_id, tenant_id);
|
let metadata_path = conf.metadata_path(tenant_id, timeline_id);
|
||||||
let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
|
let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
|
||||||
format!(
|
format!(
|
||||||
"Failed to read metadata bytes from path {}",
|
"Failed to read metadata bytes from path {}",
|
||||||
|
|||||||
@@ -184,9 +184,9 @@ pub fn schedule_local_tenant_processing(
|
|||||||
format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}")
|
format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}")
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
|
||||||
anyhow::ensure!(
|
anyhow::ensure!(
|
||||||
!conf.tenant_ignore_mark_file_path(tenant_id).exists(),
|
!conf.tenant_ignore_mark_file_path(&tenant_id).exists(),
|
||||||
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
|
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -310,7 +310,7 @@ pub async fn create_tenant(
|
|||||||
// We're holding the tenants lock in write mode while doing local IO.
|
// We're holding the tenants lock in write mode while doing local IO.
|
||||||
// If this section ever becomes contentious, introduce a new `TenantState::Creating`
|
// If this section ever becomes contentious, introduce a new `TenantState::Creating`
|
||||||
// and do the work in that state.
|
// and do the work in that state.
|
||||||
let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Create)?;
|
let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create)?;
|
||||||
// TODO: tenant directory remains on disk if we bail out from here on.
|
// TODO: tenant directory remains on disk if we bail out from here on.
|
||||||
// See https://github.com/neondatabase/neon/issues/4233
|
// See https://github.com/neondatabase/neon/issues/4233
|
||||||
|
|
||||||
@@ -344,14 +344,9 @@ pub async fn set_new_tenant_config(
|
|||||||
info!("configuring tenant {tenant_id}");
|
info!("configuring tenant {tenant_id}");
|
||||||
let tenant = get_tenant(tenant_id, true).await?;
|
let tenant = get_tenant(tenant_id, true).await?;
|
||||||
|
|
||||||
let tenant_config_path = conf.tenant_config_path(tenant_id);
|
let tenant_config_path = conf.tenant_config_path(&tenant_id);
|
||||||
Tenant::persist_tenant_config(
|
Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf, false)
|
||||||
&tenant.tenant_id(),
|
.map_err(SetNewTenantConfigError::Persist)?;
|
||||||
&tenant_config_path,
|
|
||||||
new_tenant_conf,
|
|
||||||
false,
|
|
||||||
)
|
|
||||||
.map_err(SetNewTenantConfigError::Persist)?;
|
|
||||||
tenant.set_new_tenant_config(new_tenant_conf);
|
tenant.set_new_tenant_config(new_tenant_conf);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -435,7 +430,7 @@ pub async fn detach_tenant(
|
|||||||
// Ignored tenants are not present in memory and will bail the removal from memory operation.
|
// Ignored tenants are not present in memory and will bail the removal from memory operation.
|
||||||
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
|
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
|
||||||
if detach_ignored && matches!(removal_result, Err(TenantStateError::NotFound(_))) {
|
if detach_ignored && matches!(removal_result, Err(TenantStateError::NotFound(_))) {
|
||||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
|
||||||
if tenant_ignore_mark.exists() {
|
if tenant_ignore_mark.exists() {
|
||||||
info!("Detaching an ignored tenant");
|
info!("Detaching an ignored tenant");
|
||||||
local_files_cleanup_operation(tenant_id)
|
local_files_cleanup_operation(tenant_id)
|
||||||
@@ -457,7 +452,7 @@ pub async fn load_tenant(
|
|||||||
) -> Result<(), TenantMapInsertError> {
|
) -> Result<(), TenantMapInsertError> {
|
||||||
tenant_map_insert(tenant_id, || {
|
tenant_map_insert(tenant_id, || {
|
||||||
let tenant_path = conf.tenant_path(&tenant_id);
|
let tenant_path = conf.tenant_path(&tenant_id);
|
||||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
|
||||||
if tenant_ignore_mark.exists() {
|
if tenant_ignore_mark.exists() {
|
||||||
std::fs::remove_file(&tenant_ignore_mark)
|
std::fs::remove_file(&tenant_ignore_mark)
|
||||||
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
|
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
|
||||||
@@ -478,7 +473,7 @@ pub async fn ignore_tenant(
|
|||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
) -> Result<(), TenantStateError> {
|
) -> Result<(), TenantStateError> {
|
||||||
remove_tenant_from_memory(tenant_id, async {
|
remove_tenant_from_memory(tenant_id, async {
|
||||||
let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id);
|
let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id);
|
||||||
fs::File::create(&ignore_mark_file)
|
fs::File::create(&ignore_mark_file)
|
||||||
.await
|
.await
|
||||||
.context("Failed to create ignore mark file")
|
.context("Failed to create ignore mark file")
|
||||||
@@ -525,7 +520,7 @@ pub async fn attach_tenant(
|
|||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<(), TenantMapInsertError> {
|
) -> Result<(), TenantMapInsertError> {
|
||||||
tenant_map_insert(tenant_id, || {
|
tenant_map_insert(tenant_id, || {
|
||||||
let tenant_dir = create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Attach)?;
|
let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?;
|
||||||
// TODO: tenant directory remains on disk if we bail out from here on.
|
// TODO: tenant directory remains on disk if we bail out from here on.
|
||||||
// See https://github.com/neondatabase/neon/issues/4233
|
// See https://github.com/neondatabase/neon/issues/4233
|
||||||
|
|
||||||
@@ -695,7 +690,7 @@ pub async fn immediate_gc(
|
|||||||
fail::fail_point!("immediate_gc_task_pre");
|
fail::fail_point!("immediate_gc_task_pre");
|
||||||
let result = tenant
|
let result = tenant
|
||||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx)
|
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx)
|
||||||
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
.instrument(info_span!("manual_gc", %tenant_id, %timeline_id))
|
||||||
.await;
|
.await;
|
||||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||||
// better once the types support it.
|
// better once the types support it.
|
||||||
@@ -745,9 +740,7 @@ pub async fn immediate_compact(
|
|||||||
async move {
|
async move {
|
||||||
let result = timeline
|
let result = timeline
|
||||||
.compact(&ctx)
|
.compact(&ctx)
|
||||||
.instrument(
|
.instrument(info_span!("manual_compact", %tenant_id, %timeline_id))
|
||||||
info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id),
|
|
||||||
)
|
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
match task_done.send(result) {
|
match task_done.send(result) {
|
||||||
|
|||||||
@@ -135,7 +135,7 @@
|
|||||||
//! - Initiate upload queue with that [`IndexPart`].
|
//! - Initiate upload queue with that [`IndexPart`].
|
||||||
//! - Reschedule all lost operations by comparing the local filesystem state
|
//! - Reschedule all lost operations by comparing the local filesystem state
|
||||||
//! and remote state as per [`IndexPart`]. This is done in
|
//! and remote state as per [`IndexPart`]. This is done in
|
||||||
//! [`Timeline::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
|
//! [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
|
||||||
//!
|
//!
|
||||||
//! Note that if we crash during file deletion between the index update
|
//! Note that if we crash during file deletion between the index update
|
||||||
//! that removes the file from the list of files, and deleting the remote file,
|
//! that removes the file from the list of files, and deleting the remote file,
|
||||||
@@ -163,8 +163,8 @@
|
|||||||
//! - download their remote [`IndexPart`]s
|
//! - download their remote [`IndexPart`]s
|
||||||
//! - create `Timeline` struct and a `RemoteTimelineClient`
|
//! - create `Timeline` struct and a `RemoteTimelineClient`
|
||||||
//! - initialize the client's upload queue with its `IndexPart`
|
//! - initialize the client's upload queue with its `IndexPart`
|
||||||
//! - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart`
|
//! - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
|
||||||
//! but not present locally
|
//! for layers that are referenced by `IndexPart` but not present locally
|
||||||
//! - schedule uploads for layers that are only present locally.
|
//! - schedule uploads for layers that are only present locally.
|
||||||
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
|
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
|
||||||
//! the local filesystem, write the remote metadata to the local filesystem
|
//! the local filesystem, write the remote metadata to the local filesystem
|
||||||
@@ -198,6 +198,8 @@
|
|||||||
//! in remote storage.
|
//! in remote storage.
|
||||||
//! But note that we don't test any of this right now.
|
//! But note that we don't test any of this right now.
|
||||||
//!
|
//!
|
||||||
|
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
|
||||||
|
//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote
|
||||||
|
|
||||||
mod delete;
|
mod delete;
|
||||||
mod download;
|
mod download;
|
||||||
@@ -442,8 +444,8 @@ impl RemoteTimelineClient {
|
|||||||
let index_part = download::download_index_part(
|
let index_part = download::download_index_part(
|
||||||
self.conf,
|
self.conf,
|
||||||
&self.storage_impl,
|
&self.storage_impl,
|
||||||
self.tenant_id,
|
&self.tenant_id,
|
||||||
self.timeline_id,
|
&self.timeline_id,
|
||||||
)
|
)
|
||||||
.measure_remote_op(
|
.measure_remote_op(
|
||||||
self.tenant_id,
|
self.tenant_id,
|
||||||
@@ -748,25 +750,13 @@ impl RemoteTimelineClient {
|
|||||||
stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
|
stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
|
||||||
});
|
});
|
||||||
|
|
||||||
// Have a failpoint that can use the `pause` failpoint action.
|
pausable_failpoint!("persist_deleted_index_part");
|
||||||
// We don't want to block the executor thread, hence, spawn_blocking + await.
|
|
||||||
if cfg!(feature = "testing") {
|
|
||||||
tokio::task::spawn_blocking({
|
|
||||||
let current = tracing::Span::current();
|
|
||||||
move || {
|
|
||||||
let _entered = current.entered();
|
|
||||||
tracing::info!("at failpoint persist_deleted_index_part");
|
|
||||||
fail::fail_point!("persist_deleted_index_part");
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.await
|
|
||||||
.expect("spawn_blocking");
|
|
||||||
}
|
|
||||||
upload::upload_index_part(
|
upload::upload_index_part(
|
||||||
self.conf,
|
self.conf,
|
||||||
&self.storage_impl,
|
&self.storage_impl,
|
||||||
self.tenant_id,
|
&self.tenant_id,
|
||||||
self.timeline_id,
|
&self.timeline_id,
|
||||||
&index_part_with_deleted_at,
|
&index_part_with_deleted_at,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
@@ -841,7 +831,7 @@ impl RemoteTimelineClient {
|
|||||||
|
|
||||||
// Do not delete index part yet, it is needed for possible retry. If we remove it first
|
// Do not delete index part yet, it is needed for possible retry. If we remove it first
|
||||||
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
|
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
|
||||||
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
|
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
|
||||||
let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
|
let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
|
||||||
|
|
||||||
let remaining = self
|
let remaining = self
|
||||||
@@ -852,14 +842,16 @@ impl RemoteTimelineClient {
|
|||||||
let remaining: Vec<RemotePath> = remaining
|
let remaining: Vec<RemotePath> = remaining
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
|
.filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
|
||||||
|
.inspect(|path| {
|
||||||
|
if let Some(name) = path.object_name() {
|
||||||
|
info!(%name, "deleting a file not referenced from index_part.json");
|
||||||
|
} else {
|
||||||
|
warn!(%path, "deleting a nameless or non-utf8 object not referenced from index_part.json");
|
||||||
|
}
|
||||||
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
if !remaining.is_empty() {
|
if !remaining.is_empty() {
|
||||||
warn!(
|
|
||||||
"Found {} files not bound to index_file.json, proceeding with their deletion",
|
|
||||||
remaining.len()
|
|
||||||
);
|
|
||||||
warn!("About to remove {} files", remaining.len());
|
|
||||||
self.storage_impl.delete_objects(&remaining).await?;
|
self.storage_impl.delete_objects(&remaining).await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -868,7 +860,7 @@ impl RemoteTimelineClient {
|
|||||||
debug!("deleting index part");
|
debug!("deleting index part");
|
||||||
self.storage_impl.delete(&index_file_path).await?;
|
self.storage_impl.delete(&index_file_path).await?;
|
||||||
|
|
||||||
info!(deletions_queued, "done deleting, including index_part.json");
|
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -933,11 +925,11 @@ impl RemoteTimelineClient {
|
|||||||
|
|
||||||
// Assign unique ID to this task
|
// Assign unique ID to this task
|
||||||
upload_queue.task_counter += 1;
|
upload_queue.task_counter += 1;
|
||||||
let task_id = upload_queue.task_counter;
|
let upload_task_id = upload_queue.task_counter;
|
||||||
|
|
||||||
// Add it to the in-progress map
|
// Add it to the in-progress map
|
||||||
let task = Arc::new(UploadTask {
|
let task = Arc::new(UploadTask {
|
||||||
task_id,
|
task_id: upload_task_id,
|
||||||
op: next_op,
|
op: next_op,
|
||||||
retries: AtomicU32::new(0),
|
retries: AtomicU32::new(0),
|
||||||
});
|
});
|
||||||
@@ -947,6 +939,8 @@ impl RemoteTimelineClient {
|
|||||||
|
|
||||||
// Spawn task to perform the task
|
// Spawn task to perform the task
|
||||||
let self_rc = Arc::clone(self);
|
let self_rc = Arc::clone(self);
|
||||||
|
let tenant_id = self.tenant_id;
|
||||||
|
let timeline_id = self.timeline_id;
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
self.runtime.handle(),
|
self.runtime.handle(),
|
||||||
TaskKind::RemoteUploadTask,
|
TaskKind::RemoteUploadTask,
|
||||||
@@ -958,7 +952,7 @@ impl RemoteTimelineClient {
|
|||||||
self_rc.perform_upload_task(task).await;
|
self_rc.perform_upload_task(task).await;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
.instrument(info_span!(parent: None, "remote_upload", tenant = %self.tenant_id, timeline = %self.timeline_id, upload_task_id = %task_id)),
|
.instrument(info_span!(parent: None, "remote_upload", %tenant_id, %timeline_id, %upload_task_id)),
|
||||||
);
|
);
|
||||||
|
|
||||||
// Loop back to process next task
|
// Loop back to process next task
|
||||||
@@ -1003,7 +997,7 @@ impl RemoteTimelineClient {
|
|||||||
UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
|
UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
|
||||||
let path = &self
|
let path = &self
|
||||||
.conf
|
.conf
|
||||||
.timeline_path(&self.timeline_id, &self.tenant_id)
|
.timeline_path(&self.tenant_id, &self.timeline_id)
|
||||||
.join(layer_file_name.file_name());
|
.join(layer_file_name.file_name());
|
||||||
upload::upload_timeline_layer(
|
upload::upload_timeline_layer(
|
||||||
self.conf,
|
self.conf,
|
||||||
@@ -1024,8 +1018,8 @@ impl RemoteTimelineClient {
|
|||||||
let res = upload::upload_index_part(
|
let res = upload::upload_index_part(
|
||||||
self.conf,
|
self.conf,
|
||||||
&self.storage_impl,
|
&self.storage_impl,
|
||||||
self.tenant_id,
|
&self.tenant_id,
|
||||||
self.timeline_id,
|
&self.timeline_id,
|
||||||
index_part,
|
index_part,
|
||||||
)
|
)
|
||||||
.measure_remote_op(
|
.measure_remote_op(
|
||||||
@@ -1044,7 +1038,7 @@ impl RemoteTimelineClient {
|
|||||||
UploadOp::Delete(delete) => {
|
UploadOp::Delete(delete) => {
|
||||||
let path = &self
|
let path = &self
|
||||||
.conf
|
.conf
|
||||||
.timeline_path(&self.timeline_id, &self.tenant_id)
|
.timeline_path(&self.tenant_id, &self.timeline_id)
|
||||||
.join(delete.layer_file_name.file_name());
|
.join(delete.layer_file_name.file_name());
|
||||||
delete::delete_layer(self.conf, &self.storage_impl, path)
|
delete::delete_layer(self.conf, &self.storage_impl, path)
|
||||||
.measure_remote_op(
|
.measure_remote_op(
|
||||||
|
|||||||
@@ -19,9 +19,10 @@ pub(super) async fn delete_layer<'a>(
|
|||||||
|
|
||||||
let path_to_delete = conf.remote_path(local_layer_path)?;
|
let path_to_delete = conf.remote_path(local_layer_path)?;
|
||||||
|
|
||||||
// XXX: If the deletion fails because the object already didn't exist,
|
// We don't want to print an error if the delete failed if the file has
|
||||||
// it would be good to just issue a warning but consider it success.
|
// already been deleted. Thankfully, in this situation S3 already
|
||||||
// https://github.com/neondatabase/neon/issues/2934
|
// does not yield an error. While OS-provided local file system APIs do yield
|
||||||
|
// errors, we avoid them in the `LocalFs` wrapper.
|
||||||
storage.delete(&path_to_delete).await.with_context(|| {
|
storage.delete(&path_to_delete).await.with_context(|| {
|
||||||
format!("Failed to delete remote layer from storage at {path_to_delete:?}")
|
format!("Failed to delete remote layer from storage at {path_to_delete:?}")
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ pub async fn download_layer_file<'a>(
|
|||||||
) -> Result<u64, DownloadError> {
|
) -> Result<u64, DownloadError> {
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
|
let timeline_path = conf.timeline_path(&tenant_id, &timeline_id);
|
||||||
|
|
||||||
let local_path = timeline_path.join(layer_file_name.file_name());
|
let local_path = timeline_path.join(layer_file_name.file_name());
|
||||||
|
|
||||||
@@ -229,11 +229,11 @@ pub async fn list_remote_timelines<'a>(
|
|||||||
pub(super) async fn download_index_part(
|
pub(super) async fn download_index_part(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
storage: &GenericRemoteStorage,
|
storage: &GenericRemoteStorage,
|
||||||
tenant_id: TenantId,
|
tenant_id: &TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: &TimelineId,
|
||||||
) -> Result<IndexPart, DownloadError> {
|
) -> Result<IndexPart, DownloadError> {
|
||||||
let index_part_path = conf
|
let index_part_path = conf
|
||||||
.metadata_path(timeline_id, tenant_id)
|
.metadata_path(tenant_id, timeline_id)
|
||||||
.with_file_name(IndexPart::FILE_NAME);
|
.with_file_name(IndexPart::FILE_NAME);
|
||||||
let part_storage_path = conf
|
let part_storage_path = conf
|
||||||
.remote_path(&index_part_path)
|
.remote_path(&index_part_path)
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use fail::fail_point;
|
use fail::fail_point;
|
||||||
use std::path::Path;
|
use std::{io::ErrorKind, path::Path};
|
||||||
use tokio::fs;
|
use tokio::fs;
|
||||||
|
|
||||||
use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart};
|
use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart};
|
||||||
@@ -11,12 +11,14 @@ use utils::id::{TenantId, TimelineId};
|
|||||||
|
|
||||||
use super::index::LayerFileMetadata;
|
use super::index::LayerFileMetadata;
|
||||||
|
|
||||||
|
use tracing::info;
|
||||||
|
|
||||||
/// Serializes and uploads the given index part data to the remote storage.
|
/// Serializes and uploads the given index part data to the remote storage.
|
||||||
pub(super) async fn upload_index_part<'a>(
|
pub(super) async fn upload_index_part<'a>(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
storage: &'a GenericRemoteStorage,
|
storage: &'a GenericRemoteStorage,
|
||||||
tenant_id: TenantId,
|
tenant_id: &TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: &TimelineId,
|
||||||
index_part: &'a IndexPart,
|
index_part: &'a IndexPart,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
tracing::trace!("uploading new index part");
|
tracing::trace!("uploading new index part");
|
||||||
@@ -31,7 +33,7 @@ pub(super) async fn upload_index_part<'a>(
|
|||||||
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
|
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
|
||||||
|
|
||||||
let index_part_path = conf
|
let index_part_path = conf
|
||||||
.metadata_path(timeline_id, tenant_id)
|
.metadata_path(tenant_id, timeline_id)
|
||||||
.with_file_name(IndexPart::FILE_NAME);
|
.with_file_name(IndexPart::FILE_NAME);
|
||||||
let storage_path = conf.remote_path(&index_part_path)?;
|
let storage_path = conf.remote_path(&index_part_path)?;
|
||||||
|
|
||||||
@@ -56,9 +58,21 @@ pub(super) async fn upload_timeline_layer<'a>(
|
|||||||
});
|
});
|
||||||
let storage_path = conf.remote_path(source_path)?;
|
let storage_path = conf.remote_path(source_path)?;
|
||||||
|
|
||||||
let source_file = fs::File::open(&source_path)
|
let source_file_res = fs::File::open(&source_path).await;
|
||||||
.await
|
let source_file = match source_file_res {
|
||||||
.with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?;
|
Ok(source_file) => source_file,
|
||||||
|
Err(e) if e.kind() == ErrorKind::NotFound => {
|
||||||
|
// If we encounter this arm, it wasn't intended, but it's also not
|
||||||
|
// a big problem, if it's because the file was deleted before an
|
||||||
|
// upload. However, a nonexistent file can also be indicative of
|
||||||
|
// something worse, like when a file is scheduled for upload before
|
||||||
|
// it has been written to disk yet.
|
||||||
|
info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
Err(e) => Err(e)
|
||||||
|
.with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?,
|
||||||
|
};
|
||||||
|
|
||||||
let fs_size = source_file
|
let fs_size = source_file
|
||||||
.metadata()
|
.metadata()
|
||||||
|
|||||||
@@ -110,11 +110,11 @@ pub struct TimelineInputs {
|
|||||||
///
|
///
|
||||||
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
|
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
|
||||||
/// is updated on-demand, during the start of this calculation and separate from the
|
/// is updated on-demand, during the start of this calculation and separate from the
|
||||||
/// [`Timeline::latest_gc_cutoff`].
|
/// [`TimelineInputs::latest_gc_cutoff`].
|
||||||
///
|
///
|
||||||
/// For timelines in general:
|
/// For timelines in general:
|
||||||
///
|
///
|
||||||
/// ```ignore
|
/// ```text
|
||||||
/// 0-----|---------|----|------------| · · · · · |·> lsn
|
/// 0-----|---------|----|------------| · · · · · |·> lsn
|
||||||
/// initdb_lsn branchpoints* next_gc_cutoff latest
|
/// initdb_lsn branchpoints* next_gc_cutoff latest
|
||||||
/// ```
|
/// ```
|
||||||
|
|||||||
@@ -5,16 +5,13 @@ use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
|
|||||||
pub(crate) fn debug_assert_current_span_has_tenant_id() {}
|
pub(crate) fn debug_assert_current_span_has_tenant_id() {}
|
||||||
|
|
||||||
#[cfg(debug_assertions)]
|
#[cfg(debug_assertions)]
|
||||||
pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<2>> =
|
pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<1>> =
|
||||||
once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]));
|
once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id"]));
|
||||||
|
|
||||||
#[cfg(debug_assertions)]
|
#[cfg(debug_assertions)]
|
||||||
#[track_caller]
|
#[track_caller]
|
||||||
pub(crate) fn debug_assert_current_span_has_tenant_id() {
|
pub(crate) fn debug_assert_current_span_has_tenant_id() {
|
||||||
if let Err(missing) = check_fields_present([&*TENANT_ID_EXTRACTOR]) {
|
if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) {
|
||||||
panic!(
|
panic!("missing extractors: {missing:?}")
|
||||||
"missing extractors: {:?}",
|
|
||||||
missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ pub use inmemory_layer::InMemoryLayer;
|
|||||||
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
||||||
pub use remote_layer::RemoteLayer;
|
pub use remote_layer::RemoteLayer;
|
||||||
|
|
||||||
use super::layer_map::BatchedUpdates;
|
use super::timeline::layer_manager::LayerManager;
|
||||||
|
|
||||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||||
where
|
where
|
||||||
@@ -54,13 +54,6 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn range_eq<T>(a: &Range<T>, b: &Range<T>) -> bool
|
|
||||||
where
|
|
||||||
T: PartialEq<T>,
|
|
||||||
{
|
|
||||||
a.start == b.start && a.end == b.end
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Struct used to communicate across calls to 'get_value_reconstruct_data'.
|
/// Struct used to communicate across calls to 'get_value_reconstruct_data'.
|
||||||
///
|
///
|
||||||
/// Before first call, you can fill in 'page_img' if you have an older cached
|
/// Before first call, you can fill in 'page_img' if you have an older cached
|
||||||
@@ -169,6 +162,9 @@ impl LayerAccessStats {
|
|||||||
/// The caller is responsible for recording a residence event
|
/// The caller is responsible for recording a residence event
|
||||||
/// using [`record_residence_event`] before calling `latest_activity`.
|
/// using [`record_residence_event`] before calling `latest_activity`.
|
||||||
/// If they don't, [`latest_activity`] will return `None`.
|
/// If they don't, [`latest_activity`] will return `None`.
|
||||||
|
///
|
||||||
|
/// [`record_residence_event`]: Self::record_residence_event
|
||||||
|
/// [`latest_activity`]: Self::latest_activity
|
||||||
pub(crate) fn empty_will_record_residence_event_later() -> Self {
|
pub(crate) fn empty_will_record_residence_event_later() -> Self {
|
||||||
LayerAccessStats(Mutex::default())
|
LayerAccessStats(Mutex::default())
|
||||||
}
|
}
|
||||||
@@ -176,8 +172,11 @@ impl LayerAccessStats {
|
|||||||
/// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
|
/// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
|
||||||
///
|
///
|
||||||
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
||||||
|
///
|
||||||
|
/// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
|
||||||
|
/// [`record_residence_event`]: Self::record_residence_event
|
||||||
pub(crate) fn for_loading_layer(
|
pub(crate) fn for_loading_layer(
|
||||||
layer_map_lock_held_witness: &BatchedUpdates<'_>,
|
layer_map_lock_held_witness: &LayerManager,
|
||||||
status: LayerResidenceStatus,
|
status: LayerResidenceStatus,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
||||||
@@ -194,9 +193,11 @@ impl LayerAccessStats {
|
|||||||
/// The `new_status` is not recorded in `self`.
|
/// The `new_status` is not recorded in `self`.
|
||||||
///
|
///
|
||||||
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
||||||
|
///
|
||||||
|
/// [`record_residence_event`]: Self::record_residence_event
|
||||||
pub(crate) fn clone_for_residence_change(
|
pub(crate) fn clone_for_residence_change(
|
||||||
&self,
|
&self,
|
||||||
layer_map_lock_held_witness: &BatchedUpdates<'_>,
|
layer_map_lock_held_witness: &LayerManager,
|
||||||
new_status: LayerResidenceStatus,
|
new_status: LayerResidenceStatus,
|
||||||
) -> LayerAccessStats {
|
) -> LayerAccessStats {
|
||||||
let clone = {
|
let clone = {
|
||||||
@@ -228,7 +229,7 @@ impl LayerAccessStats {
|
|||||||
///
|
///
|
||||||
pub(crate) fn record_residence_event(
|
pub(crate) fn record_residence_event(
|
||||||
&self,
|
&self,
|
||||||
_layer_map_lock_held_witness: &BatchedUpdates<'_>,
|
_layer_map_lock_held_witness: &LayerManager,
|
||||||
status: LayerResidenceStatus,
|
status: LayerResidenceStatus,
|
||||||
reason: LayerResidenceEventReason,
|
reason: LayerResidenceEventReason,
|
||||||
) {
|
) {
|
||||||
@@ -301,11 +302,13 @@ impl LayerAccessStats {
|
|||||||
/// implementation error. This function logs a rate-limited warning in that case.
|
/// implementation error. This function logs a rate-limited warning in that case.
|
||||||
///
|
///
|
||||||
/// TODO: use type system to avoid the need for `fallback`.
|
/// TODO: use type system to avoid the need for `fallback`.
|
||||||
/// The approach in https://github.com/neondatabase/neon/pull/3775
|
/// The approach in <https://github.com/neondatabase/neon/pull/3775>
|
||||||
/// could be used to enforce that a residence event is recorded
|
/// could be used to enforce that a residence event is recorded
|
||||||
/// before a layer is added to the layer map. We could also have
|
/// before a layer is added to the layer map. We could also have
|
||||||
/// a layer wrapper type that holds the LayerAccessStats, and ensure
|
/// a layer wrapper type that holds the LayerAccessStats, and ensure
|
||||||
/// that that type can only be produced by inserting into the layer map.
|
/// that that type can only be produced by inserting into the layer map.
|
||||||
|
///
|
||||||
|
/// [`record_residence_event`]: Self::record_residence_event
|
||||||
pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
|
pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
|
||||||
let locked = self.0.lock().unwrap();
|
let locked = self.0.lock().unwrap();
|
||||||
let inner = &locked.for_eviction_policy;
|
let inner = &locked.for_eviction_policy;
|
||||||
@@ -330,7 +333,7 @@ impl LayerAccessStats {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
|
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
|
||||||
/// required by [`LayerMap`].
|
/// required by [`LayerMap`](super::layer_map::LayerMap).
|
||||||
///
|
///
|
||||||
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
|
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
|
||||||
/// timeline names, because those are known in the context of which the layers
|
/// timeline names, because those are known in the context of which the layers
|
||||||
@@ -377,12 +380,18 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
|
|||||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
|
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returned by [`Layer::iter`]
|
/// Returned by [`PersistentLayer::iter`]
|
||||||
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
|
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
|
||||||
|
|
||||||
/// Returned by [`Layer::key_iter`]
|
/// Returned by [`PersistentLayer::key_iter`]
|
||||||
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
|
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
|
||||||
|
|
||||||
|
/// Get a layer descriptor from a layer.
|
||||||
|
pub trait AsLayerDesc {
|
||||||
|
/// Get the layer descriptor.
|
||||||
|
fn layer_desc(&self) -> &PersistentLayerDesc;
|
||||||
|
}
|
||||||
|
|
||||||
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
||||||
/// range of LSNs.
|
/// range of LSNs.
|
||||||
///
|
///
|
||||||
@@ -396,10 +405,8 @@ pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send
|
|||||||
/// A delta layer contains all modifications within a range of LSNs and keys.
|
/// A delta layer contains all modifications within a range of LSNs and keys.
|
||||||
/// An image layer is a snapshot of all the data in a key-range, at a single
|
/// An image layer is a snapshot of all the data in a key-range, at a single
|
||||||
/// LSN.
|
/// LSN.
|
||||||
pub trait PersistentLayer: Layer {
|
pub trait PersistentLayer: Layer + AsLayerDesc {
|
||||||
/// Get the layer descriptor.
|
/// Identify the tenant this layer belongs to
|
||||||
fn layer_desc(&self) -> &PersistentLayerDesc;
|
|
||||||
|
|
||||||
fn get_tenant_id(&self) -> TenantId {
|
fn get_tenant_id(&self) -> TenantId {
|
||||||
self.layer_desc().tenant_id
|
self.layer_desc().tenant_id
|
||||||
}
|
}
|
||||||
@@ -465,119 +472,32 @@ pub fn downcast_remote_layer(
|
|||||||
pub mod tests {
|
pub mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
/// Holds metadata about a layer without any content. Used mostly for testing.
|
impl From<DeltaFileName> for PersistentLayerDesc {
|
||||||
///
|
|
||||||
/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
|
|
||||||
/// LayerDescriptor.
|
|
||||||
#[derive(Clone, Debug)]
|
|
||||||
pub struct LayerDescriptor {
|
|
||||||
base: PersistentLayerDesc,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<PersistentLayerDesc> for LayerDescriptor {
|
|
||||||
fn from(base: PersistentLayerDesc) -> Self {
|
|
||||||
Self { base }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Layer for LayerDescriptor {
|
|
||||||
fn get_value_reconstruct_data(
|
|
||||||
&self,
|
|
||||||
_key: Key,
|
|
||||||
_lsn_range: Range<Lsn>,
|
|
||||||
_reconstruct_data: &mut ValueReconstructState,
|
|
||||||
_ctx: &RequestContext,
|
|
||||||
) -> Result<ValueReconstructResult> {
|
|
||||||
todo!("This method shouldn't be part of the Layer trait")
|
|
||||||
}
|
|
||||||
|
|
||||||
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
|
||||||
todo!()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
|
||||||
fn get_key_range(&self) -> Range<Key> {
|
|
||||||
self.layer_desc().key_range.clone()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
|
||||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
|
||||||
self.layer_desc().lsn_range.clone()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
|
||||||
fn is_incremental(&self) -> bool {
|
|
||||||
self.layer_desc().is_incremental
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
|
||||||
impl std::fmt::Display for LayerDescriptor {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
write!(f, "{}", self.layer_desc().short_id())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PersistentLayer for LayerDescriptor {
|
|
||||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
|
||||||
&self.base
|
|
||||||
}
|
|
||||||
|
|
||||||
fn local_path(&self) -> Option<PathBuf> {
|
|
||||||
unimplemented!()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn iter(&self, _: &RequestContext) -> Result<LayerIter<'_>> {
|
|
||||||
unimplemented!()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn key_iter(&self, _: &RequestContext) -> Result<LayerKeyIter<'_>> {
|
|
||||||
unimplemented!()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
|
||||||
unimplemented!()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn info(&self, _: LayerAccessStatsReset) -> HistoricLayerInfo {
|
|
||||||
unimplemented!()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn access_stats(&self) -> &LayerAccessStats {
|
|
||||||
unimplemented!()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<DeltaFileName> for LayerDescriptor {
|
|
||||||
fn from(value: DeltaFileName) -> Self {
|
fn from(value: DeltaFileName) -> Self {
|
||||||
LayerDescriptor {
|
PersistentLayerDesc::new_delta(
|
||||||
base: PersistentLayerDesc::new_delta(
|
TenantId::from_array([0; 16]),
|
||||||
TenantId::from_array([0; 16]),
|
TimelineId::from_array([0; 16]),
|
||||||
TimelineId::from_array([0; 16]),
|
value.key_range,
|
||||||
value.key_range,
|
value.lsn_range,
|
||||||
value.lsn_range,
|
233,
|
||||||
233,
|
)
|
||||||
),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<ImageFileName> for LayerDescriptor {
|
impl From<ImageFileName> for PersistentLayerDesc {
|
||||||
fn from(value: ImageFileName) -> Self {
|
fn from(value: ImageFileName) -> Self {
|
||||||
LayerDescriptor {
|
PersistentLayerDesc::new_img(
|
||||||
base: PersistentLayerDesc::new_img(
|
TenantId::from_array([0; 16]),
|
||||||
TenantId::from_array([0; 16]),
|
TimelineId::from_array([0; 16]),
|
||||||
TimelineId::from_array([0; 16]),
|
value.key_range,
|
||||||
value.key_range,
|
value.lsn,
|
||||||
value.lsn,
|
false,
|
||||||
false,
|
233,
|
||||||
233,
|
)
|
||||||
),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<LayerFileName> for LayerDescriptor {
|
impl From<LayerFileName> for PersistentLayerDesc {
|
||||||
fn from(value: LayerFileName) -> Self {
|
fn from(value: LayerFileName) -> Self {
|
||||||
match value {
|
match value {
|
||||||
LayerFileName::Delta(d) => Self::from(d),
|
LayerFileName::Delta(d) => Self::from(d),
|
||||||
|
|||||||
@@ -7,14 +7,18 @@
|
|||||||
//! must be page images or WAL records with the 'will_init' flag set, so that
|
//! must be page images or WAL records with the 'will_init' flag set, so that
|
||||||
//! they can be replayed without referring to an older page version.
|
//! they can be replayed without referring to an older page version.
|
||||||
//!
|
//!
|
||||||
//! The delta files are stored in timelines/<timeline_id> directory. Currently,
|
//! The delta files are stored in `timelines/<timeline_id>` directory. Currently,
|
||||||
//! there are no subdirectories, and each delta file is named like this:
|
//! there are no subdirectories, and each delta file is named like this:
|
||||||
//!
|
//!
|
||||||
//! <key start>-<key end>__<start LSN>-<end LSN
|
//! ```text
|
||||||
|
//! <key start>-<key end>__<start LSN>-<end LSN>
|
||||||
|
//! ```
|
||||||
//!
|
//!
|
||||||
//! For example:
|
//! For example:
|
||||||
//!
|
//!
|
||||||
|
//! ```text
|
||||||
//! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
|
//! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
|
||||||
|
//! ```
|
||||||
//!
|
//!
|
||||||
//! Every delta file consists of three parts: "summary", "index", and
|
//! Every delta file consists of three parts: "summary", "index", and
|
||||||
//! "values". The summary is a fixed size header at the beginning of the file,
|
//! "values". The summary is a fixed size header at the beginning of the file,
|
||||||
@@ -56,8 +60,8 @@ use utils::{
|
|||||||
};
|
};
|
||||||
|
|
||||||
use super::{
|
use super::{
|
||||||
DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
|
AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
|
||||||
PathOrConf, PersistentLayerDesc,
|
LayerKeyIter, PathOrConf, PersistentLayerDesc,
|
||||||
};
|
};
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -403,11 +407,13 @@ impl std::fmt::Display for DeltaLayer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PersistentLayer for DeltaLayer {
|
impl AsLayerDesc for DeltaLayer {
|
||||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||||
&self.desc
|
&self.desc
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PersistentLayer for DeltaLayer {
|
||||||
fn local_path(&self) -> Option<PathBuf> {
|
fn local_path(&self) -> Option<PathBuf> {
|
||||||
Some(self.path())
|
Some(self.path())
|
||||||
}
|
}
|
||||||
@@ -459,22 +465,22 @@ impl PersistentLayer for DeltaLayer {
|
|||||||
impl DeltaLayer {
|
impl DeltaLayer {
|
||||||
fn path_for(
|
fn path_for(
|
||||||
path_or_conf: &PathOrConf,
|
path_or_conf: &PathOrConf,
|
||||||
timeline_id: TimelineId,
|
tenant_id: &TenantId,
|
||||||
tenant_id: TenantId,
|
timeline_id: &TimelineId,
|
||||||
fname: &DeltaFileName,
|
fname: &DeltaFileName,
|
||||||
) -> PathBuf {
|
) -> PathBuf {
|
||||||
match path_or_conf {
|
match path_or_conf {
|
||||||
PathOrConf::Path(path) => path.clone(),
|
PathOrConf::Path(path) => path.clone(),
|
||||||
PathOrConf::Conf(conf) => conf
|
PathOrConf::Conf(conf) => conf
|
||||||
.timeline_path(&timeline_id, &tenant_id)
|
.timeline_path(tenant_id, timeline_id)
|
||||||
.join(fname.to_string()),
|
.join(fname.to_string()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn temp_path_for(
|
fn temp_path_for(
|
||||||
conf: &PageServerConf,
|
conf: &PageServerConf,
|
||||||
timeline_id: TimelineId,
|
tenant_id: &TenantId,
|
||||||
tenant_id: TenantId,
|
timeline_id: &TimelineId,
|
||||||
key_start: Key,
|
key_start: Key,
|
||||||
lsn_range: &Range<Lsn>,
|
lsn_range: &Range<Lsn>,
|
||||||
) -> PathBuf {
|
) -> PathBuf {
|
||||||
@@ -484,7 +490,7 @@ impl DeltaLayer {
|
|||||||
.map(char::from)
|
.map(char::from)
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
conf.timeline_path(&timeline_id, &tenant_id).join(format!(
|
conf.timeline_path(tenant_id, timeline_id).join(format!(
|
||||||
"{}-XXX__{:016X}-{:016X}.{}.{}",
|
"{}-XXX__{:016X}-{:016X}.{}.{}",
|
||||||
key_start,
|
key_start,
|
||||||
u64::from(lsn_range.start),
|
u64::from(lsn_range.start),
|
||||||
@@ -606,8 +612,8 @@ impl DeltaLayer {
|
|||||||
pub fn path(&self) -> PathBuf {
|
pub fn path(&self) -> PathBuf {
|
||||||
Self::path_for(
|
Self::path_for(
|
||||||
&self.path_or_conf,
|
&self.path_or_conf,
|
||||||
self.desc.timeline_id,
|
&self.desc.tenant_id,
|
||||||
self.desc.tenant_id,
|
&self.desc.timeline_id,
|
||||||
&self.layer_name(),
|
&self.layer_name(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -655,7 +661,7 @@ impl DeltaLayerWriterInner {
|
|||||||
//
|
//
|
||||||
// Note: This overwrites any existing file. There shouldn't be any.
|
// Note: This overwrites any existing file. There shouldn't be any.
|
||||||
// FIXME: throw an error instead?
|
// FIXME: throw an error instead?
|
||||||
let path = DeltaLayer::temp_path_for(conf, timeline_id, tenant_id, key_start, &lsn_range);
|
let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range);
|
||||||
|
|
||||||
let mut file = VirtualFile::create(&path)?;
|
let mut file = VirtualFile::create(&path)?;
|
||||||
// make room for the header block
|
// make room for the header block
|
||||||
@@ -770,8 +776,8 @@ impl DeltaLayerWriterInner {
|
|||||||
// FIXME: throw an error instead?
|
// FIXME: throw an error instead?
|
||||||
let final_path = DeltaLayer::path_for(
|
let final_path = DeltaLayer::path_for(
|
||||||
&PathOrConf::Conf(self.conf),
|
&PathOrConf::Conf(self.conf),
|
||||||
self.timeline_id,
|
&self.tenant_id,
|
||||||
self.tenant_id,
|
&self.timeline_id,
|
||||||
&DeltaFileName {
|
&DeltaFileName {
|
||||||
key_range: self.key_start..key_end,
|
key_range: self.key_start..key_end,
|
||||||
lsn_range: self.lsn_range,
|
lsn_range: self.lsn_range,
|
||||||
@@ -798,7 +804,7 @@ impl DeltaLayerWriterInner {
|
|||||||
///
|
///
|
||||||
/// # Note
|
/// # Note
|
||||||
///
|
///
|
||||||
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
|
/// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
|
||||||
/// possible for the writer to drop before `finish` is actually called. So this
|
/// possible for the writer to drop before `finish` is actually called. So this
|
||||||
/// could lead to odd temporary files in the directory, exhausting file system.
|
/// could lead to odd temporary files in the directory, exhausting file system.
|
||||||
/// This structure wraps `DeltaLayerWriterInner` and also contains `Drop`
|
/// This structure wraps `DeltaLayerWriterInner` and also contains `Drop`
|
||||||
|
|||||||
@@ -57,8 +57,9 @@ impl Ord for DeltaFileName {
|
|||||||
|
|
||||||
/// Represents the filename of a DeltaLayer
|
/// Represents the filename of a DeltaLayer
|
||||||
///
|
///
|
||||||
|
/// ```text
|
||||||
/// <key start>-<key end>__<LSN start>-<LSN end>
|
/// <key start>-<key end>__<LSN start>-<LSN end>
|
||||||
///
|
/// ```
|
||||||
impl DeltaFileName {
|
impl DeltaFileName {
|
||||||
///
|
///
|
||||||
/// Parse a string as a delta file name. Returns None if the filename does not
|
/// Parse a string as a delta file name. Returns None if the filename does not
|
||||||
@@ -162,7 +163,9 @@ impl ImageFileName {
|
|||||||
///
|
///
|
||||||
/// Represents the filename of an ImageLayer
|
/// Represents the filename of an ImageLayer
|
||||||
///
|
///
|
||||||
|
/// ```text
|
||||||
/// <key start>-<key end>__<LSN>
|
/// <key start>-<key end>__<LSN>
|
||||||
|
/// ```
|
||||||
impl ImageFileName {
|
impl ImageFileName {
|
||||||
///
|
///
|
||||||
/// Parse a string as an image file name. Returns None if the filename does not
|
/// Parse a string as an image file name. Returns None if the filename does not
|
||||||
|
|||||||
@@ -7,11 +7,15 @@
|
|||||||
//! timelines/<timeline_id> directory. Currently, there are no
|
//! timelines/<timeline_id> directory. Currently, there are no
|
||||||
//! subdirectories, and each image layer file is named like this:
|
//! subdirectories, and each image layer file is named like this:
|
||||||
//!
|
//!
|
||||||
|
//! ```text
|
||||||
//! <key start>-<key end>__<LSN>
|
//! <key start>-<key end>__<LSN>
|
||||||
|
//! ```
|
||||||
//!
|
//!
|
||||||
//! For example:
|
//! For example:
|
||||||
//!
|
//!
|
||||||
|
//! ```text
|
||||||
//! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
|
//! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
|
||||||
|
//! ```
|
||||||
//!
|
//!
|
||||||
//! Every image layer file consists of three parts: "summary",
|
//! Every image layer file consists of three parts: "summary",
|
||||||
//! "index", and "values". The summary is a fixed size header at the
|
//! "index", and "values". The summary is a fixed size header at the
|
||||||
@@ -53,7 +57,9 @@ use utils::{
|
|||||||
};
|
};
|
||||||
|
|
||||||
use super::filename::ImageFileName;
|
use super::filename::ImageFileName;
|
||||||
use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc};
|
use super::{
|
||||||
|
AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc,
|
||||||
|
};
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Header stored in the beginning of the file
|
/// Header stored in the beginning of the file
|
||||||
@@ -241,11 +247,13 @@ impl std::fmt::Display for ImageLayer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PersistentLayer for ImageLayer {
|
impl AsLayerDesc for ImageLayer {
|
||||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||||
&self.desc
|
&self.desc
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PersistentLayer for ImageLayer {
|
||||||
fn local_path(&self) -> Option<PathBuf> {
|
fn local_path(&self) -> Option<PathBuf> {
|
||||||
Some(self.path())
|
Some(self.path())
|
||||||
}
|
}
|
||||||
@@ -288,7 +296,7 @@ impl ImageLayer {
|
|||||||
match path_or_conf {
|
match path_or_conf {
|
||||||
PathOrConf::Path(path) => path.to_path_buf(),
|
PathOrConf::Path(path) => path.to_path_buf(),
|
||||||
PathOrConf::Conf(conf) => conf
|
PathOrConf::Conf(conf) => conf
|
||||||
.timeline_path(&timeline_id, &tenant_id)
|
.timeline_path(&tenant_id, &timeline_id)
|
||||||
.join(fname.to_string()),
|
.join(fname.to_string()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -305,7 +313,7 @@ impl ImageLayer {
|
|||||||
.map(char::from)
|
.map(char::from)
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
conf.timeline_path(&timeline_id, &tenant_id)
|
conf.timeline_path(&tenant_id, &timeline_id)
|
||||||
.join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
|
.join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -656,7 +664,7 @@ impl ImageLayerWriterInner {
|
|||||||
///
|
///
|
||||||
/// # Note
|
/// # Note
|
||||||
///
|
///
|
||||||
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
|
/// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
|
||||||
/// possible for the writer to drop before `finish` is actually called. So this
|
/// possible for the writer to drop before `finish` is actually called. So this
|
||||||
/// could lead to odd temporary files in the directory, exhausting file system.
|
/// could lead to odd temporary files in the directory, exhausting file system.
|
||||||
/// This structure wraps `ImageLayerWriterInner` and also contains `Drop`
|
/// This structure wraps `ImageLayerWriterInner` and also contains `Drop`
|
||||||
|
|||||||
@@ -4,9 +4,9 @@
|
|||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
use crate::context::RequestContext;
|
use crate::context::RequestContext;
|
||||||
use crate::repository::Key;
|
use crate::repository::Key;
|
||||||
use crate::tenant::layer_map::BatchedUpdates;
|
|
||||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||||
|
use crate::tenant::timeline::layer_manager::LayerManager;
|
||||||
use anyhow::{bail, Result};
|
use anyhow::{bail, Result};
|
||||||
use pageserver_api::models::HistoricLayerInfo;
|
use pageserver_api::models::HistoricLayerInfo;
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
@@ -20,12 +20,12 @@ use utils::{
|
|||||||
|
|
||||||
use super::filename::{DeltaFileName, ImageFileName};
|
use super::filename::{DeltaFileName, ImageFileName};
|
||||||
use super::{
|
use super::{
|
||||||
DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
|
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
|
||||||
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
|
LayerKeyIter, LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
|
||||||
};
|
};
|
||||||
|
|
||||||
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
|
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
|
||||||
/// [`crate::storage_layer::DeltaLayer`].
|
/// [`DeltaLayer`](super::DeltaLayer).
|
||||||
///
|
///
|
||||||
/// RemoteLayer might be downloaded on-demand during operations which are
|
/// RemoteLayer might be downloaded on-demand during operations which are
|
||||||
/// allowed download remote layers and during which, it gets replaced with a
|
/// allowed download remote layers and during which, it gets replaced with a
|
||||||
@@ -50,6 +50,8 @@ pub struct RemoteLayer {
|
|||||||
/// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
|
/// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
|
||||||
/// a possible fast loop between `Timeline::get_reconstruct_data` and
|
/// a possible fast loop between `Timeline::get_reconstruct_data` and
|
||||||
/// `Timeline::download_remote_layer`, which also logs.
|
/// `Timeline::download_remote_layer`, which also logs.
|
||||||
|
///
|
||||||
|
/// [`ongoing_download`]: Self::ongoing_download
|
||||||
pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
|
pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -115,11 +117,13 @@ impl std::fmt::Display for RemoteLayer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PersistentLayer for RemoteLayer {
|
impl AsLayerDesc for RemoteLayer {
|
||||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||||
&self.desc
|
&self.desc
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PersistentLayer for RemoteLayer {
|
||||||
fn local_path(&self) -> Option<PathBuf> {
|
fn local_path(&self) -> Option<PathBuf> {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
@@ -222,7 +226,7 @@ impl RemoteLayer {
|
|||||||
/// Create a Layer struct representing this layer, after it has been downloaded.
|
/// Create a Layer struct representing this layer, after it has been downloaded.
|
||||||
pub fn create_downloaded_layer(
|
pub fn create_downloaded_layer(
|
||||||
&self,
|
&self,
|
||||||
layer_map_lock_held_witness: &BatchedUpdates<'_>,
|
layer_map_lock_held_witness: &LayerManager,
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
file_size: u64,
|
file_size: u64,
|
||||||
) -> Arc<dyn PersistentLayer> {
|
) -> Arc<dyn PersistentLayer> {
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
//!
|
|
||||||
|
|
||||||
mod eviction_task;
|
mod eviction_task;
|
||||||
|
pub mod layer_manager;
|
||||||
mod logical_size;
|
mod logical_size;
|
||||||
pub mod span;
|
pub mod span;
|
||||||
pub mod uninit;
|
pub mod uninit;
|
||||||
@@ -82,15 +81,15 @@ use crate::{is_temporary, task_mgr};
|
|||||||
|
|
||||||
pub(super) use self::eviction_task::EvictionTaskTenantState;
|
pub(super) use self::eviction_task::EvictionTaskTenantState;
|
||||||
use self::eviction_task::EvictionTaskTimelineState;
|
use self::eviction_task::EvictionTaskTimelineState;
|
||||||
|
use self::layer_manager::LayerManager;
|
||||||
use self::logical_size::LogicalSize;
|
use self::logical_size::LogicalSize;
|
||||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||||
|
|
||||||
use super::config::TenantConf;
|
use super::config::TenantConf;
|
||||||
use super::layer_map::BatchedUpdates;
|
|
||||||
use super::remote_timeline_client::index::IndexPart;
|
use super::remote_timeline_client::index::IndexPart;
|
||||||
use super::remote_timeline_client::RemoteTimelineClient;
|
use super::remote_timeline_client::RemoteTimelineClient;
|
||||||
use super::storage_layer::{
|
use super::storage_layer::{
|
||||||
DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc, PersistentLayerKey,
|
AsLayerDesc, DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc,
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||||
@@ -124,80 +123,6 @@ impl PartialOrd for Hole {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct LayerFileManager(HashMap<PersistentLayerKey, Arc<dyn PersistentLayer>>);
|
|
||||||
|
|
||||||
impl LayerFileManager {
|
|
||||||
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
|
|
||||||
// The assumption for the `expect()` is that all code maintains the following invariant:
|
|
||||||
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
|
|
||||||
self.0
|
|
||||||
.get(&desc.key())
|
|
||||||
.with_context(|| format!("get layer from desc: {}", desc.filename()))
|
|
||||||
.expect("not found")
|
|
||||||
.clone()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn insert(&mut self, layer: Arc<dyn PersistentLayer>) {
|
|
||||||
let present = self.0.insert(layer.layer_desc().key(), layer.clone());
|
|
||||||
if present.is_some() && cfg!(debug_assertions) {
|
|
||||||
panic!("overwriting a layer: {:?}", layer.layer_desc())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn new() -> Self {
|
|
||||||
Self(HashMap::new())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn remove(&mut self, layer: Arc<dyn PersistentLayer>) {
|
|
||||||
let present = self.0.remove(&layer.layer_desc().key());
|
|
||||||
if present.is_none() && cfg!(debug_assertions) {
|
|
||||||
panic!(
|
|
||||||
"removing layer that is not present in layer mapping: {:?}",
|
|
||||||
layer.layer_desc()
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn replace_and_verify(
|
|
||||||
&mut self,
|
|
||||||
expected: Arc<dyn PersistentLayer>,
|
|
||||||
new: Arc<dyn PersistentLayer>,
|
|
||||||
) -> Result<()> {
|
|
||||||
let key = expected.layer_desc().key();
|
|
||||||
let other = new.layer_desc().key();
|
|
||||||
|
|
||||||
let expected_l0 = LayerMap::is_l0(expected.layer_desc());
|
|
||||||
let new_l0 = LayerMap::is_l0(new.layer_desc());
|
|
||||||
|
|
||||||
fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
|
|
||||||
"layermap-replace-notfound"
|
|
||||||
));
|
|
||||||
|
|
||||||
anyhow::ensure!(
|
|
||||||
key == other,
|
|
||||||
"expected and new layer have different keys: {key:?} != {other:?}"
|
|
||||||
);
|
|
||||||
|
|
||||||
anyhow::ensure!(
|
|
||||||
expected_l0 == new_l0,
|
|
||||||
"one layer is l0 while the other is not: {expected_l0} != {new_l0}"
|
|
||||||
);
|
|
||||||
|
|
||||||
if let Some(layer) = self.0.get_mut(&expected.layer_desc().key()) {
|
|
||||||
anyhow::ensure!(
|
|
||||||
compare_arced_layers(&expected, layer),
|
|
||||||
"another layer was found instead of expected, expected={expected:?}, new={new:?}",
|
|
||||||
expected = Arc::as_ptr(&expected),
|
|
||||||
new = Arc::as_ptr(layer),
|
|
||||||
);
|
|
||||||
*layer = new;
|
|
||||||
Ok(())
|
|
||||||
} else {
|
|
||||||
anyhow::bail!("layer was not found");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
|
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
|
||||||
/// Can be removed after all refactors are done.
|
/// Can be removed after all refactors are done.
|
||||||
fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
|
fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
|
||||||
@@ -209,7 +134,6 @@ fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
|
|||||||
fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
||||||
drop(rlock)
|
drop(rlock)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Timeline {
|
pub struct Timeline {
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
||||||
@@ -238,7 +162,7 @@ pub struct Timeline {
|
|||||||
///
|
///
|
||||||
/// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
|
/// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
|
||||||
/// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
|
/// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
|
||||||
pub(crate) layers: Arc<tokio::sync::RwLock<(LayerMap, LayerFileManager)>>,
|
pub(crate) layers: Arc<tokio::sync::RwLock<LayerManager>>,
|
||||||
|
|
||||||
/// Set of key ranges which should be covered by image layers to
|
/// Set of key ranges which should be covered by image layers to
|
||||||
/// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
|
/// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
|
||||||
@@ -259,7 +183,7 @@ pub struct Timeline {
|
|||||||
walredo_mgr: Arc<dyn WalRedoManager + Sync + Send>,
|
walredo_mgr: Arc<dyn WalRedoManager + Sync + Send>,
|
||||||
|
|
||||||
/// Remote storage client.
|
/// Remote storage client.
|
||||||
/// See [`storage_sync`] module comment for details.
|
/// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
|
||||||
pub remote_client: Option<Arc<RemoteTimelineClient>>,
|
pub remote_client: Option<Arc<RemoteTimelineClient>>,
|
||||||
|
|
||||||
// What page versions do we hold in the repository? If we get a
|
// What page versions do we hold in the repository? If we get a
|
||||||
@@ -316,6 +240,8 @@ pub struct Timeline {
|
|||||||
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
|
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
|
||||||
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
|
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
|
||||||
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
|
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
|
||||||
|
///
|
||||||
|
/// [`Tenant::delete_timeline`]: super::Tenant::delete_timeline
|
||||||
pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
|
pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
|
||||||
|
|
||||||
// Needed to ensure that we can't create a branch at a point that was already garbage collected
|
// Needed to ensure that we can't create a branch at a point that was already garbage collected
|
||||||
@@ -589,7 +515,7 @@ impl Timeline {
|
|||||||
/// Hence, the result **does not represent local filesystem usage**.
|
/// Hence, the result **does not represent local filesystem usage**.
|
||||||
pub async fn layer_size_sum(&self) -> u64 {
|
pub async fn layer_size_sum(&self) -> u64 {
|
||||||
let guard = self.layers.read().await;
|
let guard = self.layers.read().await;
|
||||||
let (layer_map, _) = &*guard;
|
let layer_map = guard.layer_map();
|
||||||
let mut size = 0;
|
let mut size = 0;
|
||||||
for l in layer_map.iter_historic_layers() {
|
for l in layer_map.iter_historic_layers() {
|
||||||
size += l.file_size();
|
size += l.file_size();
|
||||||
@@ -900,7 +826,7 @@ impl Timeline {
|
|||||||
let last_lsn = self.get_last_record_lsn();
|
let last_lsn = self.get_last_record_lsn();
|
||||||
let open_layer_size = {
|
let open_layer_size = {
|
||||||
let guard = self.layers.read().await;
|
let guard = self.layers.read().await;
|
||||||
let (layers, _) = &*guard;
|
let layers = guard.layer_map();
|
||||||
let Some(open_layer) = layers.open_layer.as_ref() else {
|
let Some(open_layer) = layers.open_layer.as_ref() else {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
};
|
};
|
||||||
@@ -1032,7 +958,7 @@ impl Timeline {
|
|||||||
|
|
||||||
pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
|
pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
|
||||||
let guard = self.layers.read().await;
|
let guard = self.layers.read().await;
|
||||||
let (layer_map, mapping) = &*guard;
|
let layer_map = guard.layer_map();
|
||||||
let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
|
let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
|
||||||
if let Some(open_layer) = &layer_map.open_layer {
|
if let Some(open_layer) = &layer_map.open_layer {
|
||||||
in_memory_layers.push(open_layer.info());
|
in_memory_layers.push(open_layer.info());
|
||||||
@@ -1043,7 +969,7 @@ impl Timeline {
|
|||||||
|
|
||||||
let mut historic_layers = Vec::new();
|
let mut historic_layers = Vec::new();
|
||||||
for historic_layer in layer_map.iter_historic_layers() {
|
for historic_layer in layer_map.iter_historic_layers() {
|
||||||
let historic_layer = mapping.get_from_desc(&historic_layer);
|
let historic_layer = guard.get_from_desc(&historic_layer);
|
||||||
historic_layers.push(historic_layer.info(reset));
|
historic_layers.push(historic_layer.info(reset));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1053,10 +979,14 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
|
#[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
|
||||||
pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
||||||
let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None) };
|
let Some(layer) = self.find_layer(layer_file_name).await else {
|
||||||
let Some(remote_layer) = layer.downcast_remote_layer() else { return Ok(Some(false)) };
|
return Ok(None);
|
||||||
|
};
|
||||||
|
let Some(remote_layer) = layer.downcast_remote_layer() else {
|
||||||
|
return Ok(Some(false));
|
||||||
|
};
|
||||||
if self.remote_client.is_none() {
|
if self.remote_client.is_none() {
|
||||||
return Ok(Some(false));
|
return Ok(Some(false));
|
||||||
}
|
}
|
||||||
@@ -1065,10 +995,12 @@ impl Timeline {
|
|||||||
Ok(Some(true))
|
Ok(Some(true))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Like [`evict_layer_batch`], but for just one layer.
|
/// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer.
|
||||||
/// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
|
/// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
|
||||||
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
||||||
let Some(local_layer) = self.find_layer(layer_file_name).await else { return Ok(None) };
|
let Some(local_layer) = self.find_layer(layer_file_name).await else {
|
||||||
|
return Ok(None);
|
||||||
|
};
|
||||||
let remote_client = self
|
let remote_client = self
|
||||||
.remote_client
|
.remote_client
|
||||||
.as_ref()
|
.as_ref()
|
||||||
@@ -1089,9 +1021,9 @@ impl Timeline {
|
|||||||
|
|
||||||
/// Evict a batch of layers.
|
/// Evict a batch of layers.
|
||||||
///
|
///
|
||||||
/// GenericRemoteStorage reference is required as a witness[^witness_article] for "remote storage is configured."
|
/// GenericRemoteStorage reference is required as a [witness][witness_article] for "remote storage is configured."
|
||||||
///
|
///
|
||||||
/// [^witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html
|
/// [witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html
|
||||||
pub async fn evict_layers(
|
pub async fn evict_layers(
|
||||||
&self,
|
&self,
|
||||||
_: &GenericRemoteStorage,
|
_: &GenericRemoteStorage,
|
||||||
@@ -1154,27 +1086,18 @@ impl Timeline {
|
|||||||
|
|
||||||
// start the batch update
|
// start the batch update
|
||||||
let mut guard = self.layers.write().await;
|
let mut guard = self.layers.write().await;
|
||||||
let (layer_map, mapping) = &mut *guard;
|
|
||||||
let mut batch_updates = layer_map.batch_update();
|
|
||||||
|
|
||||||
let mut results = Vec::with_capacity(layers_to_evict.len());
|
let mut results = Vec::with_capacity(layers_to_evict.len());
|
||||||
|
|
||||||
for l in layers_to_evict.iter() {
|
for l in layers_to_evict.iter() {
|
||||||
let res = if cancel.is_cancelled() {
|
let res = if cancel.is_cancelled() {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
Some(self.evict_layer_batch_impl(
|
Some(self.evict_layer_batch_impl(&layer_removal_guard, l, &mut guard))
|
||||||
&layer_removal_guard,
|
|
||||||
l,
|
|
||||||
&mut batch_updates,
|
|
||||||
mapping,
|
|
||||||
))
|
|
||||||
};
|
};
|
||||||
results.push(res);
|
results.push(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
// commit the updates & release locks
|
// commit the updates & release locks
|
||||||
batch_updates.flush();
|
|
||||||
drop_wlock(guard);
|
drop_wlock(guard);
|
||||||
drop(layer_removal_guard);
|
drop(layer_removal_guard);
|
||||||
|
|
||||||
@@ -1186,8 +1109,7 @@ impl Timeline {
|
|||||||
&self,
|
&self,
|
||||||
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||||
local_layer: &Arc<dyn PersistentLayer>,
|
local_layer: &Arc<dyn PersistentLayer>,
|
||||||
batch_updates: &mut BatchedUpdates<'_>,
|
layer_mgr: &mut LayerManager,
|
||||||
mapping: &mut LayerFileManager,
|
|
||||||
) -> anyhow::Result<bool> {
|
) -> anyhow::Result<bool> {
|
||||||
if local_layer.is_remote_layer() {
|
if local_layer.is_remote_layer() {
|
||||||
// TODO(issue #3851): consider returning an err here instead of false,
|
// TODO(issue #3851): consider returning an err here instead of false,
|
||||||
@@ -1223,7 +1145,7 @@ impl Timeline {
|
|||||||
&layer_metadata,
|
&layer_metadata,
|
||||||
local_layer
|
local_layer
|
||||||
.access_stats()
|
.access_stats()
|
||||||
.clone_for_residence_change(batch_updates, LayerResidenceStatus::Evicted),
|
.clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
|
||||||
),
|
),
|
||||||
LayerFileName::Delta(delta_name) => RemoteLayer::new_delta(
|
LayerFileName::Delta(delta_name) => RemoteLayer::new_delta(
|
||||||
self.tenant_id,
|
self.tenant_id,
|
||||||
@@ -1232,13 +1154,13 @@ impl Timeline {
|
|||||||
&layer_metadata,
|
&layer_metadata,
|
||||||
local_layer
|
local_layer
|
||||||
.access_stats()
|
.access_stats()
|
||||||
.clone_for_residence_change(batch_updates, LayerResidenceStatus::Evicted),
|
.clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
|
||||||
),
|
),
|
||||||
});
|
});
|
||||||
|
|
||||||
assert_eq!(local_layer.layer_desc(), new_remote_layer.layer_desc());
|
assert_eq!(local_layer.layer_desc(), new_remote_layer.layer_desc());
|
||||||
|
|
||||||
let succeed = match mapping.replace_and_verify(local_layer.clone(), new_remote_layer) {
|
let succeed = match layer_mgr.replace_and_verify(local_layer.clone(), new_remote_layer) {
|
||||||
Ok(()) => {
|
Ok(()) => {
|
||||||
if let Err(e) = local_layer.delete_resident_layer_file() {
|
if let Err(e) = local_layer.delete_resident_layer_file() {
|
||||||
error!("failed to remove layer file on evict after replacement: {e:#?}");
|
error!("failed to remove layer file on evict after replacement: {e:#?}");
|
||||||
@@ -1409,10 +1331,7 @@ impl Timeline {
|
|||||||
timeline_id,
|
timeline_id,
|
||||||
tenant_id,
|
tenant_id,
|
||||||
pg_version,
|
pg_version,
|
||||||
layers: Arc::new(tokio::sync::RwLock::new((
|
layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())),
|
||||||
LayerMap::default(),
|
|
||||||
LayerFileManager::new(),
|
|
||||||
))),
|
|
||||||
wanted_image_layers: Mutex::new(None),
|
wanted_image_layers: Mutex::new(None),
|
||||||
|
|
||||||
walredo_mgr,
|
walredo_mgr,
|
||||||
@@ -1541,7 +1460,7 @@ impl Timeline {
|
|||||||
*flush_loop_state = FlushLoopState::Exited;
|
*flush_loop_state = FlushLoopState::Exited;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
.instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id))
|
.instrument(info_span!(parent: None, "layer flush task", tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1597,7 +1516,7 @@ impl Timeline {
|
|||||||
let mut layers = self.layers.try_write().expect(
|
let mut layers = self.layers.try_write().expect(
|
||||||
"in the context where we call this function, no other task has access to the object",
|
"in the context where we call this function, no other task has access to the object",
|
||||||
);
|
);
|
||||||
layers.0.next_open_layer_at = Some(Lsn(start_lsn.0));
|
layers.initialize_empty(Lsn(start_lsn.0));
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -1605,18 +1524,18 @@ impl Timeline {
|
|||||||
///
|
///
|
||||||
pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
|
pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
|
||||||
let mut guard = self.layers.write().await;
|
let mut guard = self.layers.write().await;
|
||||||
let (layers, mapping) = &mut *guard;
|
|
||||||
let mut updates = layers.batch_update();
|
|
||||||
let mut num_layers = 0;
|
let mut num_layers = 0;
|
||||||
|
|
||||||
let timer = self.metrics.load_layer_map_histo.start_timer();
|
let timer = self.metrics.load_layer_map_histo.start_timer();
|
||||||
|
|
||||||
// Scan timeline directory and create ImageFileName and DeltaFilename
|
// Scan timeline directory and create ImageFileName and DeltaFilename
|
||||||
// structs representing all files on disk
|
// structs representing all files on disk
|
||||||
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
|
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
|
||||||
// total size of layer files in the current timeline directory
|
// total size of layer files in the current timeline directory
|
||||||
let mut total_physical_size = 0;
|
let mut total_physical_size = 0;
|
||||||
|
|
||||||
|
let mut loaded_layers = Vec::<Arc<dyn PersistentLayer>>::new();
|
||||||
|
|
||||||
for direntry in fs::read_dir(timeline_path)? {
|
for direntry in fs::read_dir(timeline_path)? {
|
||||||
let direntry = direntry?;
|
let direntry = direntry?;
|
||||||
let direntry_path = direntry.path();
|
let direntry_path = direntry.path();
|
||||||
@@ -1643,12 +1562,12 @@ impl Timeline {
|
|||||||
self.tenant_id,
|
self.tenant_id,
|
||||||
&imgfilename,
|
&imgfilename,
|
||||||
file_size,
|
file_size,
|
||||||
LayerAccessStats::for_loading_layer(&updates, LayerResidenceStatus::Resident),
|
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident),
|
||||||
);
|
);
|
||||||
|
|
||||||
trace!("found layer {}", layer.path().display());
|
trace!("found layer {}", layer.path().display());
|
||||||
total_physical_size += file_size;
|
total_physical_size += file_size;
|
||||||
self.insert_historic_layer(Arc::new(layer), &mut updates, mapping);
|
loaded_layers.push(Arc::new(layer));
|
||||||
num_layers += 1;
|
num_layers += 1;
|
||||||
} else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
|
} else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
|
||||||
// Create a DeltaLayer struct for each delta file.
|
// Create a DeltaLayer struct for each delta file.
|
||||||
@@ -1675,12 +1594,12 @@ impl Timeline {
|
|||||||
self.tenant_id,
|
self.tenant_id,
|
||||||
&deltafilename,
|
&deltafilename,
|
||||||
file_size,
|
file_size,
|
||||||
LayerAccessStats::for_loading_layer(&updates, LayerResidenceStatus::Resident),
|
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident),
|
||||||
);
|
);
|
||||||
|
|
||||||
trace!("found layer {}", layer.path().display());
|
trace!("found layer {}", layer.path().display());
|
||||||
total_physical_size += file_size;
|
total_physical_size += file_size;
|
||||||
self.insert_historic_layer(Arc::new(layer), &mut updates, mapping);
|
loaded_layers.push(Arc::new(layer));
|
||||||
num_layers += 1;
|
num_layers += 1;
|
||||||
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
|
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
|
||||||
// ignore these
|
// ignore these
|
||||||
@@ -1706,8 +1625,7 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
updates.flush();
|
guard.initialize_local_layers(loaded_layers, Lsn(disk_consistent_lsn.0) + 1);
|
||||||
layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1);
|
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
"loaded layer map with {} layers at {}, total physical size: {}",
|
"loaded layer map with {} layers at {}, total physical size: {}",
|
||||||
@@ -1735,8 +1653,9 @@ impl Timeline {
|
|||||||
// We're holding a layer map lock for a while but this
|
// We're holding a layer map lock for a while but this
|
||||||
// method is only called during init so it's fine.
|
// method is only called during init so it's fine.
|
||||||
let mut guard = self.layers.write().await;
|
let mut guard = self.layers.write().await;
|
||||||
let (layer_map, mapping) = &mut *guard;
|
|
||||||
let mut updates = layer_map.batch_update();
|
let mut corrupted_local_layers = Vec::new();
|
||||||
|
let mut added_remote_layers = Vec::new();
|
||||||
for remote_layer_name in &index_part.timeline_layers {
|
for remote_layer_name in &index_part.timeline_layers {
|
||||||
let local_layer = local_only_layers.remove(remote_layer_name);
|
let local_layer = local_only_layers.remove(remote_layer_name);
|
||||||
|
|
||||||
@@ -1780,7 +1699,7 @@ impl Timeline {
|
|||||||
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
|
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
|
||||||
} else {
|
} else {
|
||||||
self.metrics.resident_physical_size_gauge.sub(local_size);
|
self.metrics.resident_physical_size_gauge.sub(local_size);
|
||||||
self.remove_historic_layer(local_layer, &mut updates, mapping);
|
corrupted_local_layers.push(local_layer);
|
||||||
// fall-through to adding the remote layer
|
// fall-through to adding the remote layer
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -1812,14 +1731,10 @@ impl Timeline {
|
|||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
imgfilename,
|
imgfilename,
|
||||||
&remote_layer_metadata,
|
&remote_layer_metadata,
|
||||||
LayerAccessStats::for_loading_layer(
|
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted),
|
||||||
&updates,
|
|
||||||
LayerResidenceStatus::Evicted,
|
|
||||||
),
|
|
||||||
);
|
);
|
||||||
let remote_layer = Arc::new(remote_layer);
|
let remote_layer = Arc::new(remote_layer);
|
||||||
|
added_remote_layers.push(remote_layer);
|
||||||
self.insert_historic_layer(remote_layer, &mut updates, mapping);
|
|
||||||
}
|
}
|
||||||
LayerFileName::Delta(deltafilename) => {
|
LayerFileName::Delta(deltafilename) => {
|
||||||
// Create a RemoteLayer for the delta file.
|
// Create a RemoteLayer for the delta file.
|
||||||
@@ -1840,18 +1755,14 @@ impl Timeline {
|
|||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
deltafilename,
|
deltafilename,
|
||||||
&remote_layer_metadata,
|
&remote_layer_metadata,
|
||||||
LayerAccessStats::for_loading_layer(
|
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted),
|
||||||
&updates,
|
|
||||||
LayerResidenceStatus::Evicted,
|
|
||||||
),
|
|
||||||
);
|
);
|
||||||
let remote_layer = Arc::new(remote_layer);
|
let remote_layer = Arc::new(remote_layer);
|
||||||
self.insert_historic_layer(remote_layer, &mut updates, mapping);
|
added_remote_layers.push(remote_layer);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
guard.initialize_remote_layers(corrupted_local_layers, added_remote_layers);
|
||||||
updates.flush();
|
|
||||||
Ok(local_only_layers)
|
Ok(local_only_layers)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1866,7 +1777,7 @@ impl Timeline {
|
|||||||
/// 3. Schedule upload of local-only layer files (which will then also update the remote
|
/// 3. Schedule upload of local-only layer files (which will then also update the remote
|
||||||
/// IndexPart to include the new layer files).
|
/// IndexPart to include the new layer files).
|
||||||
///
|
///
|
||||||
/// Refer to the `storage_sync` module comment for more context.
|
/// Refer to the [`remote_timeline_client`] module comment for more context.
|
||||||
///
|
///
|
||||||
/// # TODO
|
/// # TODO
|
||||||
/// May be a bit cleaner to do things based on populated remote client,
|
/// May be a bit cleaner to do things based on populated remote client,
|
||||||
@@ -1887,10 +1798,10 @@ impl Timeline {
|
|||||||
|
|
||||||
let local_layers = {
|
let local_layers = {
|
||||||
let guard = self.layers.read().await;
|
let guard = self.layers.read().await;
|
||||||
let (layers, mapping) = &*guard;
|
let layers = guard.layer_map();
|
||||||
layers
|
layers
|
||||||
.iter_historic_layers()
|
.iter_historic_layers()
|
||||||
.map(|l| (l.filename(), mapping.get_from_desc(&l)))
|
.map(|l| (l.filename(), guard.get_from_desc(&l)))
|
||||||
.collect::<HashMap<_, _>>()
|
.collect::<HashMap<_, _>>()
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -2208,7 +2119,7 @@ impl Timeline {
|
|||||||
fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| {
|
fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| {
|
||||||
if !self
|
if !self
|
||||||
.conf
|
.conf
|
||||||
.metadata_path(self.timeline_id, self.tenant_id)
|
.metadata_path(&self.tenant_id, &self.timeline_id)
|
||||||
.exists()
|
.exists()
|
||||||
{
|
{
|
||||||
error!("timeline-calculate-logical-size-pre metadata file does not exist")
|
error!("timeline-calculate-logical-size-pre metadata file does not exist")
|
||||||
@@ -2264,70 +2175,15 @@ impl Timeline {
|
|||||||
|
|
||||||
async fn find_layer(&self, layer_file_name: &str) -> Option<Arc<dyn PersistentLayer>> {
|
async fn find_layer(&self, layer_file_name: &str) -> Option<Arc<dyn PersistentLayer>> {
|
||||||
let guard = self.layers.read().await;
|
let guard = self.layers.read().await;
|
||||||
let (layers, mapping) = &*guard;
|
for historic_layer in guard.layer_map().iter_historic_layers() {
|
||||||
for historic_layer in layers.iter_historic_layers() {
|
|
||||||
let historic_layer_name = historic_layer.filename().file_name();
|
let historic_layer_name = historic_layer.filename().file_name();
|
||||||
if layer_file_name == historic_layer_name {
|
if layer_file_name == historic_layer_name {
|
||||||
return Some(mapping.get_from_desc(&historic_layer));
|
return Some(guard.get_from_desc(&historic_layer));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Helper function to insert a layer from both layer map and layer file manager. Will be removed in the future
|
|
||||||
/// after we introduce `LayerMapManager`.
|
|
||||||
fn insert_historic_layer(
|
|
||||||
&self,
|
|
||||||
layer: Arc<dyn PersistentLayer>,
|
|
||||||
updates: &mut BatchedUpdates<'_>,
|
|
||||||
mapping: &mut LayerFileManager,
|
|
||||||
) {
|
|
||||||
updates.insert_historic(layer.layer_desc().clone());
|
|
||||||
mapping.insert(layer);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Helper function to remove a layer from both layer map and layer file manager. Will be removed in the future
|
|
||||||
/// after we introduce `LayerMapManager`.
|
|
||||||
fn remove_historic_layer(
|
|
||||||
&self,
|
|
||||||
layer: Arc<dyn PersistentLayer>,
|
|
||||||
updates: &mut BatchedUpdates<'_>,
|
|
||||||
mapping: &mut LayerFileManager,
|
|
||||||
) {
|
|
||||||
updates.remove_historic(layer.layer_desc().clone());
|
|
||||||
mapping.remove(layer);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Removes the layer from local FS (if present) and from memory.
|
|
||||||
/// Remote storage is not affected by this operation.
|
|
||||||
fn delete_historic_layer(
|
|
||||||
&self,
|
|
||||||
// we cannot remove layers otherwise, since gc and compaction will race
|
|
||||||
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
|
||||||
layer: Arc<PersistentLayerDesc>,
|
|
||||||
updates: &mut BatchedUpdates<'_>,
|
|
||||||
mapping: &mut LayerFileManager,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
let layer = mapping.get_from_desc(&layer);
|
|
||||||
if !layer.is_remote_layer() {
|
|
||||||
layer.delete_resident_layer_file()?;
|
|
||||||
let layer_file_size = layer.file_size();
|
|
||||||
self.metrics
|
|
||||||
.resident_physical_size_gauge
|
|
||||||
.sub(layer_file_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO Removing from the bottom of the layer map is expensive.
|
|
||||||
// Maybe instead discard all layer map historic versions that
|
|
||||||
// won't be needed for page reconstruction for this timeline,
|
|
||||||
// and mark what we can't delete yet as deleted from the layer
|
|
||||||
// map index without actually rebuilding the index.
|
|
||||||
updates.remove_historic(layer.layer_desc().clone());
|
|
||||||
mapping.remove(layer);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type TraversalId = String;
|
type TraversalId = String;
|
||||||
@@ -2502,7 +2358,7 @@ impl Timeline {
|
|||||||
'layer_map_search: loop {
|
'layer_map_search: loop {
|
||||||
let remote_layer = {
|
let remote_layer = {
|
||||||
let guard = timeline.layers.read().await;
|
let guard = timeline.layers.read().await;
|
||||||
let (layers, mapping) = &*guard;
|
let layers = guard.layer_map();
|
||||||
|
|
||||||
// Check the open and frozen in-memory layers first, in order from newest
|
// Check the open and frozen in-memory layers first, in order from newest
|
||||||
// to oldest.
|
// to oldest.
|
||||||
@@ -2564,7 +2420,7 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
|
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
|
||||||
let layer = mapping.get_from_desc(&layer);
|
let layer = guard.get_from_desc(&layer);
|
||||||
// If it's a remote layer, download it and retry.
|
// If it's a remote layer, download it and retry.
|
||||||
if let Some(remote_layer) =
|
if let Some(remote_layer) =
|
||||||
super::storage_layer::downcast_remote_layer(&layer)
|
super::storage_layer::downcast_remote_layer(&layer)
|
||||||
@@ -2687,52 +2543,13 @@ impl Timeline {
|
|||||||
///
|
///
|
||||||
async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
|
async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
|
||||||
let mut guard = self.layers.write().await;
|
let mut guard = self.layers.write().await;
|
||||||
let (layers, _) = &mut *guard;
|
let layer = guard.get_layer_for_write(
|
||||||
|
|
||||||
ensure!(lsn.is_aligned());
|
|
||||||
|
|
||||||
let last_record_lsn = self.get_last_record_lsn();
|
|
||||||
ensure!(
|
|
||||||
lsn > last_record_lsn,
|
|
||||||
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})\n{}",
|
|
||||||
lsn,
|
lsn,
|
||||||
last_record_lsn,
|
self.get_last_record_lsn(),
|
||||||
std::backtrace::Backtrace::force_capture(),
|
self.conf,
|
||||||
);
|
self.timeline_id,
|
||||||
|
self.tenant_id,
|
||||||
// Do we have a layer open for writing already?
|
)?;
|
||||||
let layer;
|
|
||||||
if let Some(open_layer) = &layers.open_layer {
|
|
||||||
if open_layer.get_lsn_range().start > lsn {
|
|
||||||
bail!(
|
|
||||||
"unexpected open layer in the future: open layers starts at {}, write lsn {}",
|
|
||||||
open_layer.get_lsn_range().start,
|
|
||||||
lsn
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
layer = Arc::clone(open_layer);
|
|
||||||
} else {
|
|
||||||
// No writeable layer yet. Create one.
|
|
||||||
let start_lsn = layers
|
|
||||||
.next_open_layer_at
|
|
||||||
.context("No next open layer found")?;
|
|
||||||
|
|
||||||
trace!(
|
|
||||||
"creating layer for write at {}/{} for record at {}",
|
|
||||||
self.timeline_id,
|
|
||||||
start_lsn,
|
|
||||||
lsn
|
|
||||||
);
|
|
||||||
let new_layer =
|
|
||||||
InMemoryLayer::create(self.conf, self.timeline_id, self.tenant_id, start_lsn)?;
|
|
||||||
let layer_rc = Arc::new(new_layer);
|
|
||||||
|
|
||||||
layers.open_layer = Some(Arc::clone(&layer_rc));
|
|
||||||
layers.next_open_layer_at = None;
|
|
||||||
|
|
||||||
layer = layer_rc;
|
|
||||||
}
|
|
||||||
Ok(layer)
|
Ok(layer)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2765,21 +2582,7 @@ impl Timeline {
|
|||||||
Some(self.write_lock.lock().await)
|
Some(self.write_lock.lock().await)
|
||||||
};
|
};
|
||||||
let mut guard = self.layers.write().await;
|
let mut guard = self.layers.write().await;
|
||||||
let (layers, _) = &mut *guard;
|
guard.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at);
|
||||||
if let Some(open_layer) = &layers.open_layer {
|
|
||||||
let open_layer_rc = Arc::clone(open_layer);
|
|
||||||
// Does this layer need freezing?
|
|
||||||
let end_lsn = Lsn(self.get_last_record_lsn().0 + 1);
|
|
||||||
open_layer.freeze(end_lsn);
|
|
||||||
|
|
||||||
// The layer is no longer open, update the layer map to reflect this.
|
|
||||||
// We will replace it with on-disk historics below.
|
|
||||||
layers.frozen_layers.push_back(open_layer_rc);
|
|
||||||
layers.open_layer = None;
|
|
||||||
layers.next_open_layer_at = Some(end_lsn);
|
|
||||||
self.last_freeze_at.store(end_lsn);
|
|
||||||
}
|
|
||||||
drop_wlock(guard);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Layer flusher task's main loop.
|
/// Layer flusher task's main loop.
|
||||||
@@ -2804,18 +2607,15 @@ impl Timeline {
|
|||||||
let result = loop {
|
let result = loop {
|
||||||
let layer_to_flush = {
|
let layer_to_flush = {
|
||||||
let guard = self.layers.read().await;
|
let guard = self.layers.read().await;
|
||||||
let (layers, _) = &*guard;
|
guard.layer_map().frozen_layers.front().cloned()
|
||||||
layers.frozen_layers.front().cloned()
|
|
||||||
// drop 'layers' lock to allow concurrent reads and writes
|
// drop 'layers' lock to allow concurrent reads and writes
|
||||||
};
|
};
|
||||||
if let Some(layer_to_flush) = layer_to_flush {
|
let Some(layer_to_flush) = layer_to_flush else {
|
||||||
if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await {
|
|
||||||
error!("could not flush frozen layer: {err:?}");
|
|
||||||
break Err(err);
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
} else {
|
|
||||||
break Ok(());
|
break Ok(());
|
||||||
|
};
|
||||||
|
if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await {
|
||||||
|
error!("could not flush frozen layer: {err:?}");
|
||||||
|
break Err(err);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
// Notify any listeners that we're done
|
// Notify any listeners that we're done
|
||||||
@@ -2924,15 +2724,20 @@ impl Timeline {
|
|||||||
HashMap::from([(delta_path, metadata)])
|
HashMap::from([(delta_path, metadata)])
|
||||||
};
|
};
|
||||||
|
|
||||||
fail_point!("flush-frozen-before-sync");
|
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
|
||||||
|
// a compaction can delete the file and then it won't be available for uploads any more.
|
||||||
|
// We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
|
||||||
|
// race situation.
|
||||||
|
// See https://github.com/neondatabase/neon/issues/4526
|
||||||
|
|
||||||
|
pausable_failpoint!("flush-frozen-before-sync");
|
||||||
|
|
||||||
// The new on-disk layers are now in the layer map. We can remove the
|
// The new on-disk layers are now in the layer map. We can remove the
|
||||||
// in-memory layer from the map now. We do not modify `LayerFileManager` because
|
// in-memory layer from the map now. The flushed layer is stored in
|
||||||
// it only contains persistent layers. The flushed layer is stored in
|
|
||||||
// the mapping in `create_delta_layer`.
|
// the mapping in `create_delta_layer`.
|
||||||
{
|
{
|
||||||
let mut layers = self.layers.write().await;
|
let mut guard = self.layers.write().await;
|
||||||
let l = layers.0.frozen_layers.pop_front();
|
let l = guard.layer_map_mut().frozen_layers.pop_front();
|
||||||
|
|
||||||
// Only one thread may call this function at a time (for this
|
// Only one thread may call this function at a time (for this
|
||||||
// timeline). If two threads tried to flush the same frozen
|
// timeline). If two threads tried to flush the same frozen
|
||||||
@@ -3007,8 +2812,8 @@ impl Timeline {
|
|||||||
|
|
||||||
save_metadata(
|
save_metadata(
|
||||||
self.conf,
|
self.conf,
|
||||||
self.timeline_id,
|
&self.tenant_id,
|
||||||
self.tenant_id,
|
&self.timeline_id,
|
||||||
&metadata,
|
&metadata,
|
||||||
false,
|
false,
|
||||||
)
|
)
|
||||||
@@ -3057,7 +2862,7 @@ impl Timeline {
|
|||||||
par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?;
|
par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?;
|
||||||
par_fsync::par_fsync(&[self_clone
|
par_fsync::par_fsync(&[self_clone
|
||||||
.conf
|
.conf
|
||||||
.timeline_path(&self_clone.timeline_id, &self_clone.tenant_id)])
|
.timeline_path(&self_clone.tenant_id, &self_clone.timeline_id)])
|
||||||
.context("fsync of timeline dir")?;
|
.context("fsync of timeline dir")?;
|
||||||
|
|
||||||
anyhow::Ok(new_delta)
|
anyhow::Ok(new_delta)
|
||||||
@@ -3071,15 +2876,12 @@ impl Timeline {
|
|||||||
// Add it to the layer map
|
// Add it to the layer map
|
||||||
let l = Arc::new(new_delta);
|
let l = Arc::new(new_delta);
|
||||||
let mut guard = self.layers.write().await;
|
let mut guard = self.layers.write().await;
|
||||||
let (layers, mapping) = &mut *guard;
|
|
||||||
let mut batch_updates = layers.batch_update();
|
|
||||||
l.access_stats().record_residence_event(
|
l.access_stats().record_residence_event(
|
||||||
&batch_updates,
|
&guard,
|
||||||
LayerResidenceStatus::Resident,
|
LayerResidenceStatus::Resident,
|
||||||
LayerResidenceEventReason::LayerCreate,
|
LayerResidenceEventReason::LayerCreate,
|
||||||
);
|
);
|
||||||
self.insert_historic_layer(l, &mut batch_updates, mapping);
|
guard.track_new_l0_delta_layer(l);
|
||||||
batch_updates.flush();
|
|
||||||
|
|
||||||
// update metrics
|
// update metrics
|
||||||
self.metrics.resident_physical_size_gauge.add(sz);
|
self.metrics.resident_physical_size_gauge.add(sz);
|
||||||
@@ -3128,7 +2930,7 @@ impl Timeline {
|
|||||||
let threshold = self.get_image_creation_threshold();
|
let threshold = self.get_image_creation_threshold();
|
||||||
|
|
||||||
let guard = self.layers.read().await;
|
let guard = self.layers.read().await;
|
||||||
let (layers, _) = &*guard;
|
let layers = guard.layer_map();
|
||||||
|
|
||||||
let mut max_deltas = 0;
|
let mut max_deltas = 0;
|
||||||
{
|
{
|
||||||
@@ -3300,18 +3102,16 @@ impl Timeline {
|
|||||||
.await
|
.await
|
||||||
.context("fsync of newly created layer files")?;
|
.context("fsync of newly created layer files")?;
|
||||||
|
|
||||||
par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
|
par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.tenant_id, &self.timeline_id)])
|
||||||
.await
|
.await
|
||||||
.context("fsync of timeline dir")?;
|
.context("fsync of timeline dir")?;
|
||||||
|
|
||||||
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
|
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
|
||||||
|
|
||||||
let mut guard = self.layers.write().await;
|
let mut guard = self.layers.write().await;
|
||||||
let (layers, mapping) = &mut *guard;
|
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
|
||||||
let mut updates = layers.batch_update();
|
|
||||||
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
|
|
||||||
|
|
||||||
for l in image_layers {
|
for l in &image_layers {
|
||||||
let path = l.filename();
|
let path = l.filename();
|
||||||
let metadata = timeline_path
|
let metadata = timeline_path
|
||||||
.join(path.file_name())
|
.join(path.file_name())
|
||||||
@@ -3325,13 +3125,12 @@ impl Timeline {
|
|||||||
.add(metadata.len());
|
.add(metadata.len());
|
||||||
let l = Arc::new(l);
|
let l = Arc::new(l);
|
||||||
l.access_stats().record_residence_event(
|
l.access_stats().record_residence_event(
|
||||||
&updates,
|
&guard,
|
||||||
LayerResidenceStatus::Resident,
|
LayerResidenceStatus::Resident,
|
||||||
LayerResidenceEventReason::LayerCreate,
|
LayerResidenceEventReason::LayerCreate,
|
||||||
);
|
);
|
||||||
self.insert_historic_layer(l, &mut updates, mapping);
|
|
||||||
}
|
}
|
||||||
updates.flush();
|
guard.track_new_image_layers(image_layers);
|
||||||
drop_wlock(guard);
|
drop_wlock(guard);
|
||||||
timer.stop_and_record();
|
timer.stop_and_record();
|
||||||
|
|
||||||
@@ -3490,21 +3289,23 @@ impl Timeline {
|
|||||||
/// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
|
/// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
|
||||||
/// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
|
/// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
|
||||||
/// start of level0 files compaction, the on-demand download should be revisited as well.
|
/// start of level0 files compaction, the on-demand download should be revisited as well.
|
||||||
|
///
|
||||||
|
/// [`compact_inner`]: Self::compact_inner
|
||||||
fn compact_level0_phase1(
|
fn compact_level0_phase1(
|
||||||
self: Arc<Self>,
|
self: Arc<Self>,
|
||||||
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||||
guard: tokio::sync::OwnedRwLockReadGuard<(LayerMap, LayerFileManager)>,
|
guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
|
||||||
mut stats: CompactLevel0Phase1StatsBuilder,
|
mut stats: CompactLevel0Phase1StatsBuilder,
|
||||||
target_file_size: u64,
|
target_file_size: u64,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<CompactLevel0Phase1Result, CompactionError> {
|
) -> Result<CompactLevel0Phase1Result, CompactionError> {
|
||||||
stats.read_lock_held_spawn_blocking_startup_micros =
|
stats.read_lock_held_spawn_blocking_startup_micros =
|
||||||
stats.read_lock_acquisition_micros.till_now(); // set by caller
|
stats.read_lock_acquisition_micros.till_now(); // set by caller
|
||||||
let (layers, mapping) = &*guard;
|
let layers = guard.layer_map();
|
||||||
let level0_deltas = layers.get_level0_deltas()?;
|
let level0_deltas = layers.get_level0_deltas()?;
|
||||||
let mut level0_deltas = level0_deltas
|
let mut level0_deltas = level0_deltas
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|x| mapping.get_from_desc(&x))
|
.map(|x| guard.get_from_desc(&x))
|
||||||
.collect_vec();
|
.collect_vec();
|
||||||
stats.level0_deltas_count = Some(level0_deltas.len());
|
stats.level0_deltas_count = Some(level0_deltas.len());
|
||||||
// Only compact if enough layers have accumulated.
|
// Only compact if enough layers have accumulated.
|
||||||
@@ -3824,7 +3625,7 @@ impl Timeline {
|
|||||||
// minimize latency.
|
// minimize latency.
|
||||||
par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;
|
par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;
|
||||||
|
|
||||||
par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
|
par_fsync::par_fsync(&[self.conf.timeline_path(&self.tenant_id, &self.timeline_id)])
|
||||||
.context("fsync of timeline dir")?;
|
.context("fsync of timeline dir")?;
|
||||||
|
|
||||||
layer_paths.pop().unwrap();
|
layer_paths.pop().unwrap();
|
||||||
@@ -3909,7 +3710,7 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Before deleting any layers, we need to wait for their upload ops to finish.
|
// Before deleting any layers, we need to wait for their upload ops to finish.
|
||||||
// See storage_sync module level comment on consistency.
|
// See remote_timeline_client module level comment on consistency.
|
||||||
// Do it here because we don't want to hold self.layers.write() while waiting.
|
// Do it here because we don't want to hold self.layers.write() while waiting.
|
||||||
if let Some(remote_client) = &self.remote_client {
|
if let Some(remote_client) = &self.remote_client {
|
||||||
debug!("waiting for upload ops to complete");
|
debug!("waiting for upload ops to complete");
|
||||||
@@ -3920,9 +3721,11 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let mut guard = self.layers.write().await;
|
let mut guard = self.layers.write().await;
|
||||||
let (layers, mapping) = &mut *guard;
|
|
||||||
let mut updates = layers.batch_update();
|
|
||||||
let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
|
let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
|
||||||
|
|
||||||
|
let mut insert_layers = Vec::new();
|
||||||
|
let mut remove_layers = Vec::new();
|
||||||
|
|
||||||
for l in new_layers {
|
for l in new_layers {
|
||||||
let new_delta_path = l.path();
|
let new_delta_path = l.path();
|
||||||
|
|
||||||
@@ -3948,11 +3751,11 @@ impl Timeline {
|
|||||||
new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
|
new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
|
||||||
let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
|
let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
|
||||||
x.access_stats().record_residence_event(
|
x.access_stats().record_residence_event(
|
||||||
&updates,
|
&guard,
|
||||||
LayerResidenceStatus::Resident,
|
LayerResidenceStatus::Resident,
|
||||||
LayerResidenceEventReason::LayerCreate,
|
LayerResidenceEventReason::LayerCreate,
|
||||||
);
|
);
|
||||||
self.insert_historic_layer(x, &mut updates, mapping);
|
insert_layers.push(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Now that we have reshuffled the data to set of new delta layers, we can
|
// Now that we have reshuffled the data to set of new delta layers, we can
|
||||||
@@ -3960,12 +3763,16 @@ impl Timeline {
|
|||||||
let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
|
let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
|
||||||
for l in deltas_to_compact {
|
for l in deltas_to_compact {
|
||||||
layer_names_to_delete.push(l.filename());
|
layer_names_to_delete.push(l.filename());
|
||||||
// NB: the layer file identified by descriptor `l` is guaranteed to be present
|
remove_layers.push(guard.get_from_desc(&l));
|
||||||
// in the LayerFileManager because we kept holding `layer_removal_cs` the entire
|
|
||||||
// time, even though we dropped `Timeline::layers` inbetween.
|
|
||||||
self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates, mapping)?;
|
|
||||||
}
|
}
|
||||||
updates.flush();
|
|
||||||
|
guard.finish_compact_l0(
|
||||||
|
layer_removal_cs,
|
||||||
|
remove_layers,
|
||||||
|
insert_layers,
|
||||||
|
&self.metrics,
|
||||||
|
)?;
|
||||||
|
|
||||||
drop_wlock(guard);
|
drop_wlock(guard);
|
||||||
|
|
||||||
// Also schedule the deletions in remote storage
|
// Also schedule the deletions in remote storage
|
||||||
@@ -4110,7 +3917,7 @@ impl Timeline {
|
|||||||
new_gc_cutoff,
|
new_gc_cutoff,
|
||||||
)
|
)
|
||||||
.instrument(
|
.instrument(
|
||||||
info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff),
|
info_span!("gc_timeline", timeline_id = %self.timeline_id, cutoff = %new_gc_cutoff),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
@@ -4184,7 +3991,7 @@ impl Timeline {
|
|||||||
//
|
//
|
||||||
// TODO holding a write lock is too aggressive and avoidable
|
// TODO holding a write lock is too aggressive and avoidable
|
||||||
let mut guard = self.layers.write().await;
|
let mut guard = self.layers.write().await;
|
||||||
let (layers, mapping) = &mut *guard;
|
let layers = guard.layer_map();
|
||||||
'outer: for l in layers.iter_historic_layers() {
|
'outer: for l in layers.iter_historic_layers() {
|
||||||
result.layers_total += 1;
|
result.layers_total += 1;
|
||||||
|
|
||||||
@@ -4280,7 +4087,6 @@ impl Timeline {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
.replace((new_gc_cutoff, wanted_image_layers.to_keyspace()));
|
.replace((new_gc_cutoff, wanted_image_layers.to_keyspace()));
|
||||||
|
|
||||||
let mut updates = layers.batch_update();
|
|
||||||
if !layers_to_remove.is_empty() {
|
if !layers_to_remove.is_empty() {
|
||||||
// Persist the new GC cutoff value in the metadata file, before
|
// Persist the new GC cutoff value in the metadata file, before
|
||||||
// we actually remove anything.
|
// we actually remove anything.
|
||||||
@@ -4290,18 +4096,15 @@ impl Timeline {
|
|||||||
// (couldn't do this in the loop above, because you cannot modify a collection
|
// (couldn't do this in the loop above, because you cannot modify a collection
|
||||||
// while iterating it. BTreeMap::retain() would be another option)
|
// while iterating it. BTreeMap::retain() would be another option)
|
||||||
let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len());
|
let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len());
|
||||||
{
|
let gc_layers = layers_to_remove
|
||||||
for doomed_layer in layers_to_remove {
|
.iter()
|
||||||
layer_names_to_delete.push(doomed_layer.filename());
|
.map(|x| guard.get_from_desc(x))
|
||||||
self.delete_historic_layer(
|
.collect();
|
||||||
layer_removal_cs.clone(),
|
for doomed_layer in layers_to_remove {
|
||||||
doomed_layer,
|
layer_names_to_delete.push(doomed_layer.filename());
|
||||||
&mut updates,
|
result.layers_removed += 1;
|
||||||
mapping,
|
|
||||||
)?; // FIXME: schedule succeeded deletions before returning?
|
|
||||||
result.layers_removed += 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
let apply = guard.finish_gc_timeline(layer_removal_cs, gc_layers, &self.metrics)?;
|
||||||
|
|
||||||
if result.layers_removed != 0 {
|
if result.layers_removed != 0 {
|
||||||
fail_point!("after-timeline-gc-removed-layers");
|
fail_point!("after-timeline-gc-removed-layers");
|
||||||
@@ -4310,8 +4113,9 @@ impl Timeline {
|
|||||||
if let Some(remote_client) = &self.remote_client {
|
if let Some(remote_client) = &self.remote_client {
|
||||||
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
|
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
apply.flush();
|
||||||
}
|
}
|
||||||
updates.flush();
|
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
"GC completed removing {} layers, cutoff {}",
|
"GC completed removing {} layers, cutoff {}",
|
||||||
@@ -4483,13 +4287,11 @@ impl Timeline {
|
|||||||
// Download complete. Replace the RemoteLayer with the corresponding
|
// Download complete. Replace the RemoteLayer with the corresponding
|
||||||
// Delta- or ImageLayer in the layer map.
|
// Delta- or ImageLayer in the layer map.
|
||||||
let mut guard = self_clone.layers.write().await;
|
let mut guard = self_clone.layers.write().await;
|
||||||
let (layers, mapping) = &mut *guard;
|
|
||||||
let updates = layers.batch_update();
|
|
||||||
let new_layer =
|
let new_layer =
|
||||||
remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size);
|
remote_layer.create_downloaded_layer(&guard, self_clone.conf, *size);
|
||||||
{
|
{
|
||||||
let l: Arc<dyn PersistentLayer> = remote_layer.clone();
|
let l: Arc<dyn PersistentLayer> = remote_layer.clone();
|
||||||
let failure = match mapping.replace_and_verify(l, new_layer) {
|
let failure = match guard.replace_and_verify(l, new_layer) {
|
||||||
Ok(()) => false,
|
Ok(()) => false,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
// this is a precondition failure, the layer filename derived
|
// this is a precondition failure, the layer filename derived
|
||||||
@@ -4517,7 +4319,6 @@ impl Timeline {
|
|||||||
.store(true, Relaxed);
|
.store(true, Relaxed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
updates.flush();
|
|
||||||
drop_wlock(guard);
|
drop_wlock(guard);
|
||||||
|
|
||||||
info!("on-demand download successful");
|
info!("on-demand download successful");
|
||||||
@@ -4596,7 +4397,7 @@ impl Timeline {
|
|||||||
};
|
};
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
.instrument(info_span!(parent: None, "download_all_remote_layers", tenant = %self.tenant_id, timeline = %self.timeline_id))
|
.instrument(info_span!(parent: None, "download_all_remote_layers", tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))
|
||||||
);
|
);
|
||||||
|
|
||||||
let initial_info = DownloadRemoteLayersTaskInfo {
|
let initial_info = DownloadRemoteLayersTaskInfo {
|
||||||
@@ -4618,10 +4419,10 @@ impl Timeline {
|
|||||||
let mut downloads = Vec::new();
|
let mut downloads = Vec::new();
|
||||||
{
|
{
|
||||||
let guard = self.layers.read().await;
|
let guard = self.layers.read().await;
|
||||||
let (layers, mapping) = &*guard;
|
let layers = guard.layer_map();
|
||||||
layers
|
layers
|
||||||
.iter_historic_layers()
|
.iter_historic_layers()
|
||||||
.map(|l| mapping.get_from_desc(&l))
|
.map(|l| guard.get_from_desc(&l))
|
||||||
.filter_map(|l| l.downcast_remote_layer())
|
.filter_map(|l| l.downcast_remote_layer())
|
||||||
.map(|l| self.download_remote_layer(l))
|
.map(|l| self.download_remote_layer(l))
|
||||||
.for_each(|dl| downloads.push(dl))
|
.for_each(|dl| downloads.push(dl))
|
||||||
@@ -4723,7 +4524,7 @@ impl LocalLayerInfoForDiskUsageEviction {
|
|||||||
impl Timeline {
|
impl Timeline {
|
||||||
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
|
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
|
||||||
let guard = self.layers.read().await;
|
let guard = self.layers.read().await;
|
||||||
let (layers, mapping) = &*guard;
|
let layers = guard.layer_map();
|
||||||
|
|
||||||
let mut max_layer_size: Option<u64> = None;
|
let mut max_layer_size: Option<u64> = None;
|
||||||
let mut resident_layers = Vec::new();
|
let mut resident_layers = Vec::new();
|
||||||
@@ -4732,7 +4533,7 @@ impl Timeline {
|
|||||||
let file_size = l.file_size();
|
let file_size = l.file_size();
|
||||||
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
|
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
|
||||||
|
|
||||||
let l = mapping.get_from_desc(&l);
|
let l = guard.get_from_desc(&l);
|
||||||
|
|
||||||
if l.is_remote_layer() {
|
if l.is_remote_layer() {
|
||||||
continue;
|
continue;
|
||||||
|
|||||||
@@ -198,10 +198,10 @@ impl Timeline {
|
|||||||
// So, we just need to deal with this.
|
// So, we just need to deal with this.
|
||||||
let candidates: Vec<Arc<dyn PersistentLayer>> = {
|
let candidates: Vec<Arc<dyn PersistentLayer>> = {
|
||||||
let guard = self.layers.read().await;
|
let guard = self.layers.read().await;
|
||||||
let (layers, mapping) = &*guard;
|
let layers = guard.layer_map();
|
||||||
let mut candidates = Vec::new();
|
let mut candidates = Vec::new();
|
||||||
for hist_layer in layers.iter_historic_layers() {
|
for hist_layer in layers.iter_historic_layers() {
|
||||||
let hist_layer = mapping.get_from_desc(&hist_layer);
|
let hist_layer = guard.get_from_desc(&hist_layer);
|
||||||
if hist_layer.is_remote_layer() {
|
if hist_layer.is_remote_layer() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|||||||
370
pageserver/src/tenant/timeline/layer_manager.rs
Normal file
370
pageserver/src/tenant/timeline/layer_manager.rs
Normal file
@@ -0,0 +1,370 @@
|
|||||||
|
use anyhow::{bail, ensure, Context, Result};
|
||||||
|
use std::{collections::HashMap, sync::Arc};
|
||||||
|
use tracing::trace;
|
||||||
|
use utils::{
|
||||||
|
id::{TenantId, TimelineId},
|
||||||
|
lsn::{AtomicLsn, Lsn},
|
||||||
|
};
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
config::PageServerConf,
|
||||||
|
metrics::TimelineMetrics,
|
||||||
|
tenant::{
|
||||||
|
layer_map::{BatchedUpdates, LayerMap},
|
||||||
|
storage_layer::{
|
||||||
|
AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, Layer, PersistentLayer,
|
||||||
|
PersistentLayerDesc, PersistentLayerKey, RemoteLayer,
|
||||||
|
},
|
||||||
|
timeline::compare_arced_layers,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Provides semantic APIs to manipulate the layer map.
|
||||||
|
pub struct LayerManager {
|
||||||
|
layer_map: LayerMap,
|
||||||
|
layer_fmgr: LayerFileManager,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// After GC, the layer map changes will not be applied immediately. Users should manually apply the changes after
|
||||||
|
/// scheduling deletes in remote client.
|
||||||
|
pub struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);
|
||||||
|
|
||||||
|
impl ApplyGcResultGuard<'_> {
|
||||||
|
pub fn flush(self) {
|
||||||
|
self.0.flush();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LayerManager {
|
||||||
|
pub fn create() -> Self {
|
||||||
|
Self {
|
||||||
|
layer_map: LayerMap::default(),
|
||||||
|
layer_fmgr: LayerFileManager::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
|
||||||
|
self.layer_fmgr.get_from_desc(desc)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get an immutable reference to the layer map.
|
||||||
|
///
|
||||||
|
/// We expect users only to be able to get an immutable layer map. If users want to make modifications,
|
||||||
|
/// they should use the semantic APIs below. This design brings us a step closer to immutable storage state.
|
||||||
|
pub fn layer_map(&self) -> &LayerMap {
|
||||||
|
&self.layer_map
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get a mutable reference to the layer map. This function will be removed once `flush_frozen_layer`
|
||||||
|
/// gets a refactor.
|
||||||
|
pub fn layer_map_mut(&mut self) -> &mut LayerMap {
|
||||||
|
&mut self.layer_map
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Replace layers in the layer file manager, used in evictions and layer downloads.
|
||||||
|
pub fn replace_and_verify(
|
||||||
|
&mut self,
|
||||||
|
expected: Arc<dyn PersistentLayer>,
|
||||||
|
new: Arc<dyn PersistentLayer>,
|
||||||
|
) -> Result<()> {
|
||||||
|
self.layer_fmgr.replace_and_verify(expected, new)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Called from `load_layer_map`. Initialize the layer manager with:
|
||||||
|
/// 1. all on-disk layers
|
||||||
|
/// 2. next open layer (with disk_consistent_lsn LSN)
|
||||||
|
pub fn initialize_local_layers(
|
||||||
|
&mut self,
|
||||||
|
on_disk_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||||
|
next_open_layer_at: Lsn,
|
||||||
|
) {
|
||||||
|
let mut updates = self.layer_map.batch_update();
|
||||||
|
for layer in on_disk_layers {
|
||||||
|
Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
|
||||||
|
}
|
||||||
|
updates.flush();
|
||||||
|
self.layer_map.next_open_layer_at = Some(next_open_layer_at);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Initialize when creating a new timeline, called in `init_empty_layer_map`.
|
||||||
|
pub fn initialize_empty(&mut self, next_open_layer_at: Lsn) {
|
||||||
|
self.layer_map.next_open_layer_at = Some(next_open_layer_at);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn initialize_remote_layers(
|
||||||
|
&mut self,
|
||||||
|
corrupted_local_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||||
|
remote_layers: Vec<Arc<RemoteLayer>>,
|
||||||
|
) {
|
||||||
|
let mut updates = self.layer_map.batch_update();
|
||||||
|
for layer in corrupted_local_layers {
|
||||||
|
Self::remove_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
|
||||||
|
}
|
||||||
|
for layer in remote_layers {
|
||||||
|
Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
|
||||||
|
}
|
||||||
|
updates.flush();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
|
||||||
|
/// called within `get_layer_for_write`.
|
||||||
|
pub fn get_layer_for_write(
|
||||||
|
&mut self,
|
||||||
|
lsn: Lsn,
|
||||||
|
last_record_lsn: Lsn,
|
||||||
|
conf: &'static PageServerConf,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
tenant_id: TenantId,
|
||||||
|
) -> Result<Arc<InMemoryLayer>> {
|
||||||
|
ensure!(lsn.is_aligned());
|
||||||
|
|
||||||
|
ensure!(
|
||||||
|
lsn > last_record_lsn,
|
||||||
|
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})\n{}",
|
||||||
|
lsn,
|
||||||
|
last_record_lsn,
|
||||||
|
std::backtrace::Backtrace::force_capture(),
|
||||||
|
);
|
||||||
|
|
||||||
|
// Do we have a layer open for writing already?
|
||||||
|
let layer = if let Some(open_layer) = &self.layer_map.open_layer {
|
||||||
|
if open_layer.get_lsn_range().start > lsn {
|
||||||
|
bail!(
|
||||||
|
"unexpected open layer in the future: open layers starts at {}, write lsn {}",
|
||||||
|
open_layer.get_lsn_range().start,
|
||||||
|
lsn
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
Arc::clone(open_layer)
|
||||||
|
} else {
|
||||||
|
// No writeable layer yet. Create one.
|
||||||
|
let start_lsn = self
|
||||||
|
.layer_map
|
||||||
|
.next_open_layer_at
|
||||||
|
.context("No next open layer found")?;
|
||||||
|
|
||||||
|
trace!(
|
||||||
|
"creating in-memory layer at {}/{} for record at {}",
|
||||||
|
timeline_id,
|
||||||
|
start_lsn,
|
||||||
|
lsn
|
||||||
|
);
|
||||||
|
|
||||||
|
let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn)?;
|
||||||
|
let layer = Arc::new(new_layer);
|
||||||
|
|
||||||
|
self.layer_map.open_layer = Some(layer.clone());
|
||||||
|
self.layer_map.next_open_layer_at = None;
|
||||||
|
|
||||||
|
layer
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(layer)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Called from `freeze_inmem_layer`. Freezes the open in-memory layer if there is one; otherwise does nothing.
|
||||||
|
pub fn try_freeze_in_memory_layer(
|
||||||
|
&mut self,
|
||||||
|
Lsn(last_record_lsn): Lsn,
|
||||||
|
last_freeze_at: &AtomicLsn,
|
||||||
|
) {
|
||||||
|
let end_lsn = Lsn(last_record_lsn + 1);
|
||||||
|
|
||||||
|
if let Some(open_layer) = &self.layer_map.open_layer {
|
||||||
|
let open_layer_rc = Arc::clone(open_layer);
|
||||||
|
// Does this layer need freezing?
|
||||||
|
open_layer.freeze(end_lsn);
|
||||||
|
|
||||||
|
// The layer is no longer open, update the layer map to reflect this.
|
||||||
|
// We will replace it with on-disk historics below.
|
||||||
|
self.layer_map.frozen_layers.push_back(open_layer_rc);
|
||||||
|
self.layer_map.open_layer = None;
|
||||||
|
self.layer_map.next_open_layer_at = Some(end_lsn);
|
||||||
|
last_freeze_at.store(end_lsn);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add image layers to the layer map, called from `create_image_layers`.
|
||||||
|
pub fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
|
||||||
|
let mut updates = self.layer_map.batch_update();
|
||||||
|
for layer in image_layers {
|
||||||
|
Self::insert_historic_layer(Arc::new(layer), &mut updates, &mut self.layer_fmgr);
|
||||||
|
}
|
||||||
|
updates.flush();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Insert into the layer map when a new delta layer is created, called from `create_delta_layer`.
|
||||||
|
pub fn track_new_l0_delta_layer(&mut self, delta_layer: Arc<DeltaLayer>) {
|
||||||
|
let mut updates = self.layer_map.batch_update();
|
||||||
|
Self::insert_historic_layer(delta_layer, &mut updates, &mut self.layer_fmgr);
|
||||||
|
updates.flush();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Called when compaction is completed.
|
||||||
|
pub fn finish_compact_l0(
|
||||||
|
&mut self,
|
||||||
|
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||||
|
compact_from: Vec<Arc<dyn PersistentLayer>>,
|
||||||
|
compact_to: Vec<Arc<dyn PersistentLayer>>,
|
||||||
|
metrics: &TimelineMetrics,
|
||||||
|
) -> Result<()> {
|
||||||
|
let mut updates = self.layer_map.batch_update();
|
||||||
|
for l in compact_to {
|
||||||
|
Self::insert_historic_layer(l, &mut updates, &mut self.layer_fmgr);
|
||||||
|
}
|
||||||
|
for l in compact_from {
|
||||||
|
// NB: the layer file identified by descriptor `l` is guaranteed to be present
|
||||||
|
// in the LayerFileManager because compaction kept holding `layer_removal_cs` the entire
|
||||||
|
// time, even though we dropped `Timeline::layers` inbetween.
|
||||||
|
Self::delete_historic_layer(
|
||||||
|
layer_removal_cs.clone(),
|
||||||
|
l,
|
||||||
|
&mut updates,
|
||||||
|
metrics,
|
||||||
|
&mut self.layer_fmgr,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
updates.flush();
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Called when garbage collecting the timeline. Returns a guard that applies the updates to the layer map when flushed.
|
||||||
|
pub fn finish_gc_timeline(
|
||||||
|
&mut self,
|
||||||
|
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||||
|
gc_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||||
|
metrics: &TimelineMetrics,
|
||||||
|
) -> Result<ApplyGcResultGuard> {
|
||||||
|
let mut updates = self.layer_map.batch_update();
|
||||||
|
for doomed_layer in gc_layers {
|
||||||
|
Self::delete_historic_layer(
|
||||||
|
layer_removal_cs.clone(),
|
||||||
|
doomed_layer,
|
||||||
|
&mut updates,
|
||||||
|
metrics,
|
||||||
|
&mut self.layer_fmgr,
|
||||||
|
)?; // FIXME: schedule succeeded deletions in timeline.rs `gc_timeline` instead of in batch?
|
||||||
|
}
|
||||||
|
Ok(ApplyGcResultGuard(updates))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Helper function to insert a layer into the layer map and file manager.
|
||||||
|
fn insert_historic_layer(
|
||||||
|
layer: Arc<dyn PersistentLayer>,
|
||||||
|
updates: &mut BatchedUpdates<'_>,
|
||||||
|
mapping: &mut LayerFileManager,
|
||||||
|
) {
|
||||||
|
updates.insert_historic(layer.layer_desc().clone());
|
||||||
|
mapping.insert(layer);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Helper function to remove a layer from the layer map and file manager.
|
||||||
|
fn remove_historic_layer(
|
||||||
|
layer: Arc<dyn PersistentLayer>,
|
||||||
|
updates: &mut BatchedUpdates<'_>,
|
||||||
|
mapping: &mut LayerFileManager,
|
||||||
|
) {
|
||||||
|
updates.remove_historic(layer.layer_desc().clone());
|
||||||
|
mapping.remove(layer);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Removes the layer from local FS (if present) and from memory.
|
||||||
|
/// Remote storage is not affected by this operation.
|
||||||
|
fn delete_historic_layer(
|
||||||
|
// we cannot remove layers otherwise, since gc and compaction will race
|
||||||
|
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||||
|
layer: Arc<dyn PersistentLayer>,
|
||||||
|
updates: &mut BatchedUpdates<'_>,
|
||||||
|
metrics: &TimelineMetrics,
|
||||||
|
mapping: &mut LayerFileManager,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
if !layer.is_remote_layer() {
|
||||||
|
layer.delete_resident_layer_file()?;
|
||||||
|
let layer_file_size = layer.file_size();
|
||||||
|
metrics.resident_physical_size_gauge.sub(layer_file_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO Removing from the bottom of the layer map is expensive.
|
||||||
|
// Maybe instead discard all layer map historic versions that
|
||||||
|
// won't be needed for page reconstruction for this timeline,
|
||||||
|
// and mark what we can't delete yet as deleted from the layer
|
||||||
|
// map index without actually rebuilding the index.
|
||||||
|
updates.remove_historic(layer.layer_desc().clone());
|
||||||
|
mapping.remove(layer);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
|
||||||
|
HashMap<PersistentLayerKey, Arc<T>>,
|
||||||
|
);
|
||||||
|
|
||||||
|
impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
|
||||||
|
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<T> {
|
||||||
|
// The assumption for the `expect()` is that all code maintains the following invariant:
|
||||||
|
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
|
||||||
|
self.0
|
||||||
|
.get(&desc.key())
|
||||||
|
.with_context(|| format!("get layer from desc: {}", desc.filename()))
|
||||||
|
.expect("not found")
|
||||||
|
.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn insert(&mut self, layer: Arc<T>) {
|
||||||
|
let present = self.0.insert(layer.layer_desc().key(), layer.clone());
|
||||||
|
if present.is_some() && cfg!(debug_assertions) {
|
||||||
|
panic!("overwriting a layer: {:?}", layer.layer_desc())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn new() -> Self {
|
||||||
|
Self(HashMap::new())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn remove(&mut self, layer: Arc<T>) {
|
||||||
|
let present = self.0.remove(&layer.layer_desc().key());
|
||||||
|
if present.is_none() && cfg!(debug_assertions) {
|
||||||
|
panic!(
|
||||||
|
"removing layer that is not present in layer mapping: {:?}",
|
||||||
|
layer.layer_desc()
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn replace_and_verify(&mut self, expected: Arc<T>, new: Arc<T>) -> Result<()> {
|
||||||
|
let key = expected.layer_desc().key();
|
||||||
|
let other = new.layer_desc().key();
|
||||||
|
|
||||||
|
let expected_l0 = LayerMap::is_l0(expected.layer_desc());
|
||||||
|
let new_l0 = LayerMap::is_l0(new.layer_desc());
|
||||||
|
|
||||||
|
fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
|
||||||
|
"layermap-replace-notfound"
|
||||||
|
));
|
||||||
|
|
||||||
|
anyhow::ensure!(
|
||||||
|
key == other,
|
||||||
|
"expected and new layer have different keys: {key:?} != {other:?}"
|
||||||
|
);
|
||||||
|
|
||||||
|
anyhow::ensure!(
|
||||||
|
expected_l0 == new_l0,
|
||||||
|
"one layer is l0 while the other is not: {expected_l0} != {new_l0}"
|
||||||
|
);
|
||||||
|
|
||||||
|
if let Some(layer) = self.0.get_mut(&key) {
|
||||||
|
anyhow::ensure!(
|
||||||
|
compare_arced_layers(&expected, layer),
|
||||||
|
"another layer was found instead of expected, expected={expected:?}, new={new:?}",
|
||||||
|
expected = Arc::as_ptr(&expected),
|
||||||
|
new = Arc::as_ptr(layer),
|
||||||
|
);
|
||||||
|
*layer = new;
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
anyhow::bail!("layer was not found");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -7,19 +7,14 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
|
|||||||
#[cfg(debug_assertions)]
|
#[cfg(debug_assertions)]
|
||||||
#[track_caller]
|
#[track_caller]
|
||||||
pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
|
pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
|
||||||
static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<2>> =
|
static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<1>> =
|
||||||
once_cell::sync::Lazy::new(|| {
|
once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TimelineId", ["timeline_id"]));
|
||||||
MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
|
|
||||||
});
|
|
||||||
|
|
||||||
let fields: [&dyn Extractor; 2] = [
|
let fields: [&dyn Extractor; 2] = [
|
||||||
&*crate::tenant::span::TENANT_ID_EXTRACTOR,
|
&*crate::tenant::span::TENANT_ID_EXTRACTOR,
|
||||||
&*TIMELINE_ID_EXTRACTOR,
|
&*TIMELINE_ID_EXTRACTOR,
|
||||||
];
|
];
|
||||||
if let Err(missing) = check_fields_present(fields) {
|
if let Err(missing) = check_fields_present!(fields) {
|
||||||
panic!(
|
panic!("missing extractors: {missing:?}")
|
||||||
"missing extractors: {:?}",
|
|
||||||
missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -132,7 +132,7 @@ impl<'t> UninitializedTimeline<'t> {
|
|||||||
impl Drop for UninitializedTimeline<'_> {
|
impl Drop for UninitializedTimeline<'_> {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
if let Some((_, uninit_mark)) = self.raw_timeline.take() {
|
if let Some((_, uninit_mark)) = self.raw_timeline.take() {
|
||||||
let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered();
|
let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_id, timeline_id = %self.timeline_id).entered();
|
||||||
error!("Timeline got dropped without initializing, cleaning its files");
|
error!("Timeline got dropped without initializing, cleaning its files");
|
||||||
cleanup_timeline_directory(uninit_mark);
|
cleanup_timeline_directory(uninit_mark);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
//! Current connection state is tracked too, to ensure it's not getting stale.
|
//! Current connection state is tracked too, to ensure it's not getting stale.
|
||||||
//!
|
//!
|
||||||
//! After every connection or storage broker update fetched, the state gets updated correspondingly and rechecked for the new connection leader,
|
//! After every connection or storage broker update fetched, the state gets updated correspondingly and rechecked for the new connection leader,
|
||||||
//! then a [re]connection happens, if necessary.
|
//! then a (re)connection happens, if necessary.
|
||||||
//! Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel.
|
//! Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel.
|
||||||
|
|
||||||
use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration};
|
use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration};
|
||||||
@@ -266,7 +266,7 @@ pub struct ConnectionManagerStatus {
|
|||||||
impl ConnectionManagerStatus {
|
impl ConnectionManagerStatus {
|
||||||
/// Generates a string, describing current connection status in a form, suitable for logging.
|
/// Generates a string, describing current connection status in a form, suitable for logging.
|
||||||
pub fn to_human_readable_string(&self) -> String {
|
pub fn to_human_readable_string(&self) -> String {
|
||||||
let mut resulting_string = "WalReceiver status".to_string();
|
let mut resulting_string = String::new();
|
||||||
match &self.existing_connection {
|
match &self.existing_connection {
|
||||||
Some(connection) => {
|
Some(connection) => {
|
||||||
if connection.has_processed_wal {
|
if connection.has_processed_wal {
|
||||||
|
|||||||
@@ -360,6 +360,7 @@ impl XlXactParsedRecord {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
let mut xnodes = Vec::<RelFileNode>::new();
|
let mut xnodes = Vec::<RelFileNode>::new();
|
||||||
|
// In v16 this XACT_XINFO_HAS_RELFILENODES is renamed to XACT_XINFO_HAS_RELFILELOCATORS
|
||||||
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
|
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
|
||||||
let nrels = buf.get_i32_le();
|
let nrels = buf.get_i32_le();
|
||||||
for _i in 0..nrels {
|
for _i in 0..nrels {
|
||||||
|
|||||||
@@ -175,8 +175,8 @@ impl WalRedoManager for PostgresRedoManager {
|
|||||||
let mut img = base_img.map(|p| p.1);
|
let mut img = base_img.map(|p| p.1);
|
||||||
let mut batch_neon = can_apply_in_neon(&records[0].1);
|
let mut batch_neon = can_apply_in_neon(&records[0].1);
|
||||||
let mut batch_start = 0;
|
let mut batch_start = 0;
|
||||||
for i in 1..records.len() {
|
for (i, record) in records.iter().enumerate().skip(1) {
|
||||||
let rec_neon = can_apply_in_neon(&records[i].1);
|
let rec_neon = can_apply_in_neon(&record.1);
|
||||||
|
|
||||||
if rec_neon != batch_neon {
|
if rec_neon != batch_neon {
|
||||||
let result = if batch_neon {
|
let result = if batch_neon {
|
||||||
@@ -685,7 +685,7 @@ impl PostgresRedoManager {
|
|||||||
// as close-on-exec by default, but that's not enough, since we use
|
// as close-on-exec by default, but that's not enough, since we use
|
||||||
// libraries that directly call libc open without setting that flag.
|
// libraries that directly call libc open without setting that flag.
|
||||||
.close_fds()
|
.close_fds()
|
||||||
.spawn_no_leak_child()
|
.spawn_no_leak_child(self.tenant_id)
|
||||||
.map_err(|e| {
|
.map_err(|e| {
|
||||||
Error::new(
|
Error::new(
|
||||||
e.kind(),
|
e.kind(),
|
||||||
@@ -989,6 +989,7 @@ impl PostgresRedoManager {
|
|||||||
/// Wrapper type around `std::process::Child` which guarantees that the child
|
/// Wrapper type around `std::process::Child` which guarantees that the child
|
||||||
/// will be killed and waited-for by this process before being dropped.
|
/// will be killed and waited-for by this process before being dropped.
|
||||||
struct NoLeakChild {
|
struct NoLeakChild {
|
||||||
|
tenant_id: TenantId,
|
||||||
child: Option<Child>,
|
child: Option<Child>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1007,9 +1008,12 @@ impl DerefMut for NoLeakChild {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl NoLeakChild {
|
impl NoLeakChild {
|
||||||
fn spawn(command: &mut Command) -> io::Result<Self> {
|
fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result<Self> {
|
||||||
let child = command.spawn()?;
|
let child = command.spawn()?;
|
||||||
Ok(NoLeakChild { child: Some(child) })
|
Ok(NoLeakChild {
|
||||||
|
tenant_id,
|
||||||
|
child: Some(child),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn kill_and_wait(mut self) {
|
fn kill_and_wait(mut self) {
|
||||||
@@ -1056,11 +1060,16 @@ impl Drop for NoLeakChild {
|
|||||||
Some(child) => child,
|
Some(child) => child,
|
||||||
None => return,
|
None => return,
|
||||||
};
|
};
|
||||||
|
let tenant_id = self.tenant_id;
|
||||||
// Offload the kill+wait of the child process into the background.
|
// Offload the kill+wait of the child process into the background.
|
||||||
// If someone stops the runtime, we'll leak the child process.
|
// If someone stops the runtime, we'll leak the child process.
|
||||||
// We can ignore that case because we only stop the runtime on pageserver exit.
|
// We can ignore that case because we only stop the runtime on pageserver exit.
|
||||||
BACKGROUND_RUNTIME.spawn(async move {
|
BACKGROUND_RUNTIME.spawn(async move {
|
||||||
tokio::task::spawn_blocking(move || {
|
tokio::task::spawn_blocking(move || {
|
||||||
|
// Intentionally don't inherit the tracing context from whoever is dropping us.
|
||||||
|
// This thread here is going to outlive our dropper.
|
||||||
|
let span = tracing::info_span!("walredo", %tenant_id);
|
||||||
|
let _entered = span.enter();
|
||||||
Self::kill_and_wait_impl(child);
|
Self::kill_and_wait_impl(child);
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
@@ -1069,12 +1078,12 @@ impl Drop for NoLeakChild {
|
|||||||
}
|
}
|
||||||
|
|
||||||
trait NoLeakChildCommandExt {
|
trait NoLeakChildCommandExt {
|
||||||
fn spawn_no_leak_child(&mut self) -> io::Result<NoLeakChild>;
|
fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild>;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl NoLeakChildCommandExt for Command {
|
impl NoLeakChildCommandExt for Command {
|
||||||
fn spawn_no_leak_child(&mut self) -> io::Result<NoLeakChild> {
|
fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild> {
|
||||||
NoLeakChild::spawn(self)
|
NoLeakChild::spawn(tenant_id, self)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -149,7 +149,7 @@ hnsw_check_available_memory(Size requested)
|
|||||||
struct sysinfo si;
|
struct sysinfo si;
|
||||||
Size total;
|
Size total;
|
||||||
if (sysinfo(&si) < 0)
|
if (sysinfo(&si) < 0)
|
||||||
elog(ERROR, "Failed to get amount of RAM: %m");
|
elog(ERROR, "Failed to get amount of RAM: %n");
|
||||||
|
|
||||||
total = si.totalram*si.mem_unit;
|
total = si.totalram*si.mem_unit;
|
||||||
if ((Size)NBuffers*BLCKSZ + requested >= total)
|
if ((Size)NBuffers*BLCKSZ + requested >= total)
|
||||||
@@ -580,7 +580,6 @@ l2_distance(PG_FUNCTION_ARGS)
|
|||||||
errmsg("different array dimensions %d and %d", a_dim, b_dim)));
|
errmsg("different array dimensions %d and %d", a_dim, b_dim)));
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma clang loop vectorize(enable)
|
|
||||||
for (int i = 0; i < a_dim; i++)
|
for (int i = 0; i < a_dim; i++)
|
||||||
{
|
{
|
||||||
diff = ax[i] - bx[i];
|
diff = ax[i] - bx[i];
|
||||||
|
|||||||
@@ -223,7 +223,6 @@ dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n)
|
|||||||
{
|
{
|
||||||
dist_t distance = 0.0;
|
dist_t distance = 0.0;
|
||||||
|
|
||||||
#pragma clang loop vectorize(enable)
|
|
||||||
for (size_t i = 0; i < n; i++)
|
for (size_t i = 0; i < n; i++)
|
||||||
{
|
{
|
||||||
dist_t diff = x[i] - y[i];
|
dist_t diff = x[i] - y[i];
|
||||||
|
|||||||
@@ -25,7 +25,11 @@
|
|||||||
#include "pagestore_client.h"
|
#include "pagestore_client.h"
|
||||||
#include "access/parallel.h"
|
#include "access/parallel.h"
|
||||||
#include "postmaster/bgworker.h"
|
#include "postmaster/bgworker.h"
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
#include "storage/relfilelocator.h"
|
||||||
|
#else
|
||||||
#include "storage/relfilenode.h"
|
#include "storage/relfilenode.h"
|
||||||
|
#endif
|
||||||
#include "storage/buf_internals.h"
|
#include "storage/buf_internals.h"
|
||||||
#include "storage/latch.h"
|
#include "storage/latch.h"
|
||||||
#include "storage/ipc.h"
|
#include "storage/ipc.h"
|
||||||
@@ -39,6 +43,7 @@
|
|||||||
#include "postmaster/bgworker.h"
|
#include "postmaster/bgworker.h"
|
||||||
#include "postmaster/interrupt.h"
|
#include "postmaster/interrupt.h"
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Local file cache is used to temporarily store relation pages in the local file system.
|
* Local file cache is used to temporarily store relation pages in the local file system.
|
||||||
* All blocks of all relations are stored inside one file and addressed using shared hash map.
|
* All blocks of all relations are stored inside one file and addressed using shared hash map.
|
||||||
@@ -360,9 +365,12 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
|
|||||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
tag.rnode = rnode;
|
#if PG_VERSION_NUM >= 160000
|
||||||
tag.forkNum = forkNum;
|
InitBufferTag(&tag, &rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
|
#else
|
||||||
|
INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||||
|
#endif
|
||||||
|
|
||||||
hash = get_hash_value(lfc_hash, &tag);
|
hash = get_hash_value(lfc_hash, &tag);
|
||||||
|
|
||||||
LWLockAcquire(lfc_lock, LW_SHARED);
|
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||||
@@ -387,7 +395,11 @@ lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
|
|||||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
InitBufferTag(&tag, &rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||||
|
#else
|
||||||
INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||||
|
#endif
|
||||||
|
|
||||||
hash = get_hash_value(lfc_hash, &tag);
|
hash = get_hash_value(lfc_hash, &tag);
|
||||||
|
|
||||||
@@ -457,10 +469,12 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
|
|
||||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||||
return false;
|
return false;
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
InitBufferTag(&tag, &rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||||
|
#else
|
||||||
|
INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||||
|
#endif
|
||||||
|
|
||||||
tag.rnode = rnode;
|
|
||||||
tag.forkNum = forkNum;
|
|
||||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
|
|
||||||
hash = get_hash_value(lfc_hash, &tag);
|
hash = get_hash_value(lfc_hash, &tag);
|
||||||
|
|
||||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||||
@@ -526,9 +540,12 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||||
return;
|
return;
|
||||||
|
|
||||||
tag.rnode = rnode;
|
#if PG_VERSION_NUM >= 160000
|
||||||
tag.forkNum = forkNum;
|
InitBufferTag(&tag, &rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
|
#else
|
||||||
|
INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||||
|
#endif
|
||||||
|
|
||||||
hash = get_hash_value(lfc_hash, &tag);
|
hash = get_hash_value(lfc_hash, &tag);
|
||||||
|
|
||||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||||
@@ -722,9 +739,16 @@ local_cache_pages(PG_FUNCTION_ARGS)
|
|||||||
if (entry->bitmap[i >> 5] & (1 << (i & 31)))
|
if (entry->bitmap[i >> 5] & (1 << (i & 31)))
|
||||||
{
|
{
|
||||||
fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
|
fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
fctx->record[n_pages].relfilenode = entry->key.relNumber;
|
||||||
|
fctx->record[n_pages].reltablespace = entry->key.spcOid;
|
||||||
|
fctx->record[n_pages].reldatabase = entry->key.dbOid;
|
||||||
|
#else
|
||||||
fctx->record[n_pages].relfilenode = entry->key.rnode.relNode;
|
fctx->record[n_pages].relfilenode = entry->key.rnode.relNode;
|
||||||
fctx->record[n_pages].reltablespace = entry->key.rnode.spcNode;
|
fctx->record[n_pages].reltablespace = entry->key.rnode.spcNode;
|
||||||
fctx->record[n_pages].reldatabase = entry->key.rnode.dbNode;
|
fctx->record[n_pages].reldatabase = entry->key.rnode.dbNode;
|
||||||
|
#endif
|
||||||
fctx->record[n_pages].forknum = entry->key.forkNum;
|
fctx->record[n_pages].forknum = entry->key.forkNum;
|
||||||
fctx->record[n_pages].blocknum = entry->key.blockNum + i;
|
fctx->record[n_pages].blocknum = entry->key.blockNum + i;
|
||||||
fctx->record[n_pages].accesscount = entry->access_count;
|
fctx->record[n_pages].accesscount = entry->access_count;
|
||||||
|
|||||||
@@ -16,7 +16,11 @@
|
|||||||
#include "postgres.h"
|
#include "postgres.h"
|
||||||
|
|
||||||
#include "access/xlogdefs.h"
|
#include "access/xlogdefs.h"
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
#include "storage/relfilelocator.h"
|
||||||
|
#else
|
||||||
#include "storage/relfilenode.h"
|
#include "storage/relfilenode.h"
|
||||||
|
#endif
|
||||||
#include "storage/block.h"
|
#include "storage/block.h"
|
||||||
#include "storage/smgr.h"
|
#include "storage/smgr.h"
|
||||||
#include "lib/stringinfo.h"
|
#include "lib/stringinfo.h"
|
||||||
@@ -25,6 +29,34 @@
|
|||||||
|
|
||||||
#include "pg_config.h"
|
#include "pg_config.h"
|
||||||
|
|
||||||
|
// This is a hack to avoid too many ifdefs in the function definitions.
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
typedef RelFileLocator RelFileNode;
|
||||||
|
typedef RelFileLocatorBackend RelFileNodeBackend;
|
||||||
|
#define RelFileNodeBackendIsTemp RelFileLocatorBackendIsTemp
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
#define RelnGetRnode(reln) (reln->smgr_rlocator.locator)
|
||||||
|
#define RnodeGetSpcOid(rnode) (rnode.spcOid)
|
||||||
|
#define RnodeGetDbOid(rnode) (rnode.dbOid)
|
||||||
|
#define RnodeGetRelNumber(rnode) (rnode.relNumber)
|
||||||
|
|
||||||
|
#define BufTagGetRnode(tag) (BufTagGetRelFileLocator(&tag))
|
||||||
|
#else
|
||||||
|
#define RelnGetRnode(reln) (reln->smgr_rnode.node)
|
||||||
|
#define RnodeGetSpcOid(rnode) (rnode.spcNode)
|
||||||
|
#define RnodeGetDbOid(rnode) (rnode.dbNode)
|
||||||
|
#define RnodeGetRelNumber(rnode) (rnode.relNode)
|
||||||
|
|
||||||
|
#define BufTagGetRnode(tag) (tag.rnode)
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Tablespace OID of the relation behind `reln` (must use the SpcOid accessor, not RelNumber). */
#define RelnGetSpcOid(reln) (RnodeGetSpcOid(RelnGetRnode(reln)))
|
||||||
|
#define RelnGetDbOid(reln) (RnodeGetDbOid(RelnGetRnode(reln)))
|
||||||
|
#define RelnGetRelNumber(reln) (RnodeGetRelNumber(RelnGetRnode(reln)))
|
||||||
|
|
||||||
typedef enum
|
typedef enum
|
||||||
{
|
{
|
||||||
/* pagestore_client -> pagestore */
|
/* pagestore_client -> pagestore */
|
||||||
@@ -85,7 +117,7 @@ typedef struct
|
|||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
NeonRequest req;
|
NeonRequest req;
|
||||||
Oid dbNode;
|
Oid dbOid;
|
||||||
} NeonDbSizeRequest;
|
} NeonDbSizeRequest;
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
|
|||||||
@@ -58,7 +58,11 @@
|
|||||||
#include "postmaster/autovacuum.h"
|
#include "postmaster/autovacuum.h"
|
||||||
#include "replication/walsender.h"
|
#include "replication/walsender.h"
|
||||||
#include "storage/bufmgr.h"
|
#include "storage/bufmgr.h"
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
#include "storage/relfilelocator.h"
|
||||||
|
#else
|
||||||
#include "storage/relfilenode.h"
|
#include "storage/relfilenode.h"
|
||||||
|
#endif
|
||||||
#include "storage/buf_internals.h"
|
#include "storage/buf_internals.h"
|
||||||
#include "storage/smgr.h"
|
#include "storage/smgr.h"
|
||||||
#include "storage/md.h"
|
#include "storage/md.h"
|
||||||
@@ -70,6 +74,8 @@
|
|||||||
#include "access/xlogrecovery.h"
|
#include "access/xlogrecovery.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API
|
* If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API
|
||||||
* calls to md.c, and *also* do the calls to the Page Server. On every
|
* calls to md.c, and *also* do the calls to the Page Server. On every
|
||||||
@@ -86,7 +92,10 @@
|
|||||||
static char *hexdump_page(char *page);
|
static char *hexdump_page(char *page);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId)
|
|
||||||
|
#define IS_LOCAL_REL(reln) (RelnGetDbOid(reln) != 0 && RelnGetRelNumber(reln) > FirstNormalObjectId)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
const int SmgrTrace = DEBUG5;
|
const int SmgrTrace = DEBUG5;
|
||||||
|
|
||||||
@@ -184,7 +193,13 @@ typedef struct PrfHashEntry {
|
|||||||
sizeof(BufferTag) \
|
sizeof(BufferTag) \
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
#define SH_EQUAL(tb, a, b) (BufferTagsEqual(&((a)->buftag),&((b)->buftag)))
|
||||||
|
#else
|
||||||
#define SH_EQUAL(tb, a, b) (BUFFERTAGS_EQUAL((a)->buftag, (b)->buftag))
|
#define SH_EQUAL(tb, a, b) (BUFFERTAGS_EQUAL((a)->buftag, (b)->buftag))
|
||||||
|
#endif
|
||||||
|
|
||||||
#define SH_SCOPE static inline
|
#define SH_SCOPE static inline
|
||||||
#define SH_DEFINE
|
#define SH_DEFINE
|
||||||
#define SH_DECLARE
|
#define SH_DECLARE
|
||||||
@@ -634,7 +649,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
|
|||||||
.req.tag = T_NeonGetPageRequest,
|
.req.tag = T_NeonGetPageRequest,
|
||||||
.req.latest = false,
|
.req.latest = false,
|
||||||
.req.lsn = 0,
|
.req.lsn = 0,
|
||||||
.rnode = slot->buftag.rnode,
|
.rnode = BufTagGetRnode(slot->buftag),
|
||||||
.forknum = slot->buftag.forkNum,
|
.forknum = slot->buftag.forkNum,
|
||||||
.blkno = slot->buftag.blockNum,
|
.blkno = slot->buftag.blockNum,
|
||||||
};
|
};
|
||||||
@@ -649,7 +664,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
|
|||||||
{
|
{
|
||||||
XLogRecPtr lsn = neon_get_request_lsn(
|
XLogRecPtr lsn = neon_get_request_lsn(
|
||||||
&request.req.latest,
|
&request.req.latest,
|
||||||
slot->buftag.rnode,
|
BufTagGetRnode(slot->buftag),
|
||||||
slot->buftag.forkNum,
|
slot->buftag.forkNum,
|
||||||
slot->buftag.blockNum
|
slot->buftag.blockNum
|
||||||
);
|
);
|
||||||
@@ -729,8 +744,11 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
|
|||||||
Assert(slot->status != PRFS_UNUSED);
|
Assert(slot->status != PRFS_UNUSED);
|
||||||
Assert(MyPState->ring_last <= ring_index &&
|
Assert(MyPState->ring_last <= ring_index &&
|
||||||
ring_index < MyPState->ring_unused);
|
ring_index < MyPState->ring_unused);
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
Assert(BufferTagsEqual(&slot->buftag, &tag));
|
||||||
|
#else
|
||||||
Assert(BUFFERTAGS_EQUAL(slot->buftag, tag));
|
Assert(BUFFERTAGS_EQUAL(slot->buftag, tag));
|
||||||
|
#endif
|
||||||
/*
|
/*
|
||||||
* If we want a specific lsn, we do not accept requests that were made
|
* If we want a specific lsn, we do not accept requests that were made
|
||||||
* with a potentially different LSN.
|
* with a potentially different LSN.
|
||||||
@@ -893,9 +911,9 @@ nm_pack_request(NeonRequest * msg)
|
|||||||
|
|
||||||
pq_sendbyte(&s, msg_req->req.latest);
|
pq_sendbyte(&s, msg_req->req.latest);
|
||||||
pq_sendint64(&s, msg_req->req.lsn);
|
pq_sendint64(&s, msg_req->req.lsn);
|
||||||
pq_sendint32(&s, msg_req->rnode.spcNode);
|
pq_sendint32(&s, RnodeGetSpcOid(msg_req->rnode));
|
||||||
pq_sendint32(&s, msg_req->rnode.dbNode);
|
pq_sendint32(&s, RnodeGetDbOid(msg_req->rnode));
|
||||||
pq_sendint32(&s, msg_req->rnode.relNode);
|
pq_sendint32(&s, RnodeGetRelNumber(msg_req->rnode));
|
||||||
pq_sendbyte(&s, msg_req->forknum);
|
pq_sendbyte(&s, msg_req->forknum);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
@@ -906,9 +924,9 @@ nm_pack_request(NeonRequest * msg)
|
|||||||
|
|
||||||
pq_sendbyte(&s, msg_req->req.latest);
|
pq_sendbyte(&s, msg_req->req.latest);
|
||||||
pq_sendint64(&s, msg_req->req.lsn);
|
pq_sendint64(&s, msg_req->req.lsn);
|
||||||
pq_sendint32(&s, msg_req->rnode.spcNode);
|
pq_sendint32(&s, RnodeGetSpcOid(msg_req->rnode));
|
||||||
pq_sendint32(&s, msg_req->rnode.dbNode);
|
pq_sendint32(&s, RnodeGetDbOid(msg_req->rnode));
|
||||||
pq_sendint32(&s, msg_req->rnode.relNode);
|
pq_sendint32(&s, RnodeGetRelNumber(msg_req->rnode));
|
||||||
pq_sendbyte(&s, msg_req->forknum);
|
pq_sendbyte(&s, msg_req->forknum);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
@@ -919,7 +937,7 @@ nm_pack_request(NeonRequest * msg)
|
|||||||
|
|
||||||
pq_sendbyte(&s, msg_req->req.latest);
|
pq_sendbyte(&s, msg_req->req.latest);
|
||||||
pq_sendint64(&s, msg_req->req.lsn);
|
pq_sendint64(&s, msg_req->req.lsn);
|
||||||
pq_sendint32(&s, msg_req->dbNode);
|
pq_sendint32(&s, msg_req->dbOid);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -929,9 +947,9 @@ nm_pack_request(NeonRequest * msg)
|
|||||||
|
|
||||||
pq_sendbyte(&s, msg_req->req.latest);
|
pq_sendbyte(&s, msg_req->req.latest);
|
||||||
pq_sendint64(&s, msg_req->req.lsn);
|
pq_sendint64(&s, msg_req->req.lsn);
|
||||||
pq_sendint32(&s, msg_req->rnode.spcNode);
|
pq_sendint32(&s, RnodeGetSpcOid(msg_req->rnode));
|
||||||
pq_sendint32(&s, msg_req->rnode.dbNode);
|
pq_sendint32(&s, RnodeGetDbOid(msg_req->rnode));
|
||||||
pq_sendint32(&s, msg_req->rnode.relNode);
|
pq_sendint32(&s, RnodeGetRelNumber(msg_req->rnode));
|
||||||
pq_sendbyte(&s, msg_req->forknum);
|
pq_sendbyte(&s, msg_req->forknum);
|
||||||
pq_sendint32(&s, msg_req->blkno);
|
pq_sendint32(&s, msg_req->blkno);
|
||||||
|
|
||||||
@@ -1064,9 +1082,9 @@ nm_to_string(NeonMessage * msg)
|
|||||||
|
|
||||||
appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\"");
|
appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\"");
|
||||||
appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"",
|
appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"",
|
||||||
msg_req->rnode.spcNode,
|
RnodeGetSpcOid(msg_req->rnode),
|
||||||
msg_req->rnode.dbNode,
|
RnodeGetDbOid(msg_req->rnode),
|
||||||
msg_req->rnode.relNode);
|
RnodeGetRelNumber(msg_req->rnode));
|
||||||
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
|
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
|
||||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||||
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
|
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
|
||||||
@@ -1080,9 +1098,9 @@ nm_to_string(NeonMessage * msg)
|
|||||||
|
|
||||||
appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\"");
|
appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\"");
|
||||||
appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"",
|
appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"",
|
||||||
msg_req->rnode.spcNode,
|
RnodeGetSpcOid(msg_req->rnode),
|
||||||
msg_req->rnode.dbNode,
|
RnodeGetDbOid(msg_req->rnode),
|
||||||
msg_req->rnode.relNode);
|
RnodeGetRelNumber(msg_req->rnode));
|
||||||
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
|
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
|
||||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||||
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
|
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
|
||||||
@@ -1096,9 +1114,9 @@ nm_to_string(NeonMessage * msg)
|
|||||||
|
|
||||||
appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\"");
|
appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\"");
|
||||||
appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"",
|
appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"",
|
||||||
msg_req->rnode.spcNode,
|
RnodeGetSpcOid(msg_req->rnode),
|
||||||
msg_req->rnode.dbNode,
|
RnodeGetDbOid(msg_req->rnode),
|
||||||
msg_req->rnode.relNode);
|
RnodeGetRelNumber(msg_req->rnode));
|
||||||
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
|
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
|
||||||
appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno);
|
appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno);
|
||||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||||
@@ -1111,7 +1129,7 @@ nm_to_string(NeonMessage * msg)
|
|||||||
NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg;
|
NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg;
|
||||||
|
|
||||||
appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\"");
|
appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\"");
|
||||||
appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode);
|
appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbOid);
|
||||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||||
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
|
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
|
||||||
appendStringInfoChar(&s, '}');
|
appendStringInfoChar(&s, '}');
|
||||||
@@ -1213,6 +1231,7 @@ static void
|
|||||||
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
|
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
|
||||||
{
|
{
|
||||||
XLogRecPtr lsn = PageGetLSN(buffer);
|
XLogRecPtr lsn = PageGetLSN(buffer);
|
||||||
|
RelFileNode rnode = RelnGetRnode(reln);
|
||||||
|
|
||||||
if (ShutdownRequestPending)
|
if (ShutdownRequestPending)
|
||||||
return;
|
return;
|
||||||
@@ -1232,15 +1251,16 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
|||||||
/* FSM is never WAL-logged and we don't care. */
|
/* FSM is never WAL-logged and we don't care. */
|
||||||
XLogRecPtr recptr;
|
XLogRecPtr recptr;
|
||||||
|
|
||||||
recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false);
|
|
||||||
|
recptr = log_newpage_copy(&rnode, forknum, blocknum, buffer, false);
|
||||||
XLogFlush(recptr);
|
XLogFlush(recptr);
|
||||||
lsn = recptr;
|
lsn = recptr;
|
||||||
ereport(SmgrTrace,
|
ereport(SmgrTrace,
|
||||||
(errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
|
(errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
|
||||||
blocknum,
|
blocknum,
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
RelnGetRelNumber(reln),
|
||||||
forknum, LSN_FORMAT_ARGS(lsn))));
|
forknum, LSN_FORMAT_ARGS(lsn))));
|
||||||
}
|
}
|
||||||
else if (lsn == InvalidXLogRecPtr)
|
else if (lsn == InvalidXLogRecPtr)
|
||||||
@@ -1268,9 +1288,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
|||||||
ereport(SmgrTrace,
|
ereport(SmgrTrace,
|
||||||
(errmsg("Page %u of relation %u/%u/%u.%u is all-zeros",
|
(errmsg("Page %u of relation %u/%u/%u.%u is all-zeros",
|
||||||
blocknum,
|
blocknum,
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
RelnGetRelNumber(reln),
|
||||||
forknum)));
|
forknum)));
|
||||||
}
|
}
|
||||||
else if (PageIsEmptyHeapPage(buffer))
|
else if (PageIsEmptyHeapPage(buffer))
|
||||||
@@ -1278,9 +1298,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
|||||||
ereport(SmgrTrace,
|
ereport(SmgrTrace,
|
||||||
(errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
|
(errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
|
||||||
blocknum,
|
blocknum,
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
RelnGetRelNumber(reln),
|
||||||
forknum)));
|
forknum)));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -1288,9 +1308,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
|||||||
ereport(PANIC,
|
ereport(PANIC,
|
||||||
(errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
|
(errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
|
||||||
blocknum,
|
blocknum,
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
RelnGetRelNumber(reln),
|
||||||
forknum)));
|
forknum)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1299,9 +1319,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
|||||||
ereport(SmgrTrace,
|
ereport(SmgrTrace,
|
||||||
(errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
|
(errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
|
||||||
blocknum,
|
blocknum,
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
RelnGetRelNumber(reln),
|
||||||
forknum, LSN_FORMAT_ARGS(lsn))));
|
forknum, LSN_FORMAT_ARGS(lsn))));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1309,7 +1329,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
|||||||
* Remember the LSN on this page. When we read the page again, we must
|
* Remember the LSN on this page. When we read the page again, we must
|
||||||
* read the same or newer version of it.
|
* read the same or newer version of it.
|
||||||
*/
|
*/
|
||||||
SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forknum, blocknum);
|
SetLastWrittenLSNForBlock(lsn, rnode, forknum, blocknum);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -1459,6 +1479,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
|||||||
BlockNumber n_blocks;
|
BlockNumber n_blocks;
|
||||||
bool latest;
|
bool latest;
|
||||||
XLogRecPtr request_lsn;
|
XLogRecPtr request_lsn;
|
||||||
|
RelFileNode rnode = RelnGetRnode(reln);
|
||||||
|
|
||||||
switch (reln->smgr_relpersistence)
|
switch (reln->smgr_relpersistence)
|
||||||
{
|
{
|
||||||
@@ -1485,7 +1506,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
|||||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks))
|
if (get_cached_relsize(RelnGetRnode(reln), forkNum, &n_blocks))
|
||||||
{
|
{
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -1500,20 +1521,20 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
|||||||
*
|
*
|
||||||
* For now, handle that special case here.
|
* For now, handle that special case here.
|
||||||
*/
|
*/
|
||||||
if (reln->smgr_rnode.node.spcNode == 0 &&
|
if (RelnGetSpcOid(reln) == 0 &&
|
||||||
reln->smgr_rnode.node.dbNode == 0 &&
|
RelnGetDbOid(reln) == 0 &&
|
||||||
reln->smgr_rnode.node.relNode == 0)
|
RelnGetRelNumber(reln) == 0)
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, REL_METADATA_PSEUDO_BLOCKNO);
|
request_lsn = neon_get_request_lsn(&latest, rnode, forkNum, REL_METADATA_PSEUDO_BLOCKNO);
|
||||||
{
|
{
|
||||||
NeonExistsRequest request = {
|
NeonExistsRequest request = {
|
||||||
.req.tag = T_NeonExistsRequest,
|
.req.tag = T_NeonExistsRequest,
|
||||||
.req.latest = latest,
|
.req.latest = latest,
|
||||||
.req.lsn = request_lsn,
|
.req.lsn = request_lsn,
|
||||||
.rnode = reln->smgr_rnode.node,
|
.rnode = rnode,
|
||||||
.forknum = forkNum};
|
.forknum = forkNum};
|
||||||
|
|
||||||
resp = page_server_request(&request);
|
resp = page_server_request(&request);
|
||||||
@@ -1529,9 +1550,9 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
|||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_IO_ERROR),
|
(errcode(ERRCODE_IO_ERROR),
|
||||||
errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
RelnGetRelNumber(reln),
|
||||||
forkNum,
|
forkNum,
|
||||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||||
errdetail("page server returned error: %s",
|
errdetail("page server returned error: %s",
|
||||||
@@ -1553,6 +1574,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
|||||||
void
|
void
|
||||||
neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
||||||
{
|
{
|
||||||
|
RelFileNode rnode = RelnGetRnode(reln);
|
||||||
|
|
||||||
switch (reln->smgr_relpersistence)
|
switch (reln->smgr_relpersistence)
|
||||||
{
|
{
|
||||||
case 0:
|
case 0:
|
||||||
@@ -1571,9 +1594,8 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
|||||||
}
|
}
|
||||||
|
|
||||||
elog(SmgrTrace, "Create relation %u/%u/%u.%u",
|
elog(SmgrTrace, "Create relation %u/%u/%u.%u",
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln), RelnGetRelNumber(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
|
||||||
forkNum);
|
forkNum);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -1597,12 +1619,12 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
|||||||
*/
|
*/
|
||||||
if (isRedo)
|
if (isRedo)
|
||||||
{
|
{
|
||||||
update_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
|
update_cached_relsize(rnode, forkNum, 0);
|
||||||
get_cached_relsize(reln->smgr_rnode.node, forkNum,
|
get_cached_relsize(rnode, forkNum,
|
||||||
&reln->smgr_cached_nblocks[forkNum]);
|
&reln->smgr_cached_nblocks[forkNum]);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
set_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
|
set_cached_relsize(rnode, forkNum, 0);
|
||||||
|
|
||||||
#ifdef DEBUG_COMPARE_LOCAL
|
#ifdef DEBUG_COMPARE_LOCAL
|
||||||
if (IS_LOCAL_REL(reln))
|
if (IS_LOCAL_REL(reln))
|
||||||
@@ -1639,7 +1661,12 @@ neon_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
|
|||||||
mdunlink(rnode, forkNum, isRedo);
|
mdunlink(rnode, forkNum, isRedo);
|
||||||
if (!RelFileNodeBackendIsTemp(rnode))
|
if (!RelFileNodeBackendIsTemp(rnode))
|
||||||
{
|
{
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
forget_cached_relsize(rnode.locator, forkNum);
|
||||||
|
#else
|
||||||
forget_cached_relsize(rnode.node, forkNum);
|
forget_cached_relsize(rnode.node, forkNum);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1658,6 +1685,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
{
|
{
|
||||||
XLogRecPtr lsn;
|
XLogRecPtr lsn;
|
||||||
BlockNumber n_blocks = 0;
|
BlockNumber n_blocks = 0;
|
||||||
|
RelFileNode rnode = RelnGetRnode(reln);
|
||||||
|
|
||||||
switch (reln->smgr_relpersistence)
|
switch (reln->smgr_relpersistence)
|
||||||
{
|
{
|
||||||
@@ -1707,17 +1735,16 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);
|
neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);
|
||||||
|
|
||||||
neon_wallog_page(reln, forkNum, blkno, buffer, false);
|
neon_wallog_page(reln, forkNum, blkno, buffer, false);
|
||||||
set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1);
|
set_cached_relsize(rnode, forkNum, blkno + 1);
|
||||||
|
|
||||||
lsn = PageGetLSN(buffer);
|
lsn = PageGetLSN(buffer);
|
||||||
elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln), RelnGetRelNumber(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
|
||||||
forkNum, blkno,
|
forkNum, blkno,
|
||||||
(uint32) (lsn >> 32), (uint32) lsn);
|
(uint32) (lsn >> 32), (uint32) lsn);
|
||||||
|
|
||||||
lfc_write(reln->smgr_rnode.node, forkNum, blkno, buffer);
|
lfc_write(rnode, forkNum, blkno, buffer);
|
||||||
|
|
||||||
#ifdef DEBUG_COMPARE_LOCAL
|
#ifdef DEBUG_COMPARE_LOCAL
|
||||||
if (IS_LOCAL_REL(reln))
|
if (IS_LOCAL_REL(reln))
|
||||||
@@ -1732,9 +1759,9 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
if (lsn == InvalidXLogRecPtr)
|
if (lsn == InvalidXLogRecPtr)
|
||||||
{
|
{
|
||||||
lsn = GetXLogInsertRecPtr();
|
lsn = GetXLogInsertRecPtr();
|
||||||
SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forkNum, blkno);
|
SetLastWrittenLSNForBlock(lsn, rnode, forkNum, blkno);
|
||||||
}
|
}
|
||||||
SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forkNum);
|
SetLastWrittenLSNForRelation(lsn, rnode, forkNum);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -1778,6 +1805,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
|||||||
BufferTag tag;
|
BufferTag tag;
|
||||||
uint64 ring_index PG_USED_FOR_ASSERTS_ONLY;
|
uint64 ring_index PG_USED_FOR_ASSERTS_ONLY;
|
||||||
|
|
||||||
|
RelFileNode rnode = RelnGetRnode(reln);
|
||||||
|
|
||||||
switch (reln->smgr_relpersistence)
|
switch (reln->smgr_relpersistence)
|
||||||
{
|
{
|
||||||
case 0: /* probably shouldn't happen, but ignore it */
|
case 0: /* probably shouldn't happen, but ignore it */
|
||||||
@@ -1792,15 +1821,18 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
|||||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (lfc_cache_contains(reln->smgr_rnode.node, forknum, blocknum))
|
if (lfc_cache_contains(rnode, forknum, blocknum))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
InitBufferTag(&tag, &rnode, forknum, blocknum);
|
||||||
|
#else
|
||||||
tag = (BufferTag) {
|
tag = (BufferTag) {
|
||||||
.rnode = reln->smgr_rnode.node,
|
.rnode = rnode,
|
||||||
.forkNum = forknum,
|
.forkNum = forknum,
|
||||||
.blockNum = blocknum
|
.blockNum = blocknum
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
ring_index = prefetch_register_buffer(tag, NULL, NULL);
|
ring_index = prefetch_register_buffer(tag, NULL, NULL);
|
||||||
|
|
||||||
Assert(ring_index < MyPState->ring_unused &&
|
Assert(ring_index < MyPState->ring_unused &&
|
||||||
@@ -1861,11 +1893,15 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
PrfHashEntry *entry;
|
PrfHashEntry *entry;
|
||||||
PrefetchRequest *slot;
|
PrefetchRequest *slot;
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
InitBufferTag(&buftag, &rnode, forkNum, blkno);
|
||||||
|
#else
|
||||||
buftag = (BufferTag) {
|
buftag = (BufferTag) {
|
||||||
.rnode = rnode,
|
.rnode = rnode,
|
||||||
.forkNum = forkNum,
|
.forkNum = forkNum,
|
||||||
.blockNum = blkno,
|
.blockNum = blkno
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The redo process does not lock pages that it needs to replay but are
|
* The redo process does not lock pages that it needs to replay but are
|
||||||
@@ -1965,9 +2001,9 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
(errcode(ERRCODE_IO_ERROR),
|
(errcode(ERRCODE_IO_ERROR),
|
||||||
errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||||
blkno,
|
blkno,
|
||||||
rnode.spcNode,
|
RnodeGetSpcOid(rnode),
|
||||||
rnode.dbNode,
|
RnodeGetDbOid(rnode),
|
||||||
rnode.relNode,
|
RnodeGetRelNumber(rnode),
|
||||||
forkNum,
|
forkNum,
|
||||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||||
errdetail("page server returned error: %s",
|
errdetail("page server returned error: %s",
|
||||||
@@ -1991,6 +2027,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
{
|
{
|
||||||
bool latest;
|
bool latest;
|
||||||
XLogRecPtr request_lsn;
|
XLogRecPtr request_lsn;
|
||||||
|
RelFileNode rnode = RelnGetRnode(reln);
|
||||||
|
|
||||||
switch (reln->smgr_relpersistence)
|
switch (reln->smgr_relpersistence)
|
||||||
{
|
{
|
||||||
@@ -2010,13 +2047,13 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Try to read from local file cache */
|
/* Try to read from local file cache */
|
||||||
if (lfc_read(reln->smgr_rnode.node, forkNum, blkno, buffer))
|
if (lfc_read(RelnGetRnode(reln), forkNum, blkno, buffer))
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno);
|
request_lsn = neon_get_request_lsn(&latest, rnode, forkNum, blkno);
|
||||||
neon_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer);
|
neon_read_at_lsn(rnode, forkNum, blkno, request_lsn, latest, buffer);
|
||||||
|
|
||||||
#ifdef DEBUG_COMPARE_LOCAL
|
#ifdef DEBUG_COMPARE_LOCAL
|
||||||
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
|
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
|
||||||
@@ -2036,9 +2073,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
{
|
{
|
||||||
elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
|
elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
|
||||||
blkno,
|
blkno,
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
RelnGetRelNumber(reln),
|
||||||
forkNum,
|
forkNum,
|
||||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||||
hexdump_page(buffer));
|
hexdump_page(buffer));
|
||||||
@@ -2048,9 +2085,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
{
|
{
|
||||||
elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
|
elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
|
||||||
blkno,
|
blkno,
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
RelnGetRelNumber(reln),
|
||||||
forkNum,
|
forkNum,
|
||||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||||
hexdump_page(mdbuf));
|
hexdump_page(mdbuf));
|
||||||
@@ -2065,9 +2102,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
{
|
{
|
||||||
elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
|
elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
|
||||||
blkno,
|
blkno,
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
RelnGetRelNumber(reln),
|
||||||
forkNum,
|
forkNum,
|
||||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||||
hexdump_page(mdbuf_masked),
|
hexdump_page(mdbuf_masked),
|
||||||
@@ -2086,9 +2123,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
{
|
{
|
||||||
elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
|
elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
|
||||||
blkno,
|
blkno,
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
RelnGetRelNumber(reln),
|
||||||
forkNum,
|
forkNum,
|
||||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||||
hexdump_page(mdbuf_masked),
|
hexdump_page(mdbuf_masked),
|
||||||
@@ -2133,7 +2170,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
|||||||
char *buffer, bool skipFsync)
|
char *buffer, bool skipFsync)
|
||||||
{
|
{
|
||||||
XLogRecPtr lsn;
|
XLogRecPtr lsn;
|
||||||
|
RelFileNode rnode = RelnGetRnode(reln);
|
||||||
switch (reln->smgr_relpersistence)
|
switch (reln->smgr_relpersistence)
|
||||||
{
|
{
|
||||||
case 0:
|
case 0:
|
||||||
@@ -2170,13 +2207,12 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
|||||||
|
|
||||||
lsn = PageGetLSN(buffer);
|
lsn = PageGetLSN(buffer);
|
||||||
elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln), RelnGetRelNumber(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
|
||||||
forknum, blocknum,
|
forknum, blocknum,
|
||||||
(uint32) (lsn >> 32), (uint32) lsn);
|
(uint32) (lsn >> 32), (uint32) lsn);
|
||||||
|
|
||||||
lfc_write(reln->smgr_rnode.node, forknum, blocknum, buffer);
|
lfc_write(rnode, forknum, blocknum, buffer);
|
||||||
|
|
||||||
#ifdef DEBUG_COMPARE_LOCAL
|
#ifdef DEBUG_COMPARE_LOCAL
|
||||||
if (IS_LOCAL_REL(reln))
|
if (IS_LOCAL_REL(reln))
|
||||||
@@ -2194,6 +2230,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
|||||||
BlockNumber n_blocks;
|
BlockNumber n_blocks;
|
||||||
bool latest;
|
bool latest;
|
||||||
XLogRecPtr request_lsn;
|
XLogRecPtr request_lsn;
|
||||||
|
RelFileNode rnode = RelnGetRnode(reln);
|
||||||
|
|
||||||
switch (reln->smgr_relpersistence)
|
switch (reln->smgr_relpersistence)
|
||||||
{
|
{
|
||||||
@@ -2212,23 +2249,23 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
|||||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks))
|
if (get_cached_relsize(RelnGetRnode(reln), forknum, &n_blocks))
|
||||||
{
|
{
|
||||||
elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
|
elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
RelnGetRelNumber(reln),
|
||||||
forknum, n_blocks);
|
forknum, n_blocks);
|
||||||
return n_blocks;
|
return n_blocks;
|
||||||
}
|
}
|
||||||
|
|
||||||
request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forknum, REL_METADATA_PSEUDO_BLOCKNO);
|
request_lsn = neon_get_request_lsn(&latest, rnode, forknum, REL_METADATA_PSEUDO_BLOCKNO);
|
||||||
{
|
{
|
||||||
NeonNblocksRequest request = {
|
NeonNblocksRequest request = {
|
||||||
.req.tag = T_NeonNblocksRequest,
|
.req.tag = T_NeonNblocksRequest,
|
||||||
.req.latest = latest,
|
.req.latest = latest,
|
||||||
.req.lsn = request_lsn,
|
.req.lsn = request_lsn,
|
||||||
.rnode = reln->smgr_rnode.node,
|
.rnode = rnode,
|
||||||
.forknum = forknum,
|
.forknum = forknum,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -2245,9 +2282,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
|||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_IO_ERROR),
|
(errcode(ERRCODE_IO_ERROR),
|
||||||
errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
RelnGetRelNumber(reln),
|
||||||
forknum,
|
forknum,
|
||||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||||
errdetail("page server returned error: %s",
|
errdetail("page server returned error: %s",
|
||||||
@@ -2257,12 +2294,11 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
|||||||
default:
|
default:
|
||||||
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||||
}
|
}
|
||||||
update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks);
|
update_cached_relsize(rnode, forknum, n_blocks);
|
||||||
|
|
||||||
elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
|
elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln), RelnGetRelNumber(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
|
||||||
forknum,
|
forknum,
|
||||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||||
n_blocks);
|
n_blocks);
|
||||||
@@ -2275,7 +2311,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
|||||||
* neon_db_size() -- Get the size of the database in bytes.
|
* neon_db_size() -- Get the size of the database in bytes.
|
||||||
*/
|
*/
|
||||||
int64
|
int64
|
||||||
neon_dbsize(Oid dbNode)
|
neon_dbsize(Oid dbOid)
|
||||||
{
|
{
|
||||||
NeonResponse *resp;
|
NeonResponse *resp;
|
||||||
int64 db_size;
|
int64 db_size;
|
||||||
@@ -2289,7 +2325,7 @@ neon_dbsize(Oid dbNode)
|
|||||||
.req.tag = T_NeonDbSizeRequest,
|
.req.tag = T_NeonDbSizeRequest,
|
||||||
.req.latest = latest,
|
.req.latest = latest,
|
||||||
.req.lsn = request_lsn,
|
.req.lsn = request_lsn,
|
||||||
.dbNode = dbNode,
|
.dbOid = dbOid,
|
||||||
};
|
};
|
||||||
|
|
||||||
resp = page_server_request(&request);
|
resp = page_server_request(&request);
|
||||||
@@ -2305,7 +2341,7 @@ neon_dbsize(Oid dbNode)
|
|||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_IO_ERROR),
|
(errcode(ERRCODE_IO_ERROR),
|
||||||
errmsg("could not read db size of db %u from page server at lsn %X/%08X",
|
errmsg("could not read db size of db %u from page server at lsn %X/%08X",
|
||||||
dbNode,
|
dbOid,
|
||||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||||
errdetail("page server returned error: %s",
|
errdetail("page server returned error: %s",
|
||||||
((NeonErrorResponse *) resp)->message)));
|
((NeonErrorResponse *) resp)->message)));
|
||||||
@@ -2316,7 +2352,7 @@ neon_dbsize(Oid dbNode)
|
|||||||
}
|
}
|
||||||
|
|
||||||
elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
||||||
dbNode,
|
dbOid,
|
||||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||||
db_size);
|
db_size);
|
||||||
|
|
||||||
@@ -2331,6 +2367,7 @@ void
|
|||||||
neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||||
{
|
{
|
||||||
XLogRecPtr lsn;
|
XLogRecPtr lsn;
|
||||||
|
RelFileNode rnode = RelnGetRnode(reln);
|
||||||
|
|
||||||
switch (reln->smgr_relpersistence)
|
switch (reln->smgr_relpersistence)
|
||||||
{
|
{
|
||||||
@@ -2350,7 +2387,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
|||||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||||
}
|
}
|
||||||
|
|
||||||
set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks);
|
set_cached_relsize(rnode, forknum, nblocks);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Truncating a relation drops all its buffers from the buffer cache
|
* Truncating a relation drops all its buffers from the buffer cache
|
||||||
@@ -2378,7 +2415,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
|||||||
* for the extended pages, so there's no harm in leaving behind obsolete
|
* for the extended pages, so there's no harm in leaving behind obsolete
|
||||||
* entries for the truncated chunks.
|
* entries for the truncated chunks.
|
||||||
*/
|
*/
|
||||||
SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forknum);
|
SetLastWrittenLSNForRelation(lsn, rnode, forknum);
|
||||||
|
|
||||||
#ifdef DEBUG_COMPARE_LOCAL
|
#ifdef DEBUG_COMPARE_LOCAL
|
||||||
if (IS_LOCAL_REL(reln))
|
if (IS_LOCAL_REL(reln))
|
||||||
@@ -2448,9 +2485,9 @@ neon_start_unlogged_build(SMgrRelation reln)
|
|||||||
|
|
||||||
ereport(SmgrTrace,
|
ereport(SmgrTrace,
|
||||||
(errmsg("starting unlogged build of relation %u/%u/%u",
|
(errmsg("starting unlogged build of relation %u/%u/%u",
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode)));
|
RelnGetRelNumber(reln))));
|
||||||
|
|
||||||
switch (reln->smgr_relpersistence)
|
switch (reln->smgr_relpersistence)
|
||||||
{
|
{
|
||||||
@@ -2500,9 +2537,9 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
|
|||||||
|
|
||||||
ereport(SmgrTrace,
|
ereport(SmgrTrace,
|
||||||
(errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u",
|
(errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u",
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode)));
|
RelnGetRelNumber(reln))));
|
||||||
|
|
||||||
if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
|
if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
|
||||||
return;
|
return;
|
||||||
@@ -2529,9 +2566,9 @@ neon_end_unlogged_build(SMgrRelation reln)
|
|||||||
|
|
||||||
ereport(SmgrTrace,
|
ereport(SmgrTrace,
|
||||||
(errmsg("ending unlogged build of relation %u/%u/%u",
|
(errmsg("ending unlogged build of relation %u/%u/%u",
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode)));
|
RelnGetRelNumber(reln))));
|
||||||
|
|
||||||
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
|
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
|
||||||
{
|
{
|
||||||
@@ -2544,16 +2581,24 @@ neon_end_unlogged_build(SMgrRelation reln)
|
|||||||
reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
|
reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
|
||||||
|
|
||||||
/* Remove local copy */
|
/* Remove local copy */
|
||||||
rnode = reln->smgr_rnode;
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
rnode.locator = RelnGetRnode(reln);
|
||||||
|
#else
|
||||||
|
rnode.node = RelnGetRnode(reln);
|
||||||
|
#endif
|
||||||
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
|
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
|
||||||
{
|
{
|
||||||
elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
|
elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
|
||||||
rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
rnode.node.relNode,
|
RelnGetRelNumber(reln),
|
||||||
forknum);
|
forknum);
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
forget_cached_relsize(rnode.locator, forknum);
|
||||||
|
#else
|
||||||
forget_cached_relsize(rnode.node, forknum);
|
forget_cached_relsize(rnode.node, forknum);
|
||||||
|
#endif
|
||||||
mdclose(reln, forknum);
|
mdclose(reln, forknum);
|
||||||
/* use isRedo == true, so that we drop it immediately */
|
/* use isRedo == true, so that we drop it immediately */
|
||||||
mdunlink(rnode, forknum, true);
|
mdunlink(rnode, forknum, true);
|
||||||
@@ -2706,10 +2751,16 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
|||||||
* regardless of whether the block is stored in shared buffers.
|
* regardless of whether the block is stored in shared buffers.
|
||||||
* See also this function's top comment.
|
* See also this function's top comment.
|
||||||
*/
|
*/
|
||||||
if (!OidIsValid(rnode.dbNode))
|
|
||||||
|
if (!OidIsValid(RnodeGetDbOid(rnode)))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
InitBufferTag(&tag, &rnode, forknum, blkno);
|
||||||
|
#else
|
||||||
INIT_BUFFERTAG(tag, rnode, forknum, blkno);
|
INIT_BUFFERTAG(tag, rnode, forknum, blkno);
|
||||||
|
#endif
|
||||||
|
|
||||||
hash = BufTableHashCode(&tag);
|
hash = BufTableHashCode(&tag);
|
||||||
partitionLock = BufMappingPartitionLock(hash);
|
partitionLock = BufMappingPartitionLock(hash);
|
||||||
|
|
||||||
|
|||||||
@@ -15,7 +15,11 @@
|
|||||||
#include "postgres.h"
|
#include "postgres.h"
|
||||||
|
|
||||||
#include "pagestore_client.h"
|
#include "pagestore_client.h"
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
#include "storage/relfilelocator.h"
|
||||||
|
#else
|
||||||
#include "storage/relfilenode.h"
|
#include "storage/relfilenode.h"
|
||||||
|
#endif
|
||||||
#include "storage/smgr.h"
|
#include "storage/smgr.h"
|
||||||
#include "storage/lwlock.h"
|
#include "storage/lwlock.h"
|
||||||
#include "storage/ipc.h"
|
#include "storage/ipc.h"
|
||||||
@@ -28,6 +32,7 @@
|
|||||||
#include "miscadmin.h"
|
#include "miscadmin.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
RelFileNode rnode;
|
RelFileNode rnode;
|
||||||
|
|||||||
@@ -1394,7 +1394,12 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec
|
|||||||
WalReceiverConn *wrconn;
|
WalReceiverConn *wrconn;
|
||||||
WalRcvStreamOptions options;
|
WalRcvStreamOptions options;
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
bool must_use_password = false;
|
||||||
|
wrconn = walrcv_connect(safekeeper[donor].conninfo, false, must_use_password, "wal_proposer_recovery", &err);
|
||||||
|
#else
|
||||||
wrconn = walrcv_connect(safekeeper[donor].conninfo, false, "wal_proposer_recovery", &err);
|
wrconn = walrcv_connect(safekeeper[donor].conninfo, false, "wal_proposer_recovery", &err);
|
||||||
|
#endif
|
||||||
if (!wrconn)
|
if (!wrconn)
|
||||||
{
|
{
|
||||||
ereport(WARNING,
|
ereport(WARNING,
|
||||||
@@ -2231,6 +2236,18 @@ HandleSafekeeperResponse(void)
|
|||||||
if (n_synced >= quorum)
|
if (n_synced >= quorum)
|
||||||
{
|
{
|
||||||
/* All safekeepers synced! */
|
/* All safekeepers synced! */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Send empty message to broadcast latest truncateLsn to all safekeepers.
|
||||||
|
* This helps to finish next sync-safekeepers eailier, by skipping recovery
|
||||||
|
* step.
|
||||||
|
*
|
||||||
|
* We don't need to wait for response because it doesn't affect correctness,
|
||||||
|
* and TCP should be able to deliver the message to safekeepers in case of
|
||||||
|
* network working properly.
|
||||||
|
*/
|
||||||
|
BroadcastAppendRequest();
|
||||||
|
|
||||||
fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn));
|
fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn));
|
||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,6 +26,10 @@
|
|||||||
#include "access/xlogrecovery.h"
|
#include "access/xlogrecovery.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
#include "utils/guc.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* These variables are used similarly to openLogFile/SegNo,
|
* These variables are used similarly to openLogFile/SegNo,
|
||||||
* but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID
|
* but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID
|
||||||
|
|||||||
@@ -128,7 +128,11 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
|
|||||||
else
|
else
|
||||||
isvalid = false;
|
isvalid = false;
|
||||||
bufferid = BufferDescriptorGetBuffer(bufHdr);
|
bufferid = BufferDescriptorGetBuffer(bufHdr);
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
rnode = BufTagGetRelFileLocator(&bufHdr->tag);
|
||||||
|
#else
|
||||||
rnode = bufHdr->tag.rnode;
|
rnode = bufHdr->tag.rnode;
|
||||||
|
#endif
|
||||||
forknum = bufHdr->tag.forkNum;
|
forknum = bufHdr->tag.forkNum;
|
||||||
blocknum = bufHdr->tag.blockNum;
|
blocknum = bufHdr->tag.blockNum;
|
||||||
|
|
||||||
@@ -238,7 +242,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
|
|||||||
SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
|
SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
|
||||||
raw_page_data = VARDATA(raw_page);
|
raw_page_data = VARDATA(raw_page);
|
||||||
|
|
||||||
neon_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data);
|
neon_read_at_lsn(RelnGetRnode(RelationGetSmgr(rel)), forknum, blkno, read_lsn, request_latest, raw_page_data);
|
||||||
|
|
||||||
relation_close(rel, AccessShareLock);
|
relation_close(rel, AccessShareLock);
|
||||||
|
|
||||||
@@ -267,11 +271,17 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
|
|||||||
PG_RETURN_NULL();
|
PG_RETURN_NULL();
|
||||||
|
|
||||||
{
|
{
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
RelFileLocator rnode = {
|
||||||
|
.spcOid = PG_GETARG_OID(0),
|
||||||
|
.dbOid = PG_GETARG_OID(1),
|
||||||
|
.relNumber = PG_GETARG_OID(2)};
|
||||||
|
#else
|
||||||
RelFileNode rnode = {
|
RelFileNode rnode = {
|
||||||
.spcNode = PG_GETARG_OID(0),
|
.spcNode = PG_GETARG_OID(0),
|
||||||
.dbNode = PG_GETARG_OID(1),
|
.dbNode = PG_GETARG_OID(1),
|
||||||
.relNode = PG_GETARG_OID(2)};
|
.relNode = PG_GETARG_OID(2)};
|
||||||
|
#endif
|
||||||
ForkNumber forknum = PG_GETARG_UINT32(3);
|
ForkNumber forknum = PG_GETARG_UINT32(3);
|
||||||
|
|
||||||
uint32 blkno = PG_GETARG_UINT32(4);
|
uint32 blkno = PG_GETARG_UINT32(4);
|
||||||
|
|||||||
@@ -21,7 +21,6 @@
|
|||||||
#include "access/xlog.h"
|
#include "access/xlog.h"
|
||||||
#include "storage/block.h"
|
#include "storage/block.h"
|
||||||
#include "storage/buf_internals.h"
|
#include "storage/buf_internals.h"
|
||||||
#include "storage/relfilenode.h"
|
|
||||||
#include "storage/smgr.h"
|
#include "storage/smgr.h"
|
||||||
|
|
||||||
#if PG_VERSION_NUM >= 150000
|
#if PG_VERSION_NUM >= 150000
|
||||||
@@ -30,6 +29,7 @@
|
|||||||
|
|
||||||
#include "inmem_smgr.h"
|
#include "inmem_smgr.h"
|
||||||
|
|
||||||
|
|
||||||
/* Size of the in-memory smgr */
|
/* Size of the in-memory smgr */
|
||||||
#define MAX_PAGES 64
|
#define MAX_PAGES 64
|
||||||
|
|
||||||
@@ -46,12 +46,22 @@ locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
|
|||||||
/* We only hold a small number of pages, so linear search */
|
/* We only hold a small number of pages, so linear search */
|
||||||
for (int i = 0; i < used_pages; i++)
|
for (int i = 0; i < used_pages; i++)
|
||||||
{
|
{
|
||||||
if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode)
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
if (BufTagMatchesRelFileLocator(&page_tag[i], &reln->smgr_rlocator.locator)
|
||||||
&& forknum == page_tag[i].forkNum
|
&& forknum == page_tag[i].forkNum
|
||||||
&& blkno == page_tag[i].blockNum)
|
&& blkno == page_tag[i].blockNum)
|
||||||
{
|
{
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
if (RelFileNodeEquals(RelnGetRnode(reln), page_tag[i].rnode)
|
||||||
|
&& forknum == page_tag[i].forkNum
|
||||||
|
&& blkno == page_tag[i].blockNum)
|
||||||
|
{
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
@@ -97,8 +107,12 @@ inmem_exists(SMgrRelation reln, ForkNumber forknum)
|
|||||||
{
|
{
|
||||||
for (int i = 0; i < used_pages; i++)
|
for (int i = 0; i < used_pages; i++)
|
||||||
{
|
{
|
||||||
if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode)
|
#if PG_VERSION_NUM >= 160000
|
||||||
&& forknum == page_tag[i].forkNum)
|
if (BufTagMatchesRelFileLocator(&page_tag[i], &reln->smgr_rlocator.locator)
|
||||||
|
#else
|
||||||
|
if (RelFileNodeEquals(RelnGetRnode(reln), page_tag[i].rnode)
|
||||||
|
#endif
|
||||||
|
&& forknum == page_tag[i].forkNum)
|
||||||
{
|
{
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -216,9 +230,9 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
|||||||
*/
|
*/
|
||||||
elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1,
|
elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1,
|
||||||
"inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u",
|
"inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u",
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
RelnGetRelNumber(reln),
|
||||||
forknum,
|
forknum,
|
||||||
blocknum,
|
blocknum,
|
||||||
used_pages);
|
used_pages);
|
||||||
@@ -227,14 +241,19 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
|||||||
|
|
||||||
pg = used_pages;
|
pg = used_pages;
|
||||||
used_pages++;
|
used_pages++;
|
||||||
INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum);
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
InitBufferTag(&page_tag[pg], &RelnGetRnode(reln), forknum, blocknum);
|
||||||
|
#else
|
||||||
|
INIT_BUFFERTAG(page_tag[pg], RelnGetRnode(reln), forknum, blocknum);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u",
|
elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u",
|
||||||
reln->smgr_rnode.node.spcNode,
|
RelnGetSpcOid(reln),
|
||||||
reln->smgr_rnode.node.dbNode,
|
RelnGetDbOid(reln),
|
||||||
reln->smgr_rnode.node.relNode,
|
RelnGetRelNumber(reln),
|
||||||
forknum,
|
forknum,
|
||||||
blocknum,
|
blocknum,
|
||||||
used_pages);
|
used_pages);
|
||||||
|
|||||||
@@ -11,6 +11,40 @@
|
|||||||
#ifndef INMEM_SMGR_H
|
#ifndef INMEM_SMGR_H
|
||||||
#define INMEM_SMGR_H
|
#define INMEM_SMGR_H
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
#include "storage/relfilelocator.h"
|
||||||
|
#else
|
||||||
|
#include "storage/relfilenode.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// This is a hack to avoid too many ifdefs in the function definitions.
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
typedef RelFileLocator RelFileNode;
|
||||||
|
typedef RelFileLocatorBackend RelFileNodeBackend;
|
||||||
|
#define RelFileNodeBackendIsTemp RelFileLocatorBackendIsTemp
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
#define RelnGetRnode(reln) (reln->smgr_rlocator.locator)
|
||||||
|
#define RnodeGetSpcOid(rnode) (rnode.spcOid)
|
||||||
|
#define RnodeGetDbOid(rnode) (rnode.dbOid)
|
||||||
|
#define RnodeGetRelNumber(rnode) (rnode.relNumber)
|
||||||
|
|
||||||
|
#define BufTagGetRnode(tag) (BufTagGetRelFileLocator(&tag))
|
||||||
|
#else
|
||||||
|
#define RelnGetRnode(reln) (reln->smgr_rnode.node)
|
||||||
|
#define RnodeGetSpcOid(rnode) (rnode.spcNode)
|
||||||
|
#define RnodeGetDbOid(rnode) (rnode.dbNode)
|
||||||
|
#define RnodeGetRelNumber(rnode) (rnode.relNode)
|
||||||
|
|
||||||
|
#define BufTagGetRnode(tag) (tag.rnode)
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Tablespace OID of the relation behind an SMgrRelation (was wrongly returning the relation number). */
#define RelnGetSpcOid(reln) (RnodeGetSpcOid(RelnGetRnode(reln)))
|
||||||
|
#define RelnGetDbOid(reln) (RnodeGetDbOid(RelnGetRnode(reln)))
|
||||||
|
#define RelnGetRelNumber(reln) (RnodeGetRelNumber(RelnGetRnode(reln)))
|
||||||
|
|
||||||
extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
|
extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
|
||||||
extern void smgr_init_inmem(void);
|
extern void smgr_init_inmem(void);
|
||||||
|
|
||||||
|
|||||||
@@ -62,8 +62,10 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_GETRUSAGE
|
#ifndef HAVE_GETRUSAGE
|
||||||
|
#if PG_VERSION_NUM < 160000
|
||||||
#include "rusagestub.h"
|
#include "rusagestub.h"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "access/clog.h"
|
#include "access/clog.h"
|
||||||
#include "access/commit_ts.h"
|
#include "access/commit_ts.h"
|
||||||
@@ -117,6 +119,7 @@
|
|||||||
#include "neon_seccomp.h"
|
#include "neon_seccomp.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
PG_MODULE_MAGIC;
|
PG_MODULE_MAGIC;
|
||||||
|
|
||||||
static int ReadRedoCommand(StringInfo inBuf);
|
static int ReadRedoCommand(StringInfo inBuf);
|
||||||
@@ -662,18 +665,31 @@ BeginRedoForBlock(StringInfo input_message)
|
|||||||
* BlockNumber
|
* BlockNumber
|
||||||
*/
|
*/
|
||||||
forknum = pq_getmsgbyte(input_message);
|
forknum = pq_getmsgbyte(input_message);
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
rnode.spcOid = pq_getmsgint(input_message, 4);
|
||||||
|
rnode.dbOid = pq_getmsgint(input_message, 4);
|
||||||
|
rnode.relNumber = pq_getmsgint(input_message, 4);
|
||||||
|
#else
|
||||||
rnode.spcNode = pq_getmsgint(input_message, 4);
|
rnode.spcNode = pq_getmsgint(input_message, 4);
|
||||||
rnode.dbNode = pq_getmsgint(input_message, 4);
|
rnode.dbNode = pq_getmsgint(input_message, 4);
|
||||||
rnode.relNode = pq_getmsgint(input_message, 4);
|
rnode.relNode = pq_getmsgint(input_message, 4);
|
||||||
|
#endif
|
||||||
blknum = pq_getmsgint(input_message, 4);
|
blknum = pq_getmsgint(input_message, 4);
|
||||||
wal_redo_buffer = InvalidBuffer;
|
wal_redo_buffer = InvalidBuffer;
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
InitBufferTag(&target_redo_tag, &rnode, forknum, blknum);
|
||||||
|
#else
|
||||||
INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum);
|
INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u",
|
elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u",
|
||||||
target_redo_tag.rnode.spcNode,
|
#if PG_VERSION_NUM >= 160000
|
||||||
target_redo_tag.rnode.dbNode,
|
target_redo_tag.spcOid, target_redo_tag.dbOid, target_redo_tag.relNumber,
|
||||||
target_redo_tag.rnode.relNode,
|
#else
|
||||||
|
target_redo_tag.rnode.spcNode, target_redo_tag.rnode.dbNode, target_redo_tag.rnode.relNode,
|
||||||
|
#endif
|
||||||
target_redo_tag.forkNum,
|
target_redo_tag.forkNum,
|
||||||
target_redo_tag.blockNum);
|
target_redo_tag.blockNum);
|
||||||
|
|
||||||
@@ -709,9 +725,15 @@ PushPage(StringInfo input_message)
|
|||||||
* 8k page content
|
* 8k page content
|
||||||
*/
|
*/
|
||||||
forknum = pq_getmsgbyte(input_message);
|
forknum = pq_getmsgbyte(input_message);
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
rnode.spcOid = pq_getmsgint(input_message, 4);
|
||||||
|
rnode.dbOid = pq_getmsgint(input_message, 4);
|
||||||
|
rnode.relNumber = pq_getmsgint(input_message, 4);
|
||||||
|
#else
|
||||||
rnode.spcNode = pq_getmsgint(input_message, 4);
|
rnode.spcNode = pq_getmsgint(input_message, 4);
|
||||||
rnode.dbNode = pq_getmsgint(input_message, 4);
|
rnode.dbNode = pq_getmsgint(input_message, 4);
|
||||||
rnode.relNode = pq_getmsgint(input_message, 4);
|
rnode.relNode = pq_getmsgint(input_message, 4);
|
||||||
|
#endif
|
||||||
blknum = pq_getmsgint(input_message, 4);
|
blknum = pq_getmsgint(input_message, 4);
|
||||||
content = pq_getmsgbytes(input_message, BLCKSZ);
|
content = pq_getmsgbytes(input_message, BLCKSZ);
|
||||||
|
|
||||||
@@ -831,7 +853,12 @@ ApplyRecord(StringInfo input_message)
|
|||||||
*/
|
*/
|
||||||
if (BufferIsInvalid(wal_redo_buffer))
|
if (BufferIsInvalid(wal_redo_buffer))
|
||||||
{
|
{
|
||||||
wal_redo_buffer = NeonRedoReadBuffer(target_redo_tag.rnode,
|
wal_redo_buffer = NeonRedoReadBuffer(
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
BufTagGetRelFileLocator(&target_redo_tag),
|
||||||
|
#else
|
||||||
|
target_redo_tag.rnode,
|
||||||
|
#endif
|
||||||
target_redo_tag.forkNum,
|
target_redo_tag.forkNum,
|
||||||
target_redo_tag.blockNum,
|
target_redo_tag.blockNum,
|
||||||
RBM_NORMAL);
|
RBM_NORMAL);
|
||||||
@@ -873,12 +900,43 @@ apply_error_callback(void *arg)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
redo_block_filter(XLogReaderState *record, uint8 block_id)
|
redo_block_filter(XLogReaderState *record, uint8 block_id)
|
||||||
{
|
{
|
||||||
BufferTag target_tag;
|
BufferTag target_tag;
|
||||||
|
|
||||||
|
RelFileLocator rlocator;
|
||||||
|
XLogRecGetBlockTag(record, block_id,
|
||||||
|
&rlocator, &target_tag.forkNum, &target_tag.blockNum);
|
||||||
|
|
||||||
|
target_tag.spcOid = rlocator.spcOid;
|
||||||
|
target_tag.dbOid = rlocator.dbOid;
|
||||||
|
target_tag.relNumber = rlocator.relNumber;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Can a WAL redo function ever access a relation other than the one that
|
||||||
|
* it modifies? I don't see why it would.
|
||||||
|
*/
|
||||||
|
/* Warn only when the block's relation differs from the one being redone (mirrors the pre-16 !RelFileNodeEquals check). */
if (!RelFileLocatorEquals(BufTagGetRelFileLocator(&target_tag), BufTagGetRelFileLocator(&target_redo_tag)))
|
||||||
|
elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u",
|
||||||
|
target_tag.spcOid, target_tag.dbOid, target_tag.relNumber,
|
||||||
|
target_tag.forkNum, target_tag.blockNum);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If this block isn't one we are currently restoring, then return 'true'
|
||||||
|
* so that this gets ignored
|
||||||
|
*/
|
||||||
|
return !BufferTagsEqual(&target_tag, &target_redo_tag);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
static bool
|
||||||
|
redo_block_filter(XLogReaderState *record, uint8 block_id)
|
||||||
|
{
|
||||||
|
BufferTag target_tag;
|
||||||
|
|
||||||
|
|
||||||
#if PG_VERSION_NUM >= 150000
|
#if PG_VERSION_NUM >= 150000
|
||||||
XLogRecGetBlockTag(record, block_id,
|
XLogRecGetBlockTag(record, block_id,
|
||||||
&target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum);
|
&target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum);
|
||||||
@@ -897,14 +955,18 @@ redo_block_filter(XLogReaderState *record, uint8 block_id)
|
|||||||
*/
|
*/
|
||||||
if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode))
|
if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode))
|
||||||
elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u",
|
elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u",
|
||||||
target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode, target_tag.forkNum, target_tag.blockNum);
|
target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode,
|
||||||
|
target_tag.forkNum, target_tag.blockNum);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If this block isn't one we are currently restoring, then return 'true'
|
* If this block isn't one we are currently restoring, then return 'true'
|
||||||
* so that this gets ignored
|
* so that this gets ignored
|
||||||
*/
|
*/
|
||||||
|
|
||||||
return !BUFFERTAGS_EQUAL(target_tag, target_redo_tag);
|
return !BUFFERTAGS_EQUAL(target_tag, target_redo_tag);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Get a page image back from buffer cache.
|
* Get a page image back from buffer cache.
|
||||||
@@ -931,9 +993,15 @@ GetPage(StringInfo input_message)
|
|||||||
* BlockNumber
|
* BlockNumber
|
||||||
*/
|
*/
|
||||||
forknum = pq_getmsgbyte(input_message);
|
forknum = pq_getmsgbyte(input_message);
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
rnode.spcOid = pq_getmsgint(input_message, 4);
|
||||||
|
rnode.dbOid = pq_getmsgint(input_message, 4);
|
||||||
|
rnode.relNumber = pq_getmsgint(input_message, 4);
|
||||||
|
#else
|
||||||
rnode.spcNode = pq_getmsgint(input_message, 4);
|
rnode.spcNode = pq_getmsgint(input_message, 4);
|
||||||
rnode.dbNode = pq_getmsgint(input_message, 4);
|
rnode.dbNode = pq_getmsgint(input_message, 4);
|
||||||
rnode.relNode = pq_getmsgint(input_message, 4);
|
rnode.relNode = pq_getmsgint(input_message, 4);
|
||||||
|
#endif
|
||||||
blknum = pq_getmsgint(input_message, 4);
|
blknum = pq_getmsgint(input_message, 4);
|
||||||
|
|
||||||
/* FIXME: check that we got a BeginRedoForBlock message or this earlier */
|
/* FIXME: check that we got a BeginRedoForBlock message or this earlier */
|
||||||
@@ -961,7 +1029,11 @@ GetPage(StringInfo input_message)
|
|||||||
} while (tot_written < BLCKSZ);
|
} while (tot_written < BLCKSZ);
|
||||||
|
|
||||||
ReleaseBuffer(buf);
|
ReleaseBuffer(buf);
|
||||||
|
#if PG_VERSION_NUM >= 160000
|
||||||
|
DropRelationAllLocalBuffers(rnode);
|
||||||
|
#else
|
||||||
DropRelFileNodeAllLocalBuffers(rnode);
|
DropRelFileNodeAllLocalBuffers(rnode);
|
||||||
|
#endif
|
||||||
wal_redo_buffer = InvalidBuffer;
|
wal_redo_buffer = InvalidBuffer;
|
||||||
|
|
||||||
elog(TRACE, "Page sent back for block %u", blknum);
|
elog(TRACE, "Page sent back for block %u", blknum);
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user